holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +17 -4
- holmes/common/env_vars.py +40 -1
- holmes/config.py +114 -144
- holmes/core/conversations.py +53 -14
- holmes/core/feedback.py +191 -0
- holmes/core/investigation.py +18 -22
- holmes/core/llm.py +489 -88
- holmes/core/models.py +103 -1
- holmes/core/openai_formatting.py +13 -0
- holmes/core/prompt.py +1 -1
- holmes/core/safeguards.py +4 -4
- holmes/core/supabase_dal.py +293 -100
- holmes/core/tool_calling_llm.py +423 -323
- holmes/core/tools.py +311 -33
- holmes/core/tools_utils/token_counting.py +14 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
- holmes/core/tools_utils/tool_executor.py +13 -8
- holmes/core/toolset_manager.py +155 -4
- holmes/core/tracing.py +6 -1
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +62 -0
- holmes/core/transformers/llm_summarize.py +174 -0
- holmes/core/transformers/registry.py +122 -0
- holmes/core/transformers/transformer.py +31 -0
- holmes/core/truncation/compaction.py +59 -0
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/core/truncation/input_context_window_limiter.py +218 -0
- holmes/interactive.py +177 -24
- holmes/main.py +7 -4
- holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
- holmes/plugins/prompts/generic_ask.jinja2 +2 -4
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
- holmes/plugins/runbooks/__init__.py +117 -18
- holmes/plugins/runbooks/catalog.json +2 -0
- holmes/plugins/toolsets/__init__.py +21 -8
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
- holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
- holmes/plugins/toolsets/bash/common/bash.py +7 -7
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
- holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
- holmes/plugins/toolsets/git.py +51 -46
- holmes/plugins/toolsets/grafana/common.py +15 -3
- holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
- holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
- holmes/plugins/toolsets/internet/internet.py +6 -7
- holmes/plugins/toolsets/internet/notion.py +5 -6
- holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
- holmes/plugins/toolsets/kafka.py +25 -36
- holmes/plugins/toolsets/kubernetes.yaml +58 -84
- holmes/plugins/toolsets/kubernetes_logs.py +6 -6
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
- holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
- holmes/plugins/toolsets/newrelic/__init__.py +0 -0
- holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
- holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
- holmes/plugins/toolsets/openshift.yaml +283 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
- holmes/plugins/toolsets/robusta/robusta.py +236 -65
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
- holmes/utils/env.py +7 -0
- holmes/utils/global_instructions.py +75 -10
- holmes/utils/holmes_status.py +2 -1
- holmes/utils/holmes_sync_toolsets.py +0 -2
- holmes/utils/krr_utils.py +188 -0
- holmes/utils/sentry_helper.py +41 -0
- holmes/utils/stream.py +61 -7
- holmes/version.py +34 -14
- holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
- holmes/core/performance_timing.py +0 -72
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
- holmes/plugins/toolsets/newrelic.py +0 -231
- holmes/plugins/toolsets/servicenow/install.md +0 -37
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
- holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
|
@@ -2,7 +2,11 @@ import logging
|
|
|
2
2
|
from typing import Dict
|
|
3
3
|
from datetime import datetime, timezone
|
|
4
4
|
|
|
5
|
-
from holmes.core.tools import
|
|
5
|
+
from holmes.core.tools import (
|
|
6
|
+
StructuredToolResult,
|
|
7
|
+
StructuredToolResultStatus,
|
|
8
|
+
ToolInvokeContext,
|
|
9
|
+
)
|
|
6
10
|
from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
|
|
7
11
|
BaseAzureSQLTool,
|
|
8
12
|
BaseAzureSQLToolset,
|
|
@@ -147,9 +151,7 @@ class GetActiveAlerts(BaseAzureSQLTool):
|
|
|
147
151
|
|
|
148
152
|
return "\n".join(report_sections)
|
|
149
153
|
|
|
150
|
-
def _invoke(
|
|
151
|
-
self, params: dict, user_approved: bool = False
|
|
152
|
-
) -> StructuredToolResult:
|
|
154
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
153
155
|
try:
|
|
154
156
|
db_config = self.toolset.database_config()
|
|
155
157
|
api_client = self.toolset.api_client()
|
|
@@ -170,7 +172,7 @@ class GetActiveAlerts(BaseAzureSQLTool):
|
|
|
170
172
|
# Check for errors
|
|
171
173
|
if "error" in alerts_data:
|
|
172
174
|
return StructuredToolResult(
|
|
173
|
-
status=
|
|
175
|
+
status=StructuredToolResultStatus.ERROR,
|
|
174
176
|
error=alerts_data["error"],
|
|
175
177
|
params=params,
|
|
176
178
|
)
|
|
@@ -179,7 +181,7 @@ class GetActiveAlerts(BaseAzureSQLTool):
|
|
|
179
181
|
report_text = self._build_alerts_report(db_config, alerts_data, "active")
|
|
180
182
|
|
|
181
183
|
return StructuredToolResult(
|
|
182
|
-
status=
|
|
184
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
183
185
|
data=report_text,
|
|
184
186
|
params=params,
|
|
185
187
|
)
|
|
@@ -187,7 +189,7 @@ class GetActiveAlerts(BaseAzureSQLTool):
|
|
|
187
189
|
error_msg = f"Failed to retrieve active alerts: {str(e)}"
|
|
188
190
|
logging.error(error_msg)
|
|
189
191
|
return StructuredToolResult(
|
|
190
|
-
status=
|
|
192
|
+
status=StructuredToolResultStatus.ERROR,
|
|
191
193
|
error=error_msg,
|
|
192
194
|
params=params,
|
|
193
195
|
)
|
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Dict, List, Tuple
|
|
3
3
|
|
|
4
|
-
from holmes.core.tools import
|
|
4
|
+
from holmes.core.tools import (
|
|
5
|
+
StructuredToolResult,
|
|
6
|
+
ToolInvokeContext,
|
|
7
|
+
ToolParameter,
|
|
8
|
+
StructuredToolResultStatus,
|
|
9
|
+
)
|
|
5
10
|
from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
|
|
6
11
|
BaseAzureSQLTool,
|
|
7
12
|
BaseAzureSQLToolset,
|
|
@@ -99,9 +104,7 @@ class GetSlowQueries(BaseAzureSQLTool):
|
|
|
99
104
|
|
|
100
105
|
return "\n".join(report_sections)
|
|
101
106
|
|
|
102
|
-
def _invoke(
|
|
103
|
-
self, params: dict, user_approved: bool = False
|
|
104
|
-
) -> StructuredToolResult:
|
|
107
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
105
108
|
try:
|
|
106
109
|
top_count = params.get("top_count", 15)
|
|
107
110
|
hours_back = params.get("hours_back", 2)
|
|
@@ -125,7 +128,7 @@ class GetSlowQueries(BaseAzureSQLTool):
|
|
|
125
128
|
)
|
|
126
129
|
|
|
127
130
|
return StructuredToolResult(
|
|
128
|
-
status=
|
|
131
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
129
132
|
data=report_text,
|
|
130
133
|
params=params,
|
|
131
134
|
)
|
|
@@ -133,7 +136,7 @@ class GetSlowQueries(BaseAzureSQLTool):
|
|
|
133
136
|
error_msg = f"Failed to get slow queries: {str(e)}"
|
|
134
137
|
logging.error(error_msg)
|
|
135
138
|
return StructuredToolResult(
|
|
136
|
-
status=
|
|
139
|
+
status=StructuredToolResultStatus.ERROR,
|
|
137
140
|
error=error_msg,
|
|
138
141
|
params=params,
|
|
139
142
|
)
|
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Dict, List, Tuple
|
|
3
3
|
|
|
4
|
-
from holmes.core.tools import
|
|
4
|
+
from holmes.core.tools import (
|
|
5
|
+
StructuredToolResult,
|
|
6
|
+
ToolInvokeContext,
|
|
7
|
+
ToolParameter,
|
|
8
|
+
StructuredToolResultStatus,
|
|
9
|
+
)
|
|
5
10
|
from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
|
|
6
11
|
BaseAzureSQLTool,
|
|
7
12
|
BaseAzureSQLToolset,
|
|
@@ -97,9 +102,7 @@ class GetTopCPUQueries(BaseAzureSQLTool):
|
|
|
97
102
|
|
|
98
103
|
return "\n".join(report_sections)
|
|
99
104
|
|
|
100
|
-
def _invoke(
|
|
101
|
-
self, params: dict, user_approved: bool = False
|
|
102
|
-
) -> StructuredToolResult:
|
|
105
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
103
106
|
try:
|
|
104
107
|
top_count = params.get("top_count", 15)
|
|
105
108
|
hours_back = params.get("hours_back", 2)
|
|
@@ -123,7 +126,7 @@ class GetTopCPUQueries(BaseAzureSQLTool):
|
|
|
123
126
|
)
|
|
124
127
|
|
|
125
128
|
return StructuredToolResult(
|
|
126
|
-
status=
|
|
129
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
127
130
|
data=report_text,
|
|
128
131
|
params=params,
|
|
129
132
|
)
|
|
@@ -131,7 +134,7 @@ class GetTopCPUQueries(BaseAzureSQLTool):
|
|
|
131
134
|
error_msg = f"Failed to get top CPU queries: {str(e)}"
|
|
132
135
|
logging.error(error_msg)
|
|
133
136
|
return StructuredToolResult(
|
|
134
|
-
status=
|
|
137
|
+
status=StructuredToolResultStatus.ERROR,
|
|
135
138
|
error=error_msg,
|
|
136
139
|
params=params,
|
|
137
140
|
)
|
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Dict, List, Tuple
|
|
3
3
|
|
|
4
|
-
from holmes.core.tools import
|
|
4
|
+
from holmes.core.tools import (
|
|
5
|
+
StructuredToolResult,
|
|
6
|
+
ToolInvokeContext,
|
|
7
|
+
ToolParameter,
|
|
8
|
+
StructuredToolResultStatus,
|
|
9
|
+
)
|
|
5
10
|
from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
|
|
6
11
|
BaseAzureSQLTool,
|
|
7
12
|
BaseAzureSQLToolset,
|
|
@@ -115,9 +120,7 @@ class GetTopDataIOQueries(BaseAzureSQLTool):
|
|
|
115
120
|
|
|
116
121
|
return "\n".join(report_sections)
|
|
117
122
|
|
|
118
|
-
def _invoke(
|
|
119
|
-
self, params: dict, user_approved: bool = False
|
|
120
|
-
) -> StructuredToolResult:
|
|
123
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
121
124
|
try:
|
|
122
125
|
top_count = params.get("top_count", 15)
|
|
123
126
|
hours_back = params.get("hours_back", 2)
|
|
@@ -141,7 +144,7 @@ class GetTopDataIOQueries(BaseAzureSQLTool):
|
|
|
141
144
|
)
|
|
142
145
|
|
|
143
146
|
return StructuredToolResult(
|
|
144
|
-
status=
|
|
147
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
145
148
|
data=report_text,
|
|
146
149
|
params=params,
|
|
147
150
|
)
|
|
@@ -149,7 +152,7 @@ class GetTopDataIOQueries(BaseAzureSQLTool):
|
|
|
149
152
|
error_msg = f"Failed to get top data I/O queries: {str(e)}"
|
|
150
153
|
logging.error(error_msg)
|
|
151
154
|
return StructuredToolResult(
|
|
152
|
-
status=
|
|
155
|
+
status=StructuredToolResultStatus.ERROR,
|
|
153
156
|
error=error_msg,
|
|
154
157
|
params=params,
|
|
155
158
|
)
|
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Dict, List, Tuple
|
|
3
3
|
|
|
4
|
-
from holmes.core.tools import
|
|
4
|
+
from holmes.core.tools import (
|
|
5
|
+
StructuredToolResult,
|
|
6
|
+
ToolInvokeContext,
|
|
7
|
+
ToolParameter,
|
|
8
|
+
StructuredToolResultStatus,
|
|
9
|
+
)
|
|
5
10
|
from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
|
|
6
11
|
BaseAzureSQLTool,
|
|
7
12
|
BaseAzureSQLToolset,
|
|
@@ -107,9 +112,7 @@ class GetTopLogIOQueries(BaseAzureSQLTool):
|
|
|
107
112
|
|
|
108
113
|
return "\n".join(report_sections)
|
|
109
114
|
|
|
110
|
-
def _invoke(
|
|
111
|
-
self, params: dict, user_approved: bool = False
|
|
112
|
-
) -> StructuredToolResult:
|
|
115
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
113
116
|
try:
|
|
114
117
|
top_count = params.get("top_count", 15)
|
|
115
118
|
hours_back = params.get("hours_back", 2)
|
|
@@ -133,7 +136,7 @@ class GetTopLogIOQueries(BaseAzureSQLTool):
|
|
|
133
136
|
)
|
|
134
137
|
|
|
135
138
|
return StructuredToolResult(
|
|
136
|
-
status=
|
|
139
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
137
140
|
data=report_text,
|
|
138
141
|
params=params,
|
|
139
142
|
)
|
|
@@ -141,7 +144,7 @@ class GetTopLogIOQueries(BaseAzureSQLTool):
|
|
|
141
144
|
error_msg = f"Failed to get top log I/O queries: {str(e)}"
|
|
142
145
|
logging.error(error_msg)
|
|
143
146
|
return StructuredToolResult(
|
|
144
|
-
status=
|
|
147
|
+
status=StructuredToolResultStatus.ERROR,
|
|
145
148
|
error=error_msg,
|
|
146
149
|
params=params,
|
|
147
150
|
)
|
|
@@ -16,8 +16,9 @@ from holmes.core.tools import (
|
|
|
16
16
|
CallablePrerequisite,
|
|
17
17
|
StructuredToolResult,
|
|
18
18
|
Tool,
|
|
19
|
+
ToolInvokeContext,
|
|
19
20
|
ToolParameter,
|
|
20
|
-
|
|
21
|
+
StructuredToolResultStatus,
|
|
21
22
|
Toolset,
|
|
22
23
|
ToolsetTag,
|
|
23
24
|
)
|
|
@@ -82,9 +83,7 @@ class KubectlRunImageCommand(BaseBashTool):
|
|
|
82
83
|
command_str = get_param_or_raise(params, "command")
|
|
83
84
|
return f"kubectl run {pod_name} --image={image} --namespace={namespace} --rm --attach --restart=Never -i -- {command_str}"
|
|
84
85
|
|
|
85
|
-
def _invoke(
|
|
86
|
-
self, params: dict, user_approved: bool = False
|
|
87
|
-
) -> StructuredToolResult:
|
|
86
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
88
87
|
timeout = params.get("timeout", 60)
|
|
89
88
|
|
|
90
89
|
image = get_param_or_raise(params, "image")
|
|
@@ -94,7 +93,7 @@ class KubectlRunImageCommand(BaseBashTool):
|
|
|
94
93
|
|
|
95
94
|
if namespace and not re.match(SAFE_NAMESPACE_PATTERN, namespace):
|
|
96
95
|
return StructuredToolResult(
|
|
97
|
-
status=
|
|
96
|
+
status=StructuredToolResultStatus.ERROR,
|
|
98
97
|
error=f"Error: The namespace is invalid. Valid namespaces must match the following regexp: {SAFE_NAMESPACE_PATTERN}",
|
|
99
98
|
params=params,
|
|
100
99
|
)
|
|
@@ -118,7 +117,7 @@ class KubectlRunImageCommand(BaseBashTool):
|
|
|
118
117
|
}
|
|
119
118
|
)
|
|
120
119
|
return StructuredToolResult(
|
|
121
|
-
status=
|
|
120
|
+
status=StructuredToolResultStatus.ERROR,
|
|
122
121
|
error=str(e),
|
|
123
122
|
params=params,
|
|
124
123
|
)
|
|
@@ -164,22 +163,20 @@ class RunBashCommand(BaseBashTool):
|
|
|
164
163
|
toolset=toolset,
|
|
165
164
|
)
|
|
166
165
|
|
|
167
|
-
def _invoke(
|
|
168
|
-
self, params: dict, user_approved: bool = False
|
|
169
|
-
) -> StructuredToolResult:
|
|
166
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
170
167
|
command_str = params.get("command")
|
|
171
168
|
timeout = params.get("timeout", 60)
|
|
172
169
|
|
|
173
170
|
if not command_str:
|
|
174
171
|
return StructuredToolResult(
|
|
175
|
-
status=
|
|
172
|
+
status=StructuredToolResultStatus.ERROR,
|
|
176
173
|
error="The 'command' parameter is required and was not provided.",
|
|
177
174
|
params=params,
|
|
178
175
|
)
|
|
179
176
|
|
|
180
177
|
if not isinstance(command_str, str):
|
|
181
178
|
return StructuredToolResult(
|
|
182
|
-
status=
|
|
179
|
+
status=StructuredToolResultStatus.ERROR,
|
|
183
180
|
error=f"The 'command' parameter must be a string, got {type(command_str).__name__}.",
|
|
184
181
|
params=params,
|
|
185
182
|
)
|
|
@@ -187,7 +184,7 @@ class RunBashCommand(BaseBashTool):
|
|
|
187
184
|
command_to_execute = command_str
|
|
188
185
|
|
|
189
186
|
# Only run the safety check if user has NOT approved the command
|
|
190
|
-
if not user_approved:
|
|
187
|
+
if not context.user_approved:
|
|
191
188
|
try:
|
|
192
189
|
command_to_execute = make_command_safe(command_str, self.toolset.config)
|
|
193
190
|
|
|
@@ -202,7 +199,7 @@ class RunBashCommand(BaseBashTool):
|
|
|
202
199
|
logging.info(f"Refusing LLM tool call {command_str}")
|
|
203
200
|
|
|
204
201
|
return StructuredToolResult(
|
|
205
|
-
status=
|
|
202
|
+
status=StructuredToolResultStatus.APPROVAL_REQUIRED,
|
|
206
203
|
error=f"Refusing to execute bash command. {str(e)}",
|
|
207
204
|
params=params,
|
|
208
205
|
invocation=command_str,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import subprocess
|
|
2
|
-
from holmes.core.tools import StructuredToolResult,
|
|
2
|
+
from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
def execute_bash_command(cmd: str, timeout: int, params: dict) -> StructuredToolResult:
|
|
@@ -18,11 +18,11 @@ def execute_bash_command(cmd: str, timeout: int, params: dict) -> StructuredTool
|
|
|
18
18
|
stdout = process.stdout.strip() if process.stdout else ""
|
|
19
19
|
result_data = f"{cmd}\n" f"{stdout}"
|
|
20
20
|
|
|
21
|
-
status =
|
|
21
|
+
status = StructuredToolResultStatus.ERROR
|
|
22
22
|
if process.returncode == 0 and stdout:
|
|
23
|
-
status =
|
|
23
|
+
status = StructuredToolResultStatus.SUCCESS
|
|
24
24
|
elif not stdout:
|
|
25
|
-
status =
|
|
25
|
+
status = StructuredToolResultStatus.NO_DATA
|
|
26
26
|
|
|
27
27
|
return StructuredToolResult(
|
|
28
28
|
status=status,
|
|
@@ -33,20 +33,20 @@ def execute_bash_command(cmd: str, timeout: int, params: dict) -> StructuredTool
|
|
|
33
33
|
)
|
|
34
34
|
except subprocess.TimeoutExpired:
|
|
35
35
|
return StructuredToolResult(
|
|
36
|
-
status=
|
|
36
|
+
status=StructuredToolResultStatus.ERROR,
|
|
37
37
|
error=f"Error: Command '{cmd}' timed out after {timeout} seconds.",
|
|
38
38
|
params=params,
|
|
39
39
|
)
|
|
40
40
|
except FileNotFoundError:
|
|
41
41
|
# This might occur if /bin/bash is not found, or if shell=False and command is not found
|
|
42
42
|
return StructuredToolResult(
|
|
43
|
-
status=
|
|
43
|
+
status=StructuredToolResultStatus.ERROR,
|
|
44
44
|
error="Error: Bash executable or command not found. Ensure bash is installed and the command is valid.",
|
|
45
45
|
params=params,
|
|
46
46
|
)
|
|
47
47
|
except Exception as e:
|
|
48
48
|
return StructuredToolResult(
|
|
49
|
-
status=
|
|
49
|
+
status=StructuredToolResultStatus.ERROR,
|
|
50
50
|
error=f"Error executing command '{cmd}': {str(e)}",
|
|
51
51
|
params=params,
|
|
52
52
|
)
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
toolsets:
|
|
2
|
+
cilium/core:
|
|
3
|
+
description: "Cilium CNI and Hubble network observability tools for troubleshooting network connectivity and security"
|
|
4
|
+
docs_url: "https://robusta-dev.github.io/holmesgpt/data-sources/builtin-toolsets/cilium/"
|
|
5
|
+
icon_url: "https://cilium.io/static/logo-dark-text.png"
|
|
6
|
+
llm_instructions: |
|
|
7
|
+
You have access to comprehensive Cilium and Hubble tools for debugging Kubernetes networking and security.
|
|
8
|
+
Use these tools to investigate:
|
|
9
|
+
- Network connectivity issues between pods/services
|
|
10
|
+
- Network policy violations
|
|
11
|
+
- Load balancing problems
|
|
12
|
+
- Network flows and traffic patterns
|
|
13
|
+
- Cilium component health and status
|
|
14
|
+
|
|
15
|
+
ALWAYS follow these troubleshooting steps:
|
|
16
|
+
1. Check Cilium agent status and health, and attempt to remediate if broken.
|
|
17
|
+
2. Use Hubble to observe network flows and identify blocked or dropped traffic
|
|
18
|
+
3. Check network policies if traffic is being denied
|
|
19
|
+
4. Examine Cilium endpoints and services for configuration issues
|
|
20
|
+
5. Analyze BPF maps and load balancer state if needed
|
|
21
|
+
6. Run connectivity tests ONLY as a last resort (they take 5+ minutes)
|
|
22
|
+
|
|
23
|
+
{% if tool_names|list|length > 0 %}
|
|
24
|
+
The following Cilium commands are available: {{ ", ".join(tool_names) }}
|
|
25
|
+
{% endif %}
|
|
26
|
+
|
|
27
|
+
ALWAYS provide specific actionable solutions based on the observed data rather than generic troubleshooting advice.
|
|
28
|
+
tags:
|
|
29
|
+
- cli
|
|
30
|
+
prerequisites:
|
|
31
|
+
- command: "cilium status"
|
|
32
|
+
config:
|
|
33
|
+
timeout: 300 # Default timeout in seconds - connectivity tests can take 5+ minutes to complete
|
|
34
|
+
|
|
35
|
+
tools:
|
|
36
|
+
# Cilium Status and Health
|
|
37
|
+
- name: "cilium_status"
|
|
38
|
+
description: "Display overall Cilium agent status and health"
|
|
39
|
+
command: "cilium status"
|
|
40
|
+
|
|
41
|
+
- name: "cilium_status_verbose"
|
|
42
|
+
description: "Display detailed Cilium agent status with verbose output"
|
|
43
|
+
command: "cilium status --verbose"
|
|
44
|
+
|
|
45
|
+
- name: "cilium_version"
|
|
46
|
+
description: "Show Cilium version information"
|
|
47
|
+
command: "cilium version"
|
|
48
|
+
|
|
49
|
+
- name: "cilium_config"
|
|
50
|
+
description: "Display current Cilium configuration"
|
|
51
|
+
command: "cilium config view"
|
|
52
|
+
|
|
53
|
+
# Connectivity Testing
|
|
54
|
+
- name: "cilium_connectivity_test"
|
|
55
|
+
description: "Run comprehensive connectivity tests to validate network functionality (SLOW: 5+ minutes - use as last resort)"
|
|
56
|
+
command: "timeout {{ config.timeout | default(600) }} cilium connectivity test --test-concurrency 1"
|
|
57
|
+
|
|
58
|
+
- name: "cilium_connectivity_test_namespace"
|
|
59
|
+
description: "Run connectivity tests in a specific namespace (SLOW: 5+ minutes - use as last resort)"
|
|
60
|
+
command: "timeout {{ config.timeout | default(600) }} cilium connectivity test --test-namespace {{ namespace }}"
|
|
61
|
+
args:
|
|
62
|
+
- name: "namespace"
|
|
63
|
+
type: "string"
|
|
64
|
+
description: "Kubernetes namespace to run connectivity tests in"
|
|
65
|
+
|
|
66
|
+
# Cluster and Node Information
|
|
67
|
+
- name: "cilium_clustermesh_status"
|
|
68
|
+
description: "Display cluster mesh status for multi-cluster networking"
|
|
69
|
+
command: "cilium clustermesh status"
|
|
70
|
+
|
|
71
|
+
# Features and Configuration
|
|
72
|
+
- name: "cilium_features_status"
|
|
73
|
+
description: "Report which features are enabled in Cilium agents"
|
|
74
|
+
command: "cilium features status"
|
|
75
|
+
|
|
76
|
+
# BGP Control Plane
|
|
77
|
+
- name: "cilium_bgp_peers"
|
|
78
|
+
description: "List BGP peers for nodes running Cilium (requires BGP enabled)"
|
|
79
|
+
command: "cilium bgp peers || echo 'BGP not enabled or not properly configured'"
|
|
80
|
+
|
|
81
|
+
- name: "cilium_bgp_routes"
|
|
82
|
+
description: "List BGP routes for nodes running Cilium (requires BGP enabled)"
|
|
83
|
+
command: "cilium bgp routes || echo 'BGP not enabled or not properly configured'"
|
|
84
|
+
|
|
85
|
+
# Encryption (requires Cilium 1.18+)
|
|
86
|
+
- name: "cilium_encryption_status"
|
|
87
|
+
description: "Show encryption status and configuration (requires Cilium 1.18+)"
|
|
88
|
+
command: "cilium encryption status || echo 'Encryption status not supported in this Cilium version'"
|
|
89
|
+
|
|
90
|
+
# System Diagnostics
|
|
91
|
+
- name: "cilium_sysdump"
|
|
92
|
+
description: "Collect system information for troubleshooting Cilium issues"
|
|
93
|
+
command: "cilium sysdump --output-filename cilium-sysdump-$(date +%Y%m%d-%H%M%S).zip"
|
|
94
|
+
|
|
95
|
+
# Installation and Upgrade
|
|
96
|
+
- name: "cilium_install_status"
|
|
97
|
+
description: "Check Cilium installation status in the cluster"
|
|
98
|
+
command: "cilium status --wait"
|
|
99
|
+
|
|
100
|
+
- name: "cilium_context"
|
|
101
|
+
description: "Display the current Kubernetes context configuration"
|
|
102
|
+
command: "cilium context"
|
|
103
|
+
|
|
104
|
+
# Multicast (only works if multicast is enabled)
|
|
105
|
+
- name: "cilium_multicast_groups"
|
|
106
|
+
description: "List multicast groups and their members (requires multicast enabled)"
|
|
107
|
+
command: "cilium multicast list group || echo 'Multicast not enabled in this cluster'"
|
|
108
|
+
|
|
109
|
+
hubble/observability:
|
|
110
|
+
description: "Hubble network observability tools for monitoring and troubleshooting network flows"
|
|
111
|
+
docs_url: "https://robusta-dev.github.io/holmesgpt/data-sources/builtin-toolsets/cilium/#hubble"
|
|
112
|
+
icon_url: "https://raw.githubusercontent.com/cilium/hubble/main/Documentation/images/hubble_logo.png"
|
|
113
|
+
llm_instructions: |
|
|
114
|
+
Use Hubble to observe and analyze network traffic flows in your Kubernetes cluster.
|
|
115
|
+
|
|
116
|
+
IMPORTANT: Hubble commands require a running Hubble server (hubble-relay) to be accessible.
|
|
117
|
+
If Hubble server is not available, these commands will fail with connection errors.
|
|
118
|
+
Use 'cilium hubble enable' to enable Hubble if needed.
|
|
119
|
+
|
|
120
|
+
Hubble provides deep visibility into:
|
|
121
|
+
- HTTP/gRPC/DNS traffic flows
|
|
122
|
+
- Network policy drops and allows
|
|
123
|
+
- Service-to-service communication patterns
|
|
124
|
+
- Security events and anomalies
|
|
125
|
+
- Cilium agent events and debug information
|
|
126
|
+
|
|
127
|
+
When troubleshooting with Hubble:
|
|
128
|
+
1. Start with broad flow observations to understand traffic patterns
|
|
129
|
+
2. Filter by specific pods, namespaces, or protocols as needed
|
|
130
|
+
3. Look for dropped flows to identify policy issues
|
|
131
|
+
4. Check DNS resolution problems
|
|
132
|
+
5. Analyze L7 protocols for application-level issues
|
|
133
|
+
6. Use policy verdicts to understand network policy behavior
|
|
134
|
+
|
|
135
|
+
Use time windows and limits to focus on recent events during incident investigation.
|
|
136
|
+
Note: Some advanced features like agent-events and debug-events may not be available in all Hubble versions.
|
|
137
|
+
tags:
|
|
138
|
+
- cli
|
|
139
|
+
prerequisites:
|
|
140
|
+
- command: "hubble version"
|
|
141
|
+
- command: "hubble status"
|
|
142
|
+
config:
|
|
143
|
+
timeout: 300 # Default timeout in seconds for potentially long-running commands
|
|
144
|
+
|
|
145
|
+
tools:
|
|
146
|
+
# Flow Observation
|
|
147
|
+
- name: "hubble_observe"
|
|
148
|
+
description: "Observe network flows in real-time (last 1000 flows)"
|
|
149
|
+
command: "hubble observe --last 1000"
|
|
150
|
+
transformers:
|
|
151
|
+
- name: llm_summarize
|
|
152
|
+
config:
|
|
153
|
+
input_threshold: 1000
|
|
154
|
+
prompt: |
|
|
155
|
+
Summarize this hubble observe output focusing on
|
|
156
|
+
- Notable traffic patterns.
|
|
157
|
+
- Traffic drops of any kind, source, destination, protocol, etc.
|
|
158
|
+
- Errors that might indicate network issues.
|
|
159
|
+
|
|
160
|
+
- name: "hubble_observe_namespace"
|
|
161
|
+
description: "Observe flows for a specific namespace"
|
|
162
|
+
command: "hubble observe --namespace {{ namespace }} --last 100"
|
|
163
|
+
args:
|
|
164
|
+
- name: "namespace"
|
|
165
|
+
type: "string"
|
|
166
|
+
description: "Kubernetes namespace to observe flows for"
|
|
167
|
+
|
|
168
|
+
- name: "hubble_observe_pod"
|
|
169
|
+
description: "Observe flows to/from a specific pod (format: namespace/pod-name)"
|
|
170
|
+
command: "hubble observe --pod {{ pod_name }} --last 100"
|
|
171
|
+
args:
|
|
172
|
+
- name: "pod_name"
|
|
173
|
+
type: "string"
|
|
174
|
+
description: "Pod name in format namespace/pod-name or just pod-name (defaults to 'default' namespace)"
|
|
175
|
+
|
|
176
|
+
- name: "hubble_observe_since"
|
|
177
|
+
description: "Observe flows since a specific time (e.g., '5m', '1h', '2023-01-01T10:00:00Z')"
|
|
178
|
+
command: "timeout {{ config.timeout | default(30) }} hubble observe --since {{ time_duration }}"
|
|
179
|
+
args:
|
|
180
|
+
- name: "time_duration"
|
|
181
|
+
type: "string"
|
|
182
|
+
description: "Time duration or timestamp (e.g., '5m', '1h', '2023-01-01T10:00:00Z')"
|
|
183
|
+
|
|
184
|
+
# Protocol-Specific Observation
|
|
185
|
+
- name: "hubble_observe_http"
|
|
186
|
+
description: "Observe HTTP traffic flows"
|
|
187
|
+
command: "hubble observe --protocol http --last 100"
|
|
188
|
+
|
|
189
|
+
- name: "hubble_observe_dns"
|
|
190
|
+
description: "Observe DNS queries and responses"
|
|
191
|
+
command: "hubble observe --protocol dns --last 100"
|
|
192
|
+
|
|
193
|
+
- name: "hubble_observe_grpc"
|
|
194
|
+
description: "Observe gRPC traffic flows"
|
|
195
|
+
command: "hubble observe --protocol grpc --last 100"
|
|
196
|
+
|
|
197
|
+
# Traffic Analysis
|
|
198
|
+
- name: "hubble_observe_drops"
|
|
199
|
+
description: "Show only dropped network flows (policy denials, etc.)"
|
|
200
|
+
command: "hubble observe --verdict DROPPED --last 100"
|
|
201
|
+
|
|
202
|
+
- name: "hubble_observe_forwarded"
|
|
203
|
+
description: "Show flows that were successfully forwarded"
|
|
204
|
+
command: "hubble observe --verdict FORWARDED --last 100"
|
|
205
|
+
|
|
206
|
+
- name: "hubble_observe_service"
|
|
207
|
+
description: "Observe flows to/from a specific service (format: namespace/service-name)"
|
|
208
|
+
command: "hubble observe --service {{ service_name }} --last 100"
|
|
209
|
+
args:
|
|
210
|
+
- name: "service_name"
|
|
211
|
+
type: "string"
|
|
212
|
+
description: "Service name in format namespace/service-name or just service-name (defaults to 'default' namespace)"
|
|
213
|
+
|
|
214
|
+
- name: "hubble_observe_port"
|
|
215
|
+
description: "Observe flows on a specific port"
|
|
216
|
+
command: "hubble observe --port {{ port }} --last 100"
|
|
217
|
+
args:
|
|
218
|
+
- name: "port"
|
|
219
|
+
type: "integer"
|
|
220
|
+
description: "Port number to filter flows by (e.g., 8080, 443)"
|
|
221
|
+
|
|
222
|
+
# Flow Filtering and Analysis
|
|
223
|
+
- name: "hubble_observe_from_pod"
|
|
224
|
+
description: "Observe flows originating from a specific pod (pass namespace and pod name as separate arguments; they are combined as namespace/pod-name)"
|
|
225
|
+
command: "hubble observe --from-pod {{ namespace }}/{{ pod_name }} --last 100"
|
|
226
|
+
args:
|
|
227
|
+
- name: "namespace"
|
|
228
|
+
type: "string"
|
|
229
|
+
description: "Kubernetes namespace where the source pod is located"
|
|
230
|
+
- name: "pod_name"
|
|
231
|
+
type: "string"
|
|
232
|
+
description: "Name of the source pod"
|
|
233
|
+
|
|
234
|
+
- name: "hubble_observe_to_pod"
|
|
235
|
+
description: "Observe flows destined to a specific pod (pass namespace and pod name as separate arguments; they are combined as namespace/pod-name)"
|
|
236
|
+
command: "hubble observe --to-pod {{ namespace }}/{{ pod_name }} --last 100"
|
|
237
|
+
args:
|
|
238
|
+
- name: "namespace"
|
|
239
|
+
type: "string"
|
|
240
|
+
description: "Kubernetes namespace where the destination pod is located"
|
|
241
|
+
- name: "pod_name"
|
|
242
|
+
type: "string"
|
|
243
|
+
description: "Name of the destination pod"
|
|
244
|
+
|
|
245
|
+
- name: "hubble_observe_between_namespaces"
|
|
246
|
+
description: "Observe flows between two specific namespaces"
|
|
247
|
+
command: "hubble observe --from-namespace {{ src_namespace }} --to-namespace {{ dst_namespace }} --last 100"
|
|
248
|
+
args:
|
|
249
|
+
- name: "src_namespace"
|
|
250
|
+
type: "string"
|
|
251
|
+
description: "Source namespace to filter flows from"
|
|
252
|
+
- name: "dst_namespace"
|
|
253
|
+
type: "string"
|
|
254
|
+
description: "Destination namespace to filter flows to"
|
|
255
|
+
|
|
256
|
+
- name: "hubble_observe_json"
|
|
257
|
+
description: "Output flow observations in JSON format for detailed analysis"
|
|
258
|
+
command: "hubble observe --output json --last 100"
|
|
259
|
+
|
|
260
|
+
# Status and Metrics
|
|
261
|
+
- name: "hubble_status"
|
|
262
|
+
description: "Display Hubble server status and configuration"
|
|
263
|
+
command: "hubble status"
|
|
264
|
+
|
|
265
|
+
- name: "hubble_list_nodes"
|
|
266
|
+
description: "List nodes available for flow observation"
|
|
267
|
+
command: "hubble list nodes"
|
|
268
|
+
|
|
269
|
+
- name: "hubble_observe_flows_summary"
|
|
270
|
+
description: "List the last 100 network flows in compact output format (raw flows; no aggregate statistics are computed)"
|
|
271
|
+
command: "hubble observe --last 100 --output compact"
|
|
272
|
+
|
|
273
|
+
# Security and Policy Analysis
|
|
274
|
+
- name: "hubble_observe_security_events"
|
|
275
|
+
description: "Observe security-related network events and policy violations (shows dropped flows; same filter as hubble_observe_drops)"
|
|
276
|
+
command: "hubble observe --verdict DROPPED --last 100"
|
|
277
|
+
|
|
278
|
+
- name: "hubble_observe_policy_verdicts"
|
|
279
|
+
description: "Show policy verdict events (allows and denies)"
|
|
280
|
+
command: "hubble observe --type policy-verdict --last 100"
|
|
281
|
+
|
|
282
|
+
- name: "hubble_observe_l7_traffic"
|
|
283
|
+
description: "Show L7 (application-layer) traffic flows"
|
|
284
|
+
command: "hubble observe --type l7 --last 100"
|
|
@@ -3,7 +3,7 @@ from typing import Any, Optional, Tuple, Set
|
|
|
3
3
|
from holmes.core.tools import (
|
|
4
4
|
CallablePrerequisite,
|
|
5
5
|
StructuredToolResult,
|
|
6
|
-
|
|
6
|
+
StructuredToolResultStatus,
|
|
7
7
|
ToolsetTag,
|
|
8
8
|
)
|
|
9
9
|
from holmes.plugins.toolsets.consts import (
|
|
@@ -74,7 +74,7 @@ class CoralogixLogsToolset(BasePodLoggingToolset):
|
|
|
74
74
|
def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
|
|
75
75
|
if not self.coralogix_config:
|
|
76
76
|
return StructuredToolResult(
|
|
77
|
-
status=
|
|
77
|
+
status=StructuredToolResultStatus.ERROR,
|
|
78
78
|
error=f"The {self.name} toolset is not configured",
|
|
79
79
|
params=params.model_dump(),
|
|
80
80
|
)
|
|
@@ -102,7 +102,9 @@ class CoralogixLogsToolset(BasePodLoggingToolset):
|
|
|
102
102
|
|
|
103
103
|
return StructuredToolResult(
|
|
104
104
|
status=(
|
|
105
|
-
|
|
105
|
+
StructuredToolResultStatus.ERROR
|
|
106
|
+
if logs_data.error
|
|
107
|
+
else StructuredToolResultStatus.SUCCESS
|
|
106
108
|
),
|
|
107
109
|
error=logs_data.error,
|
|
108
110
|
data=data,
|