holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +17 -4
  3. holmes/common/env_vars.py +40 -1
  4. holmes/config.py +114 -144
  5. holmes/core/conversations.py +53 -14
  6. holmes/core/feedback.py +191 -0
  7. holmes/core/investigation.py +18 -22
  8. holmes/core/llm.py +489 -88
  9. holmes/core/models.py +103 -1
  10. holmes/core/openai_formatting.py +13 -0
  11. holmes/core/prompt.py +1 -1
  12. holmes/core/safeguards.py +4 -4
  13. holmes/core/supabase_dal.py +293 -100
  14. holmes/core/tool_calling_llm.py +423 -323
  15. holmes/core/tools.py +311 -33
  16. holmes/core/tools_utils/token_counting.py +14 -0
  17. holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
  18. holmes/core/tools_utils/tool_executor.py +13 -8
  19. holmes/core/toolset_manager.py +155 -4
  20. holmes/core/tracing.py +6 -1
  21. holmes/core/transformers/__init__.py +23 -0
  22. holmes/core/transformers/base.py +62 -0
  23. holmes/core/transformers/llm_summarize.py +174 -0
  24. holmes/core/transformers/registry.py +122 -0
  25. holmes/core/transformers/transformer.py +31 -0
  26. holmes/core/truncation/compaction.py +59 -0
  27. holmes/core/truncation/dal_truncation_utils.py +23 -0
  28. holmes/core/truncation/input_context_window_limiter.py +218 -0
  29. holmes/interactive.py +177 -24
  30. holmes/main.py +7 -4
  31. holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
  32. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  33. holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
  34. holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
  35. holmes/plugins/prompts/generic_ask.jinja2 +2 -4
  36. holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
  37. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
  38. holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
  39. holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
  40. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
  41. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
  42. holmes/plugins/runbooks/__init__.py +117 -18
  43. holmes/plugins/runbooks/catalog.json +2 -0
  44. holmes/plugins/toolsets/__init__.py +21 -8
  45. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  46. holmes/plugins/toolsets/aks.yaml +64 -0
  47. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
  48. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
  49. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
  50. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
  51. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
  52. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
  53. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
  54. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
  55. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
  56. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
  57. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
  58. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
  59. holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
  60. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  61. holmes/plugins/toolsets/cilium.yaml +284 -0
  62. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  63. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  64. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  65. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
  66. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  67. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
  68. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
  69. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
  70. holmes/plugins/toolsets/git.py +51 -46
  71. holmes/plugins/toolsets/grafana/common.py +15 -3
  72. holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
  73. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
  74. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
  75. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
  76. holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
  77. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
  78. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  79. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
  80. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  81. holmes/plugins/toolsets/internet/internet.py +6 -7
  82. holmes/plugins/toolsets/internet/notion.py +5 -6
  83. holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
  84. holmes/plugins/toolsets/kafka.py +25 -36
  85. holmes/plugins/toolsets/kubernetes.yaml +58 -84
  86. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  87. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  88. holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
  89. holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
  90. holmes/plugins/toolsets/newrelic/__init__.py +0 -0
  91. holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
  92. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
  93. holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
  94. holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
  95. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  96. holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  97. holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
  98. holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
  99. holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
  100. holmes/plugins/toolsets/openshift.yaml +283 -0
  101. holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
  102. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
  103. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  104. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
  105. holmes/plugins/toolsets/robusta/robusta.py +236 -65
  106. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  107. holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
  108. holmes/plugins/toolsets/service_discovery.py +1 -1
  109. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  110. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  111. holmes/plugins/toolsets/utils.py +88 -0
  112. holmes/utils/config_utils.py +91 -0
  113. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  114. holmes/utils/env.py +7 -0
  115. holmes/utils/global_instructions.py +75 -10
  116. holmes/utils/holmes_status.py +2 -1
  117. holmes/utils/holmes_sync_toolsets.py +0 -2
  118. holmes/utils/krr_utils.py +188 -0
  119. holmes/utils/sentry_helper.py +41 -0
  120. holmes/utils/stream.py +61 -7
  121. holmes/version.py +34 -14
  122. holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
  123. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
  124. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
  125. holmes/core/performance_timing.py +0 -72
  126. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  127. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  128. holmes/plugins/toolsets/newrelic.py +0 -231
  129. holmes/plugins/toolsets/servicenow/install.md +0 -37
  130. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  131. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  132. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  133. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
  134. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
@@ -2,7 +2,11 @@ import logging
2
2
  from typing import Dict
3
3
  from datetime import datetime, timezone
4
4
 
5
- from holmes.core.tools import StructuredToolResult, ToolResultStatus
5
+ from holmes.core.tools import (
6
+ StructuredToolResult,
7
+ StructuredToolResultStatus,
8
+ ToolInvokeContext,
9
+ )
6
10
  from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
7
11
  BaseAzureSQLTool,
8
12
  BaseAzureSQLToolset,
@@ -147,9 +151,7 @@ class GetActiveAlerts(BaseAzureSQLTool):
147
151
 
148
152
  return "\n".join(report_sections)
149
153
 
150
- def _invoke(
151
- self, params: dict, user_approved: bool = False
152
- ) -> StructuredToolResult:
154
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
153
155
  try:
154
156
  db_config = self.toolset.database_config()
155
157
  api_client = self.toolset.api_client()
@@ -170,7 +172,7 @@ class GetActiveAlerts(BaseAzureSQLTool):
170
172
  # Check for errors
171
173
  if "error" in alerts_data:
172
174
  return StructuredToolResult(
173
- status=ToolResultStatus.ERROR,
175
+ status=StructuredToolResultStatus.ERROR,
174
176
  error=alerts_data["error"],
175
177
  params=params,
176
178
  )
@@ -179,7 +181,7 @@ class GetActiveAlerts(BaseAzureSQLTool):
179
181
  report_text = self._build_alerts_report(db_config, alerts_data, "active")
180
182
 
181
183
  return StructuredToolResult(
182
- status=ToolResultStatus.SUCCESS,
184
+ status=StructuredToolResultStatus.SUCCESS,
183
185
  data=report_text,
184
186
  params=params,
185
187
  )
@@ -187,7 +189,7 @@ class GetActiveAlerts(BaseAzureSQLTool):
187
189
  error_msg = f"Failed to retrieve active alerts: {str(e)}"
188
190
  logging.error(error_msg)
189
191
  return StructuredToolResult(
190
- status=ToolResultStatus.ERROR,
192
+ status=StructuredToolResultStatus.ERROR,
191
193
  error=error_msg,
192
194
  params=params,
193
195
  )
@@ -1,7 +1,12 @@
1
1
  import logging
2
2
  from typing import Dict, List, Tuple
3
3
 
4
- from holmes.core.tools import StructuredToolResult, ToolParameter, ToolResultStatus
4
+ from holmes.core.tools import (
5
+ StructuredToolResult,
6
+ ToolInvokeContext,
7
+ ToolParameter,
8
+ StructuredToolResultStatus,
9
+ )
5
10
  from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
6
11
  BaseAzureSQLTool,
7
12
  BaseAzureSQLToolset,
@@ -99,9 +104,7 @@ class GetSlowQueries(BaseAzureSQLTool):
99
104
 
100
105
  return "\n".join(report_sections)
101
106
 
102
- def _invoke(
103
- self, params: dict, user_approved: bool = False
104
- ) -> StructuredToolResult:
107
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
105
108
  try:
106
109
  top_count = params.get("top_count", 15)
107
110
  hours_back = params.get("hours_back", 2)
@@ -125,7 +128,7 @@ class GetSlowQueries(BaseAzureSQLTool):
125
128
  )
126
129
 
127
130
  return StructuredToolResult(
128
- status=ToolResultStatus.SUCCESS,
131
+ status=StructuredToolResultStatus.SUCCESS,
129
132
  data=report_text,
130
133
  params=params,
131
134
  )
@@ -133,7 +136,7 @@ class GetSlowQueries(BaseAzureSQLTool):
133
136
  error_msg = f"Failed to get slow queries: {str(e)}"
134
137
  logging.error(error_msg)
135
138
  return StructuredToolResult(
136
- status=ToolResultStatus.ERROR,
139
+ status=StructuredToolResultStatus.ERROR,
137
140
  error=error_msg,
138
141
  params=params,
139
142
  )
@@ -1,7 +1,12 @@
1
1
  import logging
2
2
  from typing import Dict, List, Tuple
3
3
 
4
- from holmes.core.tools import StructuredToolResult, ToolParameter, ToolResultStatus
4
+ from holmes.core.tools import (
5
+ StructuredToolResult,
6
+ ToolInvokeContext,
7
+ ToolParameter,
8
+ StructuredToolResultStatus,
9
+ )
5
10
  from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
6
11
  BaseAzureSQLTool,
7
12
  BaseAzureSQLToolset,
@@ -97,9 +102,7 @@ class GetTopCPUQueries(BaseAzureSQLTool):
97
102
 
98
103
  return "\n".join(report_sections)
99
104
 
100
- def _invoke(
101
- self, params: dict, user_approved: bool = False
102
- ) -> StructuredToolResult:
105
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
103
106
  try:
104
107
  top_count = params.get("top_count", 15)
105
108
  hours_back = params.get("hours_back", 2)
@@ -123,7 +126,7 @@ class GetTopCPUQueries(BaseAzureSQLTool):
123
126
  )
124
127
 
125
128
  return StructuredToolResult(
126
- status=ToolResultStatus.SUCCESS,
129
+ status=StructuredToolResultStatus.SUCCESS,
127
130
  data=report_text,
128
131
  params=params,
129
132
  )
@@ -131,7 +134,7 @@ class GetTopCPUQueries(BaseAzureSQLTool):
131
134
  error_msg = f"Failed to get top CPU queries: {str(e)}"
132
135
  logging.error(error_msg)
133
136
  return StructuredToolResult(
134
- status=ToolResultStatus.ERROR,
137
+ status=StructuredToolResultStatus.ERROR,
135
138
  error=error_msg,
136
139
  params=params,
137
140
  )
@@ -1,7 +1,12 @@
1
1
  import logging
2
2
  from typing import Dict, List, Tuple
3
3
 
4
- from holmes.core.tools import StructuredToolResult, ToolParameter, ToolResultStatus
4
+ from holmes.core.tools import (
5
+ StructuredToolResult,
6
+ ToolInvokeContext,
7
+ ToolParameter,
8
+ StructuredToolResultStatus,
9
+ )
5
10
  from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
6
11
  BaseAzureSQLTool,
7
12
  BaseAzureSQLToolset,
@@ -115,9 +120,7 @@ class GetTopDataIOQueries(BaseAzureSQLTool):
115
120
 
116
121
  return "\n".join(report_sections)
117
122
 
118
- def _invoke(
119
- self, params: dict, user_approved: bool = False
120
- ) -> StructuredToolResult:
123
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
121
124
  try:
122
125
  top_count = params.get("top_count", 15)
123
126
  hours_back = params.get("hours_back", 2)
@@ -141,7 +144,7 @@ class GetTopDataIOQueries(BaseAzureSQLTool):
141
144
  )
142
145
 
143
146
  return StructuredToolResult(
144
- status=ToolResultStatus.SUCCESS,
147
+ status=StructuredToolResultStatus.SUCCESS,
145
148
  data=report_text,
146
149
  params=params,
147
150
  )
@@ -149,7 +152,7 @@ class GetTopDataIOQueries(BaseAzureSQLTool):
149
152
  error_msg = f"Failed to get top data I/O queries: {str(e)}"
150
153
  logging.error(error_msg)
151
154
  return StructuredToolResult(
152
- status=ToolResultStatus.ERROR,
155
+ status=StructuredToolResultStatus.ERROR,
153
156
  error=error_msg,
154
157
  params=params,
155
158
  )
@@ -1,7 +1,12 @@
1
1
  import logging
2
2
  from typing import Dict, List, Tuple
3
3
 
4
- from holmes.core.tools import StructuredToolResult, ToolParameter, ToolResultStatus
4
+ from holmes.core.tools import (
5
+ StructuredToolResult,
6
+ ToolInvokeContext,
7
+ ToolParameter,
8
+ StructuredToolResultStatus,
9
+ )
5
10
  from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
6
11
  BaseAzureSQLTool,
7
12
  BaseAzureSQLToolset,
@@ -107,9 +112,7 @@ class GetTopLogIOQueries(BaseAzureSQLTool):
107
112
 
108
113
  return "\n".join(report_sections)
109
114
 
110
- def _invoke(
111
- self, params: dict, user_approved: bool = False
112
- ) -> StructuredToolResult:
115
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
113
116
  try:
114
117
  top_count = params.get("top_count", 15)
115
118
  hours_back = params.get("hours_back", 2)
@@ -133,7 +136,7 @@ class GetTopLogIOQueries(BaseAzureSQLTool):
133
136
  )
134
137
 
135
138
  return StructuredToolResult(
136
- status=ToolResultStatus.SUCCESS,
139
+ status=StructuredToolResultStatus.SUCCESS,
137
140
  data=report_text,
138
141
  params=params,
139
142
  )
@@ -141,7 +144,7 @@ class GetTopLogIOQueries(BaseAzureSQLTool):
141
144
  error_msg = f"Failed to get top log I/O queries: {str(e)}"
142
145
  logging.error(error_msg)
143
146
  return StructuredToolResult(
144
- status=ToolResultStatus.ERROR,
147
+ status=StructuredToolResultStatus.ERROR,
145
148
  error=error_msg,
146
149
  params=params,
147
150
  )
@@ -16,8 +16,9 @@ from holmes.core.tools import (
16
16
  CallablePrerequisite,
17
17
  StructuredToolResult,
18
18
  Tool,
19
+ ToolInvokeContext,
19
20
  ToolParameter,
20
- ToolResultStatus,
21
+ StructuredToolResultStatus,
21
22
  Toolset,
22
23
  ToolsetTag,
23
24
  )
@@ -82,9 +83,7 @@ class KubectlRunImageCommand(BaseBashTool):
82
83
  command_str = get_param_or_raise(params, "command")
83
84
  return f"kubectl run {pod_name} --image={image} --namespace={namespace} --rm --attach --restart=Never -i -- {command_str}"
84
85
 
85
- def _invoke(
86
- self, params: dict, user_approved: bool = False
87
- ) -> StructuredToolResult:
86
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
88
87
  timeout = params.get("timeout", 60)
89
88
 
90
89
  image = get_param_or_raise(params, "image")
@@ -94,7 +93,7 @@ class KubectlRunImageCommand(BaseBashTool):
94
93
 
95
94
  if namespace and not re.match(SAFE_NAMESPACE_PATTERN, namespace):
96
95
  return StructuredToolResult(
97
- status=ToolResultStatus.ERROR,
96
+ status=StructuredToolResultStatus.ERROR,
98
97
  error=f"Error: The namespace is invalid. Valid namespaces must match the following regexp: {SAFE_NAMESPACE_PATTERN}",
99
98
  params=params,
100
99
  )
@@ -118,7 +117,7 @@ class KubectlRunImageCommand(BaseBashTool):
118
117
  }
119
118
  )
120
119
  return StructuredToolResult(
121
- status=ToolResultStatus.ERROR,
120
+ status=StructuredToolResultStatus.ERROR,
122
121
  error=str(e),
123
122
  params=params,
124
123
  )
@@ -164,22 +163,20 @@ class RunBashCommand(BaseBashTool):
164
163
  toolset=toolset,
165
164
  )
166
165
 
167
- def _invoke(
168
- self, params: dict, user_approved: bool = False
169
- ) -> StructuredToolResult:
166
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
170
167
  command_str = params.get("command")
171
168
  timeout = params.get("timeout", 60)
172
169
 
173
170
  if not command_str:
174
171
  return StructuredToolResult(
175
- status=ToolResultStatus.ERROR,
172
+ status=StructuredToolResultStatus.ERROR,
176
173
  error="The 'command' parameter is required and was not provided.",
177
174
  params=params,
178
175
  )
179
176
 
180
177
  if not isinstance(command_str, str):
181
178
  return StructuredToolResult(
182
- status=ToolResultStatus.ERROR,
179
+ status=StructuredToolResultStatus.ERROR,
183
180
  error=f"The 'command' parameter must be a string, got {type(command_str).__name__}.",
184
181
  params=params,
185
182
  )
@@ -187,7 +184,7 @@ class RunBashCommand(BaseBashTool):
187
184
  command_to_execute = command_str
188
185
 
189
186
  # Only run the safety check if user has NOT approved the command
190
- if not user_approved:
187
+ if not context.user_approved:
191
188
  try:
192
189
  command_to_execute = make_command_safe(command_str, self.toolset.config)
193
190
 
@@ -202,7 +199,7 @@ class RunBashCommand(BaseBashTool):
202
199
  logging.info(f"Refusing LLM tool call {command_str}")
203
200
 
204
201
  return StructuredToolResult(
205
- status=ToolResultStatus.APPROVAL_REQUIRED,
202
+ status=StructuredToolResultStatus.APPROVAL_REQUIRED,
206
203
  error=f"Refusing to execute bash command. {str(e)}",
207
204
  params=params,
208
205
  invocation=command_str,
@@ -1,5 +1,5 @@
1
1
  import subprocess
2
- from holmes.core.tools import StructuredToolResult, ToolResultStatus
2
+ from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
3
3
 
4
4
 
5
5
  def execute_bash_command(cmd: str, timeout: int, params: dict) -> StructuredToolResult:
@@ -18,11 +18,11 @@ def execute_bash_command(cmd: str, timeout: int, params: dict) -> StructuredTool
18
18
  stdout = process.stdout.strip() if process.stdout else ""
19
19
  result_data = f"{cmd}\n" f"{stdout}"
20
20
 
21
- status = ToolResultStatus.ERROR
21
+ status = StructuredToolResultStatus.ERROR
22
22
  if process.returncode == 0 and stdout:
23
- status = ToolResultStatus.SUCCESS
23
+ status = StructuredToolResultStatus.SUCCESS
24
24
  elif not stdout:
25
- status = ToolResultStatus.NO_DATA
25
+ status = StructuredToolResultStatus.NO_DATA
26
26
 
27
27
  return StructuredToolResult(
28
28
  status=status,
@@ -33,20 +33,20 @@ def execute_bash_command(cmd: str, timeout: int, params: dict) -> StructuredTool
33
33
  )
34
34
  except subprocess.TimeoutExpired:
35
35
  return StructuredToolResult(
36
- status=ToolResultStatus.ERROR,
36
+ status=StructuredToolResultStatus.ERROR,
37
37
  error=f"Error: Command '{cmd}' timed out after {timeout} seconds.",
38
38
  params=params,
39
39
  )
40
40
  except FileNotFoundError:
41
41
  # This might occur if /bin/bash is not found, or if shell=False and command is not found
42
42
  return StructuredToolResult(
43
- status=ToolResultStatus.ERROR,
43
+ status=StructuredToolResultStatus.ERROR,
44
44
  error="Error: Bash executable or command not found. Ensure bash is installed and the command is valid.",
45
45
  params=params,
46
46
  )
47
47
  except Exception as e:
48
48
  return StructuredToolResult(
49
- status=ToolResultStatus.ERROR,
49
+ status=StructuredToolResultStatus.ERROR,
50
50
  error=f"Error executing command '{cmd}': {str(e)}",
51
51
  params=params,
52
52
  )
@@ -0,0 +1,284 @@
1
+ toolsets:
2
+ cilium/core:
3
+ description: "Cilium CNI and Hubble network observability tools for troubleshooting network connectivity and security"
4
+ docs_url: "https://robusta-dev.github.io/holmesgpt/data-sources/builtin-toolsets/cilium/"
5
+ icon_url: "https://cilium.io/static/logo-dark-text.png"
6
+ llm_instructions: |
7
+ You have access to comprehensive Cilium and Hubble tools for debugging Kubernetes networking and security.
8
+ Use these tools to investigate:
9
+ - Network connectivity issues between pods/services
10
+ - Network policy violations
11
+ - Load balancing problems
12
+ - Network flows and traffic patterns
13
+ - Cilium component health and status
14
+
15
+ ALWAYS follow these troubleshooting steps:
16
+ 1. Check Cilium agent status and health, and attempt to remediate if broken.
17
+ 2. Use Hubble to observe network flows and identify blocked or dropped traffic
18
+ 3. Check network policies if traffic is being denied
19
+ 4. Examine Cilium endpoints and services for configuration issues
20
+ 5. Analyze BPF maps and load balancer state if needed
21
+ 6. Run connectivity tests ONLY as a last resort (they take 5+ minutes)
22
+
23
+ {% if tool_names|list|length > 0 %}
24
+ The following Cilium commands are available: {{ ", ".join(tool_names) }}
25
+ {% endif %}
26
+
27
+ ALWAYS provide specific actionable solutions based on the observed data rather than generic troubleshooting advice.
28
+ tags:
29
+ - cli
30
+ prerequisites:
31
+ - command: "cilium status"
32
+ config:
33
+ timeout: 300 # Default timeout in seconds - connectivity tests can take 5+ minutes to complete
34
+
35
+ tools:
36
+ # Cilium Status and Health
37
+ - name: "cilium_status"
38
+ description: "Display overall Cilium agent status and health"
39
+ command: "cilium status"
40
+
41
+ - name: "cilium_status_verbose"
42
+ description: "Display detailed Cilium agent status with verbose output"
43
+ command: "cilium status --verbose"
44
+
45
+ - name: "cilium_version"
46
+ description: "Show Cilium version information"
47
+ command: "cilium version"
48
+
49
+ - name: "cilium_config"
50
+ description: "Display current Cilium configuration"
51
+ command: "cilium config view"
52
+
53
+ # Connectivity Testing
54
+ - name: "cilium_connectivity_test"
55
+ description: "Run comprehensive connectivity tests to validate network functionality (SLOW: 5+ minutes - use as last resort)"
56
+ command: "timeout {{ config.timeout | default(600) }} cilium connectivity test --test-concurrency 1"
57
+
58
+ - name: "cilium_connectivity_test_namespace"
59
+ description: "Run connectivity tests in a specific namespace (SLOW: 5+ minutes - use as last resort)"
60
+ command: "timeout {{ config.timeout | default(600) }} cilium connectivity test --test-namespace {{ namespace }}"
61
+ args:
62
+ - name: "namespace"
63
+ type: "string"
64
+ description: "Kubernetes namespace to run connectivity tests in"
65
+
66
+ # Cluster and Node Information
67
+ - name: "cilium_clustermesh_status"
68
+ description: "Display cluster mesh status for multi-cluster networking"
69
+ command: "cilium clustermesh status"
70
+
71
+ # Features and Configuration
72
+ - name: "cilium_features_status"
73
+ description: "Report which features are enabled in Cilium agents"
74
+ command: "cilium features status"
75
+
76
+ # BGP Control Plane
77
+ - name: "cilium_bgp_peers"
78
+ description: "List BGP peers for nodes running Cilium (requires BGP enabled)"
79
+ command: "cilium bgp peers || echo 'BGP not enabled or not properly configured'"
80
+
81
+ - name: "cilium_bgp_routes"
82
+ description: "List BGP routes for nodes running Cilium (requires BGP enabled)"
83
+ command: "cilium bgp routes || echo 'BGP not enabled or not properly configured'"
84
+
85
+ # Encryption (requires Cilium 1.18+)
86
+ - name: "cilium_encryption_status"
87
+ description: "Show encryption status and configuration (requires Cilium 1.18+)"
88
+ command: "cilium encryption status || echo 'Encryption status not supported in this Cilium version'"
89
+
90
+ # System Diagnostics
91
+ - name: "cilium_sysdump"
92
+ description: "Collect system information for troubleshooting Cilium issues"
93
+ command: "cilium sysdump --output-filename cilium-sysdump-$(date +%Y%m%d-%H%M%S).zip"
94
+
95
+ # Installation and Upgrade
96
+ - name: "cilium_install_status"
97
+ description: "Check Cilium installation status in the cluster"
98
+ command: "cilium status --wait"
99
+
100
+ - name: "cilium_context"
101
+ description: "Display the current Kubernetes context configuration"
102
+ command: "cilium context"
103
+
104
+ # Multicast (only works if multicast is enabled)
105
+ - name: "cilium_multicast_groups"
106
+ description: "List multicast groups and their members (requires multicast enabled)"
107
+ command: "cilium multicast list group || echo 'Multicast not enabled in this cluster'"
108
+
109
+ hubble/observability:
110
+ description: "Hubble network observability tools for monitoring and troubleshooting network flows"
111
+ docs_url: "https://robusta-dev.github.io/holmesgpt/data-sources/builtin-toolsets/cilium/#hubble"
112
+ icon_url: "https://raw.githubusercontent.com/cilium/hubble/main/Documentation/images/hubble_logo.png"
113
+ llm_instructions: |
114
+ Use Hubble to observe and analyze network traffic flows in your Kubernetes cluster.
115
+
116
+ IMPORTANT: Hubble commands require a running Hubble server (hubble-relay) to be accessible.
117
+ If Hubble server is not available, these commands will fail with connection errors.
118
+ Use 'cilium hubble enable' to enable Hubble if needed.
119
+
120
+ Hubble provides deep visibility into:
121
+ - HTTP/gRPC/DNS traffic flows
122
+ - Network policy drops and allows
123
+ - Service-to-service communication patterns
124
+ - Security events and anomalies
125
+ - Cilium agent events and debug information
126
+
127
+ When troubleshooting with Hubble:
128
+ 1. Start with broad flow observations to understand traffic patterns
129
+ 2. Filter by specific pods, namespaces, or protocols as needed
130
+ 3. Look for dropped flows to identify policy issues
131
+ 4. Check DNS resolution problems
132
+ 5. Analyze L7 protocols for application-level issues
133
+ 6. Use policy verdicts to understand network policy behavior
134
+
135
+ Use time windows and limits to focus on recent events during incident investigation.
136
+ Note: Some advanced features like agent-events and debug-events may not be available in all Hubble versions.
137
+ tags:
138
+ - cli
139
+ prerequisites:
140
+ - command: "hubble version"
141
+ - command: "hubble status"
142
+ config:
143
+ timeout: 300 # Default timeout in seconds for potentially long-running commands
144
+
145
+ tools:
146
+ # Flow Observation
147
+ - name: "hubble_observe"
148
+ description: "Observe network flows in real-time (last 1000 flows)"
149
+ command: "hubble observe --last 1000"
150
+ transformers:
151
+ - name: llm_summarize
152
+ config:
153
+ input_threshold: 1000
154
+ prompt: |
155
+ Summarize this hubble observe output focusing on
156
+ - Notable traffic patterns.
157
+ - Traffic drops of any kind, source, destination, protocol, etc.
158
+ - Errors that might indicate network issues.
159
+
160
+ - name: "hubble_observe_namespace"
161
+ description: "Observe flows for a specific namespace"
162
+ command: "hubble observe --namespace {{ namespace }} --last 100"
163
+ args:
164
+ - name: "namespace"
165
+ type: "string"
166
+ description: "Kubernetes namespace to observe flows for"
167
+
168
+ - name: "hubble_observe_pod"
169
+ description: "Observe flows to/from a specific pod (format: namespace/pod-name)"
170
+ command: "hubble observe --pod {{ pod_name }} --last 100"
171
+ args:
172
+ - name: "pod_name"
173
+ type: "string"
174
+ description: "Pod name in format namespace/pod-name or just pod-name (defaults to 'default' namespace)"
175
+
176
+ - name: "hubble_observe_since"
177
+ description: "Observe flows since a specific time (e.g., '5m', '1h', '2023-01-01T10:00:00Z')"
178
+ command: "timeout {{ config.timeout | default(30) }} hubble observe --since {{ time_duration }}"
179
+ args:
180
+ - name: "time_duration"
181
+ type: "string"
182
+ description: "Time duration or timestamp (e.g., '5m', '1h', '2023-01-01T10:00:00Z')"
183
+
184
+ # Protocol-Specific Observation
185
+ - name: "hubble_observe_http"
186
+ description: "Observe HTTP traffic flows"
187
+ command: "hubble observe --protocol http --last 100"
188
+
189
+ - name: "hubble_observe_dns"
190
+ description: "Observe DNS queries and responses"
191
+ command: "hubble observe --protocol dns --last 100"
192
+
193
+ - name: "hubble_observe_grpc"
194
+ description: "Observe gRPC traffic flows"
195
+ command: "hubble observe --protocol grpc --last 100"
196
+
197
+ # Traffic Analysis
198
+ - name: "hubble_observe_drops"
199
+ description: "Show only dropped network flows (policy denials, etc.)"
200
+ command: "hubble observe --verdict DROPPED --last 100"
201
+
202
+ - name: "hubble_observe_forwarded"
203
+ description: "Show flows that were successfully forwarded"
204
+ command: "hubble observe --verdict FORWARDED --last 100"
205
+
206
+ - name: "hubble_observe_service"
207
+ description: "Observe flows to/from a specific service (format: namespace/service-name)"
208
+ command: "hubble observe --service {{ service_name }} --last 100"
209
+ args:
210
+ - name: "service_name"
211
+ type: "string"
212
+ description: "Service name in format namespace/service-name or just service-name (defaults to 'default' namespace)"
213
+
214
+ - name: "hubble_observe_port"
215
+ description: "Observe flows on a specific port"
216
+ command: "hubble observe --port {{ port }} --last 100"
217
+ args:
218
+ - name: "port"
219
+ type: "integer"
220
+ description: "Port number to filter flows by (e.g., 8080, 443)"
221
+
222
+ # Flow Filtering and Analysis
223
+ - name: "hubble_observe_from_pod"
224
+ description: "Observe flows originating from a specific pod (namespace and pod name are passed as separate arguments)"
225
+ command: "hubble observe --from-pod {{ namespace }}/{{ pod_name }} --last 100"
226
+ args:
227
+ - name: "namespace"
228
+ type: "string"
229
+ description: "Kubernetes namespace where the source pod is located"
230
+ - name: "pod_name"
231
+ type: "string"
232
+ description: "Name of the source pod"
233
+
234
+ - name: "hubble_observe_to_pod"
235
+ description: "Observe flows destined to a specific pod (namespace and pod name are passed as separate arguments)"
236
+ command: "hubble observe --to-pod {{ namespace }}/{{ pod_name }} --last 100"
237
+ args:
238
+ - name: "namespace"
239
+ type: "string"
240
+ description: "Kubernetes namespace where the destination pod is located"
241
+ - name: "pod_name"
242
+ type: "string"
243
+ description: "Name of the destination pod"
244
+
245
+ - name: "hubble_observe_between_namespaces"
246
+ description: "Observe flows between two specific namespaces"
247
+ command: "hubble observe --from-namespace {{ src_namespace }} --to-namespace {{ dst_namespace }} --last 100"
248
+ args:
249
+ - name: "src_namespace"
250
+ type: "string"
251
+ description: "Source namespace to filter flows from"
252
+ - name: "dst_namespace"
253
+ type: "string"
254
+ description: "Destination namespace to filter flows to"
255
+
256
+ - name: "hubble_observe_json"
257
+ description: "Output flow observations in JSON format for detailed analysis"
258
+ command: "hubble observe --output json --last 100"
259
+
260
+ # Status and Metrics
261
+ - name: "hubble_status"
262
+ description: "Display Hubble server status and configuration"
263
+ command: "hubble status"
264
+
265
+ - name: "hubble_list_nodes"
266
+ description: "List nodes available for flow observation"
267
+ command: "hubble list nodes"
268
+
269
+ - name: "hubble_observe_flows_summary"
270
+ description: "Get a summary of recent network flows with basic statistics"
271
+ command: "hubble observe --last 100 --output compact"
272
+
273
+ # Security and Policy Analysis
274
+ - name: "hubble_observe_security_events"
275
+ description: "Observe security-related network events and policy violations"
276
+ command: "hubble observe --verdict DROPPED --last 100"
277
+
278
+ - name: "hubble_observe_policy_verdicts"
279
+ description: "Show policy verdict events (allows and denies)"
280
+ command: "hubble observe --type policy-verdict --last 100"
281
+
282
+ - name: "hubble_observe_l7_traffic"
283
+ description: "Show L7 (application-layer) traffic flows"
284
+ command: "hubble observe --type l7 --last 100"
@@ -3,7 +3,7 @@ from typing import Any, Optional, Tuple, Set
3
3
  from holmes.core.tools import (
4
4
  CallablePrerequisite,
5
5
  StructuredToolResult,
6
- ToolResultStatus,
6
+ StructuredToolResultStatus,
7
7
  ToolsetTag,
8
8
  )
9
9
  from holmes.plugins.toolsets.consts import (
@@ -74,7 +74,7 @@ class CoralogixLogsToolset(BasePodLoggingToolset):
74
74
  def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
75
75
  if not self.coralogix_config:
76
76
  return StructuredToolResult(
77
- status=ToolResultStatus.ERROR,
77
+ status=StructuredToolResultStatus.ERROR,
78
78
  error=f"The {self.name} toolset is not configured",
79
79
  params=params.model_dump(),
80
80
  )
@@ -102,7 +102,9 @@ class CoralogixLogsToolset(BasePodLoggingToolset):
102
102
 
103
103
  return StructuredToolResult(
104
104
  status=(
105
- ToolResultStatus.ERROR if logs_data.error else ToolResultStatus.SUCCESS
105
+ StructuredToolResultStatus.ERROR
106
+ if logs_data.error
107
+ else StructuredToolResultStatus.SUCCESS
106
108
  ),
107
109
  error=logs_data.error,
108
110
  data=data,