holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/robusta/robusta.py
@@ -1,23 +1,29 @@
- import os
-
  import logging
+ import os
+ from typing import Any, Dict, List, Optional
 
- from typing import Optional, Dict, Any, List
- from holmes.core.supabase_dal import SupabaseDal
+ from holmes.common.env_vars import load_bool
+ from holmes.core.supabase_dal import FindingType, SupabaseDal
  from holmes.core.tools import (
      StaticPrerequisite,
+     StructuredToolResult,
+     StructuredToolResultStatus,
      Tool,
+     ToolInvokeContext,
      ToolParameter,
      Toolset,
      ToolsetTag,
  )
- from holmes.core.tools import StructuredToolResult, ToolResultStatus
+
+ PULL_EXTERNAL_FINDINGS = load_bool("PULL_EXTERNAL_FINDINGS", False)
 
  PARAM_FINDING_ID = "id"
  START_TIME = "start_datetime"
  END_TIME = "end_datetime"
  NAMESPACE = "namespace"
  WORKLOAD = "workload"
+ DEFAULT_LIMIT_CHANGE_ROWS = 100
+ MAX_LIMIT_CHANGE_ROWS = 200
 
 
  class FetchRobustaFinding(Tool):
@@ -26,7 +32,7 @@ class FetchRobustaFinding(Tool):
      def __init__(self, dal: Optional[SupabaseDal]):
          super().__init__(
              name="fetch_finding_by_id",
-             description="Fetches a robusta finding. Findings are events, like a Prometheus alert or a deployment update",
+             description="Fetches a robusta finding. Findings are events, like a Prometheus alert or a deployment update and configuration change.",
              parameters={
                  PARAM_FINDING_ID: ToolParameter(
                      description="The id of the finding to fetch",
@@ -45,21 +51,19 @@ class FetchRobustaFinding(Tool):
              logging.error(error)
              return {"error": error}
 
-     def _invoke(
-         self, params: dict, user_approved: bool = False
-     ) -> StructuredToolResult:
+     def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
          finding_id = params[PARAM_FINDING_ID]
          try:
              finding = self._fetch_finding(finding_id)
              if finding:
                  return StructuredToolResult(
-                     status=ToolResultStatus.SUCCESS,
+                     status=StructuredToolResultStatus.SUCCESS,
                      data=finding,
                      params=params,
                  )
              else:
                  return StructuredToolResult(
-                     status=ToolResultStatus.NO_DATA,
+                     status=StructuredToolResultStatus.NO_DATA,
                      data=f"Could not find a finding with finding_id={finding_id}",
                      params=params,
                  )
@@ -70,13 +74,13 @@ class FetchRobustaFinding(Tool):
              )
 
          return StructuredToolResult(
-             status=ToolResultStatus.ERROR,
+             status=StructuredToolResultStatus.ERROR,
              data=f"There was an internal error while fetching finding {finding_id}",
              params=params,
          )
 
      def get_parameterized_one_liner(self, params: Dict) -> str:
-         return "Robusta: Fetch Alert Metadata"
+         return f"Robusta: Fetch finding data {params}"
 
 
  class FetchResourceRecommendation(Tool):
@@ -85,124 +89,285 @@ class FetchResourceRecommendation(Tool):
      def __init__(self, dal: Optional[SupabaseDal]):
          super().__init__(
              name="fetch_resource_recommendation",
-             description="Fetch workload recommendations for resources requests and limits. Returns the current configured resources, as well as recommendation based on actual historical usage.",
+             description=(
+                 "Fetch KRR (Kubernetes Resource Recommendations) for CPU and memory optimization. "
+                 "KRR provides AI-powered recommendations based on actual historical usage patterns for right-sizing workloads. "
+                 "Supports two usage modes: "
+                 "(1) Specific workload lookup - Use name_pattern with an exact name, namespace, and kind to get recommendations for a single workload. "
+                 "(2) Discovery mode - Use limit and sort_by to get a ranked list of top optimization opportunities. Optionally filter by namespace, name_pattern (wildcards supported), kind, or container. "
+                 "Returns current configured resources alongside recommended values. In discovery mode, results are sorted by potential savings."
+             ),
              parameters={
-                 "name": ToolParameter(
-                     description="The name of the kubernetes workload.",
+                 "limit": ToolParameter(
+                     description="Maximum number of recommendations to return (default: 10, max: 100).",
+                     type="integer",
+                     required=False,
+                 ),
+                 "sort_by": ToolParameter(
+                     description=(
+                         "Field to sort recommendations by potential savings. Options: "
+                         "'cpu_total' (default) - Total CPU savings (requests + limits), "
+                         "'memory_total' - Total memory savings (requests + limits), "
+                         "'cpu_requests' - CPU requests savings, "
+                         "'memory_requests' - Memory requests savings, "
+                         "'cpu_limits' - CPU limits savings, "
+                         "'memory_limits' - Memory limits savings, "
+                         "'priority' - Use scan priority field."
+                     ),
                      type="string",
-                     required=True,
+                     required=False,
                  ),
                  "namespace": ToolParameter(
-                     description="The namespace of the kubernetes resource.",
+                     description="Filter by Kubernetes namespace (exact match). Leave empty to search all namespaces.",
                      type="string",
-                     required=True,
+                     required=False,
+                 ),
+                 "name_pattern": ToolParameter(
+                     description=(
+                         "Filter by workload name pattern. Supports SQL LIKE patterns: "
+                         "Use '%' as wildcard (e.g., '%app%' matches any name containing 'app', "
+                         "'prod-%' matches names starting with 'prod-'). "
+                         "Leave empty to match all names."
+                     ),
+                     type="string",
+                     required=False,
                  ),
                  "kind": ToolParameter(
-                     description="The kind of the kubernetes resource. Must be one of: [Deployment, StatefulSet, DaemonSet, Job].",
+                     description=(
+                         "Filter by Kubernetes resource kind. "
+                         "Must be one of: Deployment, StatefulSet, DaemonSet, Job. "
+                         "Leave empty to include all kinds."
+                     ),
                      type="string",
-                     required=True,
+                     required=False,
+                 ),
+                 "container": ToolParameter(
+                     description="Filter by container name (exact match). Leave empty to include all containers.",
+                     type="string",
+                     required=False,
                  ),
              },
          )
          self._dal = dal
 
-     def _resource_recommendation(self, params: Dict) -> Optional[List[Dict]]:
+     def _fetch_recommendations(self, params: Dict) -> Optional[List[Dict]]:
          if self._dal and self._dal.enabled:
+             # Set default values
+             limit = min(params.get("limit", 10) or 10, 100)
+             sort_by = params.get("sort_by") or "cpu_total"
+
              return self._dal.get_resource_recommendation(
-                 name=params["name"],
-                 namespace=params["namespace"],
-                 kind=params["kind"],
+                 limit=limit,
+                 sort_by=sort_by,
+                 namespace=params.get("namespace"),
+                 name_pattern=params.get("name_pattern"),
+                 kind=params.get("kind"),
+                 container=params.get("container"),
              )
          return None
 
-     def _invoke(
-         self, params: dict, user_approved: bool = False
-     ) -> StructuredToolResult:
+     def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
          try:
-             recommendations = self._resource_recommendation(params)
+             recommendations = self._fetch_recommendations(params)
              if recommendations:
                  return StructuredToolResult(
-                     status=ToolResultStatus.SUCCESS,
+                     status=StructuredToolResultStatus.SUCCESS,
                      data=recommendations,
                      params=params,
                  )
              else:
                  return StructuredToolResult(
-                     status=ToolResultStatus.NO_DATA,
-                     data=f"Could not find recommendations for {params}",
+                     status=StructuredToolResultStatus.NO_DATA,
+                     data=f"Could not find any recommendations with filters: {params}",
                      params=params,
                  )
          except Exception as e:
-             msg = f"There was an internal error while fetching recommendations for {params}. {str(e)}"
+             msg = f"There was an error while fetching top recommendations for {params}. {str(e)}"
              logging.exception(msg)
              return StructuredToolResult(
-                 status=ToolResultStatus.ERROR,
-                 data=msg,
+                 status=StructuredToolResultStatus.ERROR,
+                 error=msg,
                  params=params,
              )
 
      def get_parameterized_one_liner(self, params: Dict) -> str:
-         return f"Robusta: Check Historical Resource Utilization: ({str(params)})"
+         return f"Robusta: Fetch KRR Recommendations ({str(params)})"
 
 
- class FetchConfigurationChanges(Tool):
+ class FetchConfigurationChangesMetadataBase(Tool):
      _dal: Optional[SupabaseDal]
 
-     def __init__(self, dal: Optional[SupabaseDal]):
+     def __init__(
+         self,
+         dal: Optional[SupabaseDal],
+         name: str,
+         description: str,
+         add_cluster_filter: bool = True,
+     ):
+         """
+         We need seperate tools for external and cluster configuration changes due to the different cluster parameters that are not on "external" changes like 'workload' and 'namespace'.
+         add_cluster_filter: adds the namespace and workload parameters for configuration changes tool.
+         """
+         parameters = {
+             START_TIME: ToolParameter(
+                 description="The starting time boundary for the search period. String in RFC3339 format.",
+                 type="string",
+                 required=True,
+             ),
+             END_TIME: ToolParameter(
+                 description="The ending time boundary for the search period. String in RFC3339 format.",
+                 type="string",
+                 required=True,
+             ),
+             "limit": ToolParameter(
+                 description=f"Maximum number of rows to return. Default is {DEFAULT_LIMIT_CHANGE_ROWS} and the maximum is 200",
+                 type="integer",
+                 required=False,
+             ),
+         }
+
+         if add_cluster_filter:
+             parameters.update(
+                 {
+                     "namespace": ToolParameter(
+                         description="The Kubernetes namespace name for filtering configuration changes",
+                         type="string",
+                         required=False,
+                     ),
+                     "workload": ToolParameter(
+                         description="Kubernetes resource name to filter configuration changes (e.g., Pod, Deployment, Job, etc.). Must be the full name. For Pods, include the exact generated suffix.",
+                         type="string",
+                         required=False,
+                     ),
+                 }
+             )
+
          super().__init__(
-             name="fetch_configuration_changes",
-             description="Fetch configuration changes in a given time range. By default, fetch all cluster changes. Can be filtered on a given namespace or a specific workload",
-             parameters={
-                 START_TIME: ToolParameter(
-                     description="The starting time boundary for the search period. String in RFC3339 format.",
-                     type="string",
-                     required=True,
-                 ),
-                 END_TIME: ToolParameter(
-                     description="The starting time boundary for the search period. String in RFC3339 format.",
-                     type="string",
-                     required=True,
-                 ),
-             },
+             name=name,
+             description=description,
+             parameters=parameters,
          )
          self._dal = dal
 
-     def _fetch_change_history(self, params: Dict) -> Optional[List[Dict]]:
+     def _fetch_issues(
+         self,
+         params: Dict,
+         cluster: Optional[str] = None,
+         finding_type: FindingType = FindingType.CONFIGURATION_CHANGE,
+     ) -> Optional[List[Dict]]:
          if self._dal and self._dal.enabled:
-             return self._dal.get_configuration_changes(
+             return self._dal.get_issues_metadata(
                  start_datetime=params["start_datetime"],
                  end_datetime=params["end_datetime"],
+                 limit=min(
+                     params.get("limit") or DEFAULT_LIMIT_CHANGE_ROWS,
+                     MAX_LIMIT_CHANGE_ROWS,
+                 ),
+                 ns=params.get("namespace"),
+                 workload=params.get("workload"),
+                 cluster=cluster,
+                 finding_type=finding_type,
              )
          return None
 
-     def _invoke(
-         self, params: dict, user_approved: bool = False
-     ) -> StructuredToolResult:
+     def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
          try:
-             changes = self._fetch_change_history(params)
+             changes = self._fetch_issues(params)
              if changes:
                  return StructuredToolResult(
-                     status=ToolResultStatus.SUCCESS,
+                     status=StructuredToolResultStatus.SUCCESS,
                      data=changes,
                      params=params,
                  )
              else:
                  return StructuredToolResult(
-                     status=ToolResultStatus.NO_DATA,
-                     data=f"Could not find changes for {params}",
+                     status=StructuredToolResultStatus.NO_DATA,
+                     data=f"{self.name} found no data. {params}",
                      params=params,
                  )
          except Exception as e:
              msg = f"There was an internal error while fetching changes for {params}. {str(e)}"
              logging.exception(msg)
              return StructuredToolResult(
-                 status=ToolResultStatus.ERROR,
+                 status=StructuredToolResultStatus.ERROR,
                  data=msg,
                  params=params,
              )
 
      def get_parameterized_one_liner(self, params: Dict) -> str:
-         return "Robusta: Search Change History"
+         return f"Robusta: Search Change History {params}"
+
+
+ class FetchConfigurationChangesMetadata(FetchConfigurationChangesMetadataBase):
+     def __init__(self, dal: Optional[SupabaseDal]):
+         super().__init__(
+             dal=dal,
+             name="fetch_configuration_changes_metadata",
+             description=(
+                 "Fetch configuration changes metadata in a given time range. "
+                 "By default, fetch all cluster changes. Can be filtered on a given namespace or a specific kubernetes resource. "
+                 "Use fetch_finding_by_id to get detailed change of one specific configuration change."
+             ),
+         )
+
+
+ class FetchExternalConfigurationChangesMetadata(FetchConfigurationChangesMetadataBase):
+     """
+     Fetch configuration changes from external sources, e.g., LaunchDarkly changes.
+     It needs to be a seperate tool due to the different cluster parameter used in the DAL method like workload and namespace.
+     """
+
+     def __init__(self, dal: Optional[SupabaseDal]):
+         super().__init__(
+             dal=dal,
+             name="fetch_external_configuration_changes_metadata",
+             description=(
+                 "Fetch external configuration changes metadata in a given time range. "
+                 "Fetches configuration changes from external sources. "
+                 "Use fetch_finding_by_id to get detailed change of one specific configuration change."
+             ),
+             add_cluster_filter=False,
+         )
+
+     def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
+         return super()._fetch_issues(params, cluster="external")
+
+     def get_parameterized_one_liner(self, params: Dict) -> str:
+         return f"Robusta: Search External Change History {params}"
+
+
+ class FetchResourceIssuesMetadata(FetchConfigurationChangesMetadataBase):
+     def __init__(self, dal: Optional[SupabaseDal]):
+         super().__init__(
+             dal=dal,
+             name="fetch_resource_issues_metadata",
+             description=(
+                 "Fetch issues and alert metadata in a given time range. "
+                 "Must be filtered on a given namespace and specific kubernetes resource, such as pod, deployment, job, etc. "
+                 "Use fetch_finding_by_id to get further information on a specific issue or alert."
+             ),
+             add_cluster_filter=False,
+         )
+         self.parameters.update(
+             {
+                 "namespace": ToolParameter(
+                     description="The Kubernetes namespace name for filtering issues and alerts",
+                     type="string",
+                     required=True,
+                 ),
+                 "workload": ToolParameter(
+                     description="Kubernetes resource name to filter issues and alerts (e.g., Pod, Deployment, Job, etc.). Must be the full name. For Pods, include the exact generated suffix.",
+                     type="string",
+                     required=True,
+                 ),
+             }
+         )
+
+     def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
+         return super()._fetch_issues(params, finding_type=FindingType.ISSUE)
+
+     def get_parameterized_one_liner(self, params: Dict) -> str:
+         return f"Robusta: fetch resource issues metadata {params}"
 
 
  class RobustaToolset(Toolset):
@@ -216,17 +381,23 @@ class RobustaToolset(Toolset):
              enabled=dal.enabled, disabled_reason="Data access layer is disabled"
          )
 
+         tools = [
+             FetchRobustaFinding(dal),
+             FetchConfigurationChangesMetadata(dal),
+             FetchResourceRecommendation(dal),
+             FetchResourceIssuesMetadata(dal),
+         ]
+
+         if PULL_EXTERNAL_FINDINGS:
+             tools.append(FetchExternalConfigurationChangesMetadata(dal))
+
          super().__init__(
              icon_url="https://cdn.prod.website-files.com/633e9bac8f71dfb7a8e4c9a6/646be7710db810b14133bdb5_logo.svg",
              description="Fetches alerts metadata and change history",
              docs_url="https://holmesgpt.dev/data-sources/builtin-toolsets/robusta/",
              name="robusta",
              prerequisites=[dal_prereq],
-             tools=[
-                 FetchRobustaFinding(dal),
-                 FetchConfigurationChanges(dal),
-                 FetchResourceRecommendation(dal),
-             ],
+             tools=tools,
              tags=[
                  ToolsetTag.CORE,
              ],
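
Note: the _invoke signature change in this file (a ToolInvokeContext argument replacing the old user_approved flag, and ToolResultStatus renamed to StructuredToolResultStatus) is the same pattern applied across the other toolset files listed above. Below is a minimal sketch of a custom tool written against this new API, using only the names visible in this diff; the tool itself is hypothetical, and since ToolInvokeContext's fields are not shown here the context argument is accepted but not inspected.

# Sketch only: a custom tool against the 0.18.x tool API shown in this diff.
# The tool name and behavior are invented for illustration.
from typing import Dict

from holmes.core.tools import (
    StructuredToolResult,
    StructuredToolResultStatus,
    Tool,
    ToolInvokeContext,
    ToolParameter,
)


class EchoTool(Tool):
    def __init__(self):
        super().__init__(
            name="echo_example",  # hypothetical tool, not part of the package
            description="Echoes its parameters back (illustration only).",
            parameters={
                "message": ToolParameter(
                    description="Text to echo back.",
                    type="string",
                    required=True,
                ),
            },
        )

    # 0.18.x signature: ToolInvokeContext replaces the old user_approved flag.
    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
        return StructuredToolResult(
            status=StructuredToolResultStatus.SUCCESS,
            data=params["message"],
            params=params,
        )

    def get_parameterized_one_liner(self, params: Dict) -> str:
        return f"Example: echo {params}"
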
holmes/plugins/toolsets/robusta/robusta_instructions.jinja2
@@ -1,7 +1,10 @@
  # Configuration and alerts history
- * Use fetch_configuration_changes to get historical configuration changes
- * You must ALWAYS call fetch_configuration_changes when investigating an alert
- * Never respond without calling fetch_configuration_changes
+ * Use fetch_configuration_changes_metadata to get historical configuration changes in a cluster or for a specific workload.
+ * If a change seems important to the investigation, Use fetch_finding_by_id with the configuration change ID to get full details of the change.
+ * You must ALWAYS call fetch_configuration_changes_metadata when investigating an alert
+ * Never respond without calling fetch_configuration_changes_metadata
+ * When investigating a resource (pod, deployment, or job), if no relevant information is available from the live cluster at the time of investigation, call the fetch_resource_issues_metadata function to retrieve its historical alert data.
+ * You can use fetch_resource_issues_metadata to get issues context for a specific kubernetes resource. Start with a 4 hours window and try to expand to 24 hours windows if nothing comes up.
  * When investigating an alert, look at historical configuration changes that happen 4 hours before the alert started
  * If you found a change that caused the alert, you MUST write: 'The issue was introduced by ...' with a short description of the change, and the date of it.
  For example:
@@ -11,12 +14,26 @@ For example:
  * Embed it in with the evidence id and a title describing the change. Use this format:
  << { "type": "diff", "evidence_id": "8a4d1369-0e98-4ff2-b180-699d5ff286ab", "title": "Image change on the DB workload" } >>
 
- # Resource and efficiency recommendations
- * Use fetch_resource_recommendation to get resource recommendations for a given kubernetes workload
- * Resource recommendations contains memory and cpu recommended request and limits for a given workload
- * When asked if a resource can be optimized, or if a resources is over utilized, use the fetch_resource_recommendation tool to answer
- * Right sizing of resources is a key to avoiding performance issues
- * Right sizing of resouces can also lead to cost savings
+ # Resource and efficiency recommendations (KRR)
+ * KRR (Kubernetes Resource Recommendations) provides AI-powered recommendations for right-sizing CPU and memory requests/limits
+ * Use fetch_resource_recommendation for all KRR queries - it supports two modes:
+   - **Discovery mode**: Get a ranked list of top optimization opportunities across multiple workloads
+     - Use limit and sort_by parameters to control ranking (CPU savings, memory savings, or priority)
+     - Supports filtering by namespace, name_pattern (with wildcards like '%app%'), kind, and container
+     - Returns up to 100 recommendations sorted by potential impact
+     - Use this for questions like "top recommendations", "cost savings opportunities", "what to optimize"
+   - **Specific lookup mode**: Get recommendations for a single known workload
+     - Use name_pattern with exact workload name, along with namespace and kind
+     - Best for focused analysis when you already know which workload to investigate
+ * When asked if a resource can be optimized, or if resources are over-utilized, use fetch_resource_recommendation to answer
+ * When asked about "GPU workloads" or filtering out GPU-based resources, you can use filters like name_pattern or namespace to exclude them
+ * Right-sizing of resources is key to avoiding performance issues and achieving cost savings
+ * Examples of questions that use fetch_resource_recommendation:
+   - "Show me top CPU recommendations" → Use limit=10, sort_by='cpu_total'
+   - "What are the biggest memory optimization opportunities?" → Use limit=10, sort_by='memory_total'
+   - "Show me top KRR recommendations for non-GPU workloads" → Use name_pattern filter or namespace filter
+   - "Find workloads in namespace X that can save the most CPU" → Use namespace='X', sort_by='cpu_total'
+   - "Get recommendations for deployment nginx in namespace prod" → Use name_pattern='nginx', namespace='prod', kind='Deployment'
 
  # Investigating issues
  * If provided an issue id (a.k.a. a finding), use `fetch_finding_by_id` to get more information about that issue
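
The two fetch_resource_recommendation usage modes described in these instructions correspond to parameter sets like the following sketch. Parameter names come from the tool definition earlier in this diff; the namespace and workload values are made up for illustration.

# Illustrative parameter sets for fetch_resource_recommendation.
# Discovery mode: rank the biggest memory-saving opportunities in one namespace.
discovery_params = {
    "limit": 10,                # the tool caps this at 100
    "sort_by": "memory_total",  # one of the sort fields listed in the tool description
    "namespace": "prod",        # hypothetical namespace
    "name_pattern": "%api%",    # SQL LIKE pattern; '%' is the wildcard
}

# Specific workload lookup: a single known Deployment.
lookup_params = {
    "name_pattern": "nginx",    # exact name, no wildcard
    "namespace": "prod",
    "kind": "Deployment",
}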