holmesgpt 0.13.3a0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (86)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +15 -4
  3. holmes/common/env_vars.py +8 -1
  4. holmes/config.py +66 -139
  5. holmes/core/investigation.py +1 -2
  6. holmes/core/llm.py +295 -52
  7. holmes/core/models.py +2 -0
  8. holmes/core/safeguards.py +4 -4
  9. holmes/core/supabase_dal.py +14 -8
  10. holmes/core/tool_calling_llm.py +202 -177
  11. holmes/core/tools.py +260 -25
  12. holmes/core/tools_utils/data_types.py +81 -0
  13. holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
  14. holmes/core/tools_utils/tool_executor.py +2 -2
  15. holmes/core/toolset_manager.py +150 -3
  16. holmes/core/tracing.py +6 -1
  17. holmes/core/transformers/__init__.py +23 -0
  18. holmes/core/transformers/base.py +62 -0
  19. holmes/core/transformers/llm_summarize.py +174 -0
  20. holmes/core/transformers/registry.py +122 -0
  21. holmes/core/transformers/transformer.py +31 -0
  22. holmes/main.py +5 -0
  23. holmes/plugins/prompts/_fetch_logs.jinja2 +10 -1
  24. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  25. holmes/plugins/toolsets/aks.yaml +64 -0
  26. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
  27. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
  28. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
  29. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
  30. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  31. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
  32. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
  33. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
  34. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
  35. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
  36. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
  37. holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
  38. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  39. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  40. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  41. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  42. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +345 -207
  43. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  44. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +96 -32
  45. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +10 -10
  46. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +21 -22
  47. holmes/plugins/toolsets/git.py +22 -22
  48. holmes/plugins/toolsets/grafana/common.py +14 -2
  49. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +473 -0
  50. holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
  51. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +5 -4
  52. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  53. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +662 -290
  54. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  55. holmes/plugins/toolsets/internet/internet.py +3 -3
  56. holmes/plugins/toolsets/internet/notion.py +3 -3
  57. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  58. holmes/plugins/toolsets/kafka.py +18 -18
  59. holmes/plugins/toolsets/kubernetes.yaml +58 -0
  60. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  61. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  62. holmes/plugins/toolsets/logging_utils/logging_api.py +1 -1
  63. holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
  64. holmes/plugins/toolsets/newrelic.py +8 -8
  65. holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
  66. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  67. holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
  68. holmes/plugins/toolsets/prometheus/prometheus.py +841 -351
  69. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +39 -2
  70. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  71. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
  72. holmes/plugins/toolsets/robusta/robusta.py +10 -10
  73. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
  74. holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
  75. holmes/plugins/toolsets/utils.py +88 -0
  76. holmes/utils/config_utils.py +91 -0
  77. holmes/utils/env.py +7 -0
  78. holmes/utils/holmes_status.py +2 -1
  79. holmes/utils/sentry_helper.py +41 -0
  80. holmes/utils/stream.py +9 -0
  81. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/METADATA +11 -15
  82. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/RECORD +85 -75
  83. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  84. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/LICENSE.txt +0 -0
  85. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/WHEEL +0 -0
  86. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 CHANGED
@@ -1,6 +1,18 @@
 
 # Prometheus/PromQL queries
-* ALWAYS call list_prometheus_rules to get the alert definition
+
+## Efficient Metric Discovery (when needed)
+* When you need to discover metrics, use `get_metric_names` with filters - it's the fastest method
+* Combine multiple patterns with regex OR (|) to reduce API calls:
+  - `{__name__=~"node_cpu.*|node_memory.*|node_disk.*"}` - get all node resource metrics in one call
+  - `{__name__=~"container.*|pod.*|kube.*"}` - get all Kubernetes-related metrics
+  - `{namespace=~"default|kube-system|monitoring"}` - metrics from multiple namespaces
+* Use `get_metric_metadata` after discovering names to get types/descriptions if needed
+* Use `get_label_values` to discover pods, namespaces, jobs: e.g., get_label_values(label="pod")
+* Only use `get_series` when you need full label sets (slower than other methods)
+
+## Alert Investigation & Query Execution
+* When investigating a Prometheus alert, ALWAYS call list_prometheus_rules to get the alert definition
 * Use Prometheus to query metrics from the alert promql
 * Use prometheus to execute promql queries with the tools `execute_prometheus_instant_query` and `execute_prometheus_range_query`
 * To create queries, use 'start_timestamp' and 'end_timestamp' as graphs start and end times
@@ -16,9 +28,34 @@
 ** Avoid global averages like `sum(rate(<metric>_sum)) / sum(rate(<metric>_count))` because it hides data and is not generally informative
 * Timestamps MUST be in string date format. For example: '2025-03-15 10:10:08.610862+00:00'
 * Post processing will parse your response, re-run the query from the tool output and create a chart visible to the user
-* Only generate and execute a prometheus query after checking what metrics are available with the `list_available_metrics` tool
+* When unsure about available metrics, use `get_metric_names` with appropriate filters (combine multiple patterns with | for efficiency). Then use `get_metric_metadata` if you need descriptions/types
 * Check that any node, service, pod, container, app, namespace, etc. mentioned in the query exist in the kubernetes cluster before making a query. Use any appropriate kubectl tool(s) for this
 * The toolcall will return no data to you. That is expected. You MUST however ensure that the query is successful.
+
+## Handling High-Cardinality Metrics
+* CRITICAL: When querying metrics that may return many time series (>10), ALWAYS use aggregation to limit results
+* ALWAYS use `topk()` or `bottomk()` to limit the number of series returned
+* Standard pattern for high-cardinality queries:
+  - Use `topk(5, <your_query>)` to get the top 5 series
+  - Example: `topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))`
+  - This prevents context overflow and focuses on the most relevant data
+* To also capture the aggregate of remaining series as "other":
+  ```
+  topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))
+  or
+  label_replace(
+    (sum(rate(container_cpu_usage_seconds_total{namespace="default"}[5m])) - sum(topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m])))),
+    "pod", "other", "", ""
+  )
+  ```
+* Common high-cardinality scenarios requiring topk():
+  - Pod-level metrics in namespaces with many pods
+  - Container-level CPU/memory metrics
+  - HTTP metrics with many endpoints or status codes
+  - Any query returning more than 10 time series
+* For initial exploration, use instant queries with `count()` to check cardinality:
+  - Example: `count(count by (pod) (container_cpu_usage_seconds_total{namespace="default"}))`
+  - If count > 10, use topk() in your range query
 * When doing queries, always extend the time range, to 15 min before and after the alert start time
 * ALWAYS embed the execution results into your answer
 * ALWAYS embed a Prometheus graph in the response. The graph should visualize data related to the incident.
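
To make the cardinality check concrete, here is a minimal Python sketch (an editorial illustration, not part of the diff) that runs the recommended `count()` instant query against the standard Prometheus HTTP API (`/api/v1/query`) and switches to `topk(5, ...)` when more than 10 series come back. The server URL and the 10-series threshold are illustrative assumptions.

```python
import requests

PROMETHEUS_URL = "http://localhost:9090"  # hypothetical server address

def series_cardinality(query: str) -> int:
    """Run an instant query and return the scalar it yields (0 if no data)."""
    resp = requests.get(
        f"{PROMETHEUS_URL}/api/v1/query", params={"query": query}, timeout=30
    )
    resp.raise_for_status()
    result = resp.json()["data"]["result"]
    # count(count by (pod) (...)) yields a single sample; value is [ts, "<count>"]
    return int(float(result[0]["value"][1])) if result else 0

selector = 'container_cpu_usage_seconds_total{namespace="default"}'
cardinality = series_cardinality(f"count(count by (pod) ({selector}))")

query = f"rate({selector}[5m])"
if cardinality > 10:
    query = f"topk(5, {query})"  # cap the series count, as the guidance above advises
```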
holmes/plugins/toolsets/prometheus/utils.py CHANGED
@@ -0,0 +1,28 @@
+import re
+from typing import Optional, Union
+
+
+def parse_duration_to_seconds(v: Optional[Union[str, float, int]]) -> Optional[float]:
+    if v is None:
+        return None
+    if isinstance(v, (int, float)):
+        return float(v)
+    s = v.strip().lower()
+    if s.isdigit():
+        return float(int(s))
+
+    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
+
+    # Check for partial time formats (e.g., 1h30m, 5m12s, 1d2h30m)
+    pattern = r"(\d+(?:\.\d+)?)(d|h|m|s)"
+    matches = re.findall(pattern, s)
+
+    if matches:
+        total_seconds = 0.0
+        for value_str, unit in matches:
+            value = float(value_str)
+            total_seconds += value * units[unit]
+        return float(int(total_seconds))
+
+    # fallback: try float seconds
+    return float(s)
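
A few illustrative calls (not part of the diff) showing how the new parser behaves; note that unit-suffixed inputs are truncated to whole seconds by the `float(int(...))` on the accumulated total:

```python
from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds

parse_duration_to_seconds(None)       # None
parse_duration_to_seconds(90)         # 90.0 (numbers pass through)
parse_duration_to_seconds("300")      # 300.0 (bare digits are seconds)
parse_duration_to_seconds("1h30m")    # 5400.0
parse_duration_to_seconds("1d2h30m")  # 95400.0
parse_duration_to_seconds("1.5h")     # 5400.0 (fractions allowed, then truncated)
parse_duration_to_seconds("2.5")      # 2.5 (no unit suffix: fallback float parse)
```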
holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py CHANGED
@@ -8,7 +8,7 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-    ToolResultStatus,
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
@@ -79,7 +79,7 @@ class ListConfiguredClusters(BaseRabbitMQTool):
             if c.connection_status == ClusterConnectionStatus.SUCCESS
         ]
         return StructuredToolResult(
-            status=ToolResultStatus.SUCCESS, data=available_clusters
+            status=StructuredToolResultStatus.SUCCESS, data=available_clusters
         )
 
     def get_parameterized_one_liner(self, params) -> str:
@@ -112,12 +112,14 @@ class GetRabbitMQClusterStatus(BaseRabbitMQTool):
                 cluster_id=params.get("cluster_id")
             )
             result = get_cluster_status(cluster_config)
-            return StructuredToolResult(status=ToolResultStatus.SUCCESS, data=result)
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS, data=result
+            )
 
         except Exception as e:
             logging.info("Failed to process RabbitMQ cluster status", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error fetching RabbitMQ cluster status: {str(e)}",
                 data=None,
             )
holmes/plugins/toolsets/robusta/robusta.py CHANGED
@@ -11,7 +11,7 @@ from holmes.core.tools import (
     Toolset,
     ToolsetTag,
 )
-from holmes.core.tools import StructuredToolResult, ToolResultStatus
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
 
 PARAM_FINDING_ID = "id"
 START_TIME = "start_datetime"
@@ -53,13 +53,13 @@ class FetchRobustaFinding(Tool):
             finding = self._fetch_finding(finding_id)
             if finding:
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=finding,
                     params=params,
                 )
             else:
                 return StructuredToolResult(
-                    status=ToolResultStatus.NO_DATA,
+                    status=StructuredToolResultStatus.NO_DATA,
                     data=f"Could not find a finding with finding_id={finding_id}",
                     params=params,
                 )
@@ -70,7 +70,7 @@ class FetchRobustaFinding(Tool):
             )
 
         return StructuredToolResult(
-            status=ToolResultStatus.ERROR,
+            status=StructuredToolResultStatus.ERROR,
            data=f"There was an internal error while fetching finding {finding_id}",
             params=params,
         )
@@ -122,13 +122,13 @@ class FetchResourceRecommendation(Tool):
             recommendations = self._resource_recommendation(params)
             if recommendations:
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=recommendations,
                     params=params,
                 )
             else:
                 return StructuredToolResult(
-                    status=ToolResultStatus.NO_DATA,
+                    status=StructuredToolResultStatus.NO_DATA,
                     data=f"Could not find recommendations for {params}",
                     params=params,
                 )
@@ -136,7 +136,7 @@ class FetchResourceRecommendation(Tool):
             msg = f"There was an internal error while fetching recommendations for {params}. {str(e)}"
             logging.exception(msg)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 data=msg,
                 params=params,
             )
@@ -182,13 +182,13 @@ class FetchConfigurationChanges(Tool):
             changes = self._fetch_change_history(params)
             if changes:
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=changes,
                     params=params,
                 )
             else:
                 return StructuredToolResult(
-                    status=ToolResultStatus.NO_DATA,
+                    status=StructuredToolResultStatus.NO_DATA,
                     data=f"Could not find changes for {params}",
                     params=params,
                 )
@@ -196,7 +196,7 @@ class FetchConfigurationChanges(Tool):
             msg = f"There was an internal error while fetching changes for {params}. {str(e)}"
             logging.exception(msg)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 data=msg,
                 params=params,
             )
holmes/plugins/toolsets/runbook/runbook_fetcher.py CHANGED
@@ -6,7 +6,7 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-    ToolResultStatus,
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
@@ -52,7 +52,7 @@ class RunbookFetcher(Tool):
             )
             logging.error(err_msg)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=err_msg,
                 params=params,
             )
@@ -96,7 +96,7 @@ class RunbookFetcher(Tool):
             </example>
             """)
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=wrapped_content,
                 params=params,
             )
@@ -104,7 +104,7 @@ class RunbookFetcher(Tool):
             err_msg = f"Failed to read runbook {runbook_path}: {str(e)}"
             logging.error(err_msg)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=err_msg,
                 params=params,
             )
holmes/plugins/toolsets/servicenow/servicenow.py CHANGED
@@ -11,7 +11,7 @@ from holmes.core.tools import (
 )
 
 from pydantic import BaseModel, PrivateAttr
-from holmes.core.tools import StructuredToolResult, ToolResultStatus
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
 from holmes.plugins.toolsets.utils import (
     process_timestamps_to_rfc3339,
     standard_start_datetime_tool_param_description,
@@ -86,9 +86,9 @@ class ServiceNowBaseTool(Tool):
         response.raise_for_status()
         res = response.json()
         return StructuredToolResult(
-            status=ToolResultStatus.SUCCESS
+            status=StructuredToolResultStatus.SUCCESS
             if res.get(field, [])
-            else ToolResultStatus.NO_DATA,
+            else StructuredToolResultStatus.NO_DATA,
             data=res,
             params=params,
         )
@@ -139,7 +139,7 @@ class ReturnChangesInTimerange(ServiceNowBaseTool):
         except Exception as e:
             logging.exception(self.get_parameterized_one_liner(params))
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 data=f"Exception {self.name}: {str(e)}",
                 params=params,
             )
@@ -173,7 +173,7 @@ class ReturnChange(ServiceNowBaseTool):
         except Exception as e:
             logging.exception(self.get_parameterized_one_liner(params))
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 data=f"Exception {self.name}: {str(e)}",
                 params=params,
             )
@@ -213,7 +213,7 @@ class ReturnChangesWithKeyword(ServiceNowBaseTool):
         except Exception as e:
             logging.exception(self.get_parameterized_one_liner(params))
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 data=f"Exception {self.name}: {str(e)}",
                 params=params,
             )
holmes/plugins/toolsets/utils.py CHANGED
@@ -1,5 +1,7 @@
 import datetime
+import math
 import time
+import re
 from typing import Dict, Optional, Tuple, Union
 
 from dateutil import parser
@@ -134,6 +136,92 @@ def process_timestamps_to_int(
     return (start, end)  # type: ignore
 
 
+def seconds_to_duration_string(seconds: int) -> str:
+    """Convert seconds into a compact duration string like '2h30m15s'.
+    Values under one minute render as seconds only (e.g. '45s').
+    """
+    if seconds < 0:
+        raise ValueError("seconds must be non-negative")
+
+    parts = []
+    weeks, seconds = divmod(seconds, 7 * 24 * 3600)
+    days, seconds = divmod(seconds, 24 * 3600)
+    hours, seconds = divmod(seconds, 3600)
+    minutes, seconds = divmod(seconds, 60)
+
+    if weeks:
+        parts.append(f"{weeks}w")
+    if days:
+        parts.append(f"{days}d")
+    if hours:
+        parts.append(f"{hours}h")
+    if minutes:
+        parts.append(f"{minutes}m")
+    if seconds or not parts:
+        parts.append(f"{seconds}s")
+
+    return "".join(parts)
+
+
+def duration_string_to_seconds(duration_string: str) -> int:
+    """Convert a duration string like '2h30m15s' or '300' into total seconds.
+    A bare integer string is treated as seconds.
+    """
+    if not duration_string:
+        raise ValueError("duration_string cannot be empty")
+
+    # Pure number? Assume seconds
+    if duration_string.isdigit():
+        return int(duration_string)
+
+    pattern = re.compile(r"(?P<value>\d+)(?P<unit>[wdhms])")
+    matches = pattern.findall(duration_string)
+    if not matches:
+        raise ValueError(f"Invalid duration string: {duration_string}")
+
+    unit_multipliers = {
+        "w": 7 * 24 * 3600,
+        "d": 24 * 3600,
+        "h": 3600,
+        "m": 60,
+        "s": 1,
+    }
+
+    total_seconds = 0
+    for value, unit in matches:
+        if unit not in unit_multipliers:
+            raise ValueError(f"Unknown unit: {unit}")
+        total_seconds += int(value) * unit_multipliers[unit]
+
+    return total_seconds
+
+
+def adjust_step_for_max_points(
+    time_range_seconds: int,
+    max_points: int,
+    step: Optional[int] = None,
+) -> int:
+    """
+    Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
+
+    Args:
+        time_range_seconds: The time range in seconds
+        max_points: The requested maximum number of data points
+        step: The requested step duration in seconds
+
+    Returns:
+        Adjusted step value in seconds that ensures points <= max_points
+    """
+    smallest_allowed_step = int(
+        math.ceil(float(time_range_seconds) / float(max_points))
+    )
+
+    if not step:
+        return smallest_allowed_step
+
+    return max(smallest_allowed_step, step)
+
+
 def get_param_or_raise(dict: Dict, param: str) -> str:
     value = dict.get(param)
     if not value:
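
As a quick illustration (not part of the diff) of how the three new helpers compose; the results follow directly from the arithmetic in the code above:

```python
from holmes.plugins.toolsets.utils import (
    adjust_step_for_max_points,
    duration_string_to_seconds,
    seconds_to_duration_string,
)

seconds_to_duration_string(9015)        # '2h30m15s'
duration_string_to_seconds("2h30m15s")  # 9015 (round-trips with the line above)
duration_string_to_seconds("300")       # 300 (a bare integer string is seconds)

# A 6-hour range capped at 300 points needs a step of at least
# ceil(21600 / 300) = 72s, so a requested 60s step is widened to 72:
adjust_step_for_max_points(time_range_seconds=6 * 3600, max_points=300, step=60)
```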
holmes/utils/config_utils.py CHANGED
@@ -0,0 +1,91 @@
+"""
+Configuration utility functions for HolmesGPT.
+"""
+
+from typing import List, Optional, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from holmes.core.transformers import Transformer
+
+
+def merge_transformers(
+    base_transformers: Optional[List["Transformer"]],
+    override_transformers: Optional[List["Transformer"]],
+    only_merge_when_override_exists: bool = False,
+) -> Optional[List["Transformer"]]:
+    """
+    Merge transformer configurations with intelligent field-level merging.
+
+    Logic:
+    - Override transformers take precedence for existing fields
+    - Base transformers provide missing fields
+    - Merge at transformer-type level (e.g., "llm_summarize")
+
+    Args:
+        base_transformers: Base transformer configurations (e.g., global transformers)
+        override_transformers: Override transformer configurations (e.g., toolset transformers)
+        only_merge_when_override_exists: If True, only merge when override_transformers exist.
+
+    Returns:
+        Merged transformer configuration list or None if both inputs are None/empty
+    """
+    if not base_transformers and not override_transformers:
+        return None
+    if not base_transformers:
+        return override_transformers
+    if not override_transformers:
+        if only_merge_when_override_exists:
+            return None  # Don't apply base transformers if override doesn't exist
+        else:
+            return base_transformers  # Original behavior: return base transformers
+
+    # Convert lists to dicts keyed by transformer name for easier merging
+    base_dict = {}
+    for transformer in base_transformers:
+        base_dict[transformer.name] = transformer
+
+    override_dict = {}
+    for transformer in override_transformers:
+        override_dict[transformer.name] = transformer
+
+    # Merge configurations at field level
+    merged_transformers = []
+
+    # Start with all base transformer types
+    for transformer_name, base_transformer in base_dict.items():
+        if transformer_name in override_dict:
+            # Merge fields: override takes precedence, base provides missing fields
+            override_transformer = override_dict[transformer_name]
+            merged_config = dict(base_transformer.config)  # Start with base
+            merged_config.update(
+                override_transformer.config
+            )  # Override with specific fields
+
+            # IMPORTANT: Preserve global_fast_model from both base and override
+            # This ensures our injected global_fast_model settings aren't lost during merging
+            if "global_fast_model" in base_transformer.config:
+                merged_config["global_fast_model"] = base_transformer.config[
+                    "global_fast_model"
+                ]
+            if "global_fast_model" in override_transformer.config:
+                merged_config["global_fast_model"] = override_transformer.config[
+                    "global_fast_model"
+                ]
+
+            # Create new transformer with merged config
+            from holmes.core.transformers import Transformer
+
+            merged_transformer = Transformer(
+                name=transformer_name, config=merged_config
+            )
+            merged_transformers.append(merged_transformer)
+        else:
+            # No override, use base transformer as-is
+            merged_transformers.append(base_transformer)
+
+    # Add any override-only transformer types
+    for transformer_name, override_transformer in override_dict.items():
+        if transformer_name not in base_dict:
+            merged_transformers.append(override_transformer)
+
+    return merged_transformers
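
A sketch of the merge semantics (not part of the diff), assuming `Transformer` exposes the `name` and `config` attributes used above; the config keys `input_threshold` and `fast_model` are hypothetical examples rather than a documented schema:

```python
from holmes.core.transformers import Transformer
from holmes.utils.config_utils import merge_transformers

# Global default: summarize large tool outputs with a fast model (keys are hypothetical).
base = [Transformer(name="llm_summarize",
                    config={"input_threshold": 5000, "fast_model": "gpt-4o-mini"})]
# Toolset-level override: a lower threshold for one toolset only.
override = [Transformer(name="llm_summarize", config={"input_threshold": 1000})]

merged = merge_transformers(base, override)
# merged[0].config == {"input_threshold": 1000, "fast_model": "gpt-4o-mini"}
# i.e. the override wins field-by-field while the base fills in missing fields.
```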
holmes/utils/env.py CHANGED
@@ -6,6 +6,13 @@ from typing import Any, Optional
 from pydantic import SecretStr
 
 
+def environ_get_safe_int(env_var: str, default: str = "0") -> int:
+    try:
+        return max(int(os.environ.get(env_var, default)), 0)
+    except ValueError:
+        return int(default)
+
+
 def get_env_replacement(value: str) -> Optional[str]:
     env_patterns = re.findall(r"{{\s*env\.([^}]*)\s*}}", value)
 
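
For illustration (the environment variable name below is hypothetical), the clamping and fallback behavior works out as follows:

```python
import os

from holmes.utils.env import environ_get_safe_int

os.environ["HOLMES_MAX_TOKENS"] = "4096"
environ_get_safe_int("HOLMES_MAX_TOKENS")         # 4096
os.environ["HOLMES_MAX_TOKENS"] = "-5"
environ_get_safe_int("HOLMES_MAX_TOKENS")         # 0 (negatives clamp to zero)
os.environ["HOLMES_MAX_TOKENS"] = "not-a-number"
environ_get_safe_int("HOLMES_MAX_TOKENS", "100")  # 100 (ValueError falls back to default)
```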
holmes/utils/holmes_status.py CHANGED
@@ -1,3 +1,4 @@
+import json
 from holmes.core.supabase_dal import SupabaseDal
 from holmes.config import Config
 from holmes import get_version  # type: ignore
@@ -16,7 +17,7 @@ def update_holmes_status_in_db(dal: SupabaseDal, config: Config):
     dal.upsert_holmes_status(
         {
             "cluster_id": config.cluster_name,
-            "model": config.get_models_list(),
+            "model": json.dumps(config.get_models_list()),
             "version": get_version(),
         }
     )
holmes/utils/sentry_helper.py CHANGED
@@ -0,0 +1,41 @@
+import sentry_sdk
+from holmes.core.tools_utils.data_types import ToolCallResult, TruncationMetadata
+
+
+def capture_tool_truncations(truncations: list[TruncationMetadata]):
+    for truncation in truncations:
+        _capture_tool_truncation(truncation)
+
+
+def _capture_tool_truncation(truncation: TruncationMetadata):
+    sentry_sdk.capture_message(
+        f"Tool {truncation.tool_name} was truncated",
+        level="warning",
+        tags={
+            "tool_name": truncation.tool_name,
+            "tool_original_token_count": truncation.original_token_count,
+            "tool_new_token_count": truncation.end_index,
+        },
+    )
+
+
+def capture_toolcall_contains_too_many_tokens(
+    tool_call_result: ToolCallResult, token_count: int, max_allowed_token_count: int
+):
+    sentry_sdk.capture_message(
+        f"Tool call {tool_call_result.tool_name} contains too many tokens",
+        level="warning",
+        tags={
+            "tool_name": tool_call_result.tool_name,
+            "tool_original_token_count": token_count,
+            "tool_max_allowed_token_count": max_allowed_token_count,
+            "tool_description": tool_call_result.description,
+        },
+    )
+
+
+def capture_structured_output_incorrect_tool_call():
+    sentry_sdk.capture_message(
+        "Structured output incorrect tool call",
+        level="warning",
+    )
holmes/utils/stream.py CHANGED
@@ -5,6 +5,7 @@ import litellm
 from pydantic import BaseModel, Field
 from holmes.core.investigation_structured_output import process_response_into_sections
 from functools import partial
+import logging
 
 
 class StreamEvents(str, Enum):
@@ -61,6 +62,7 @@ def stream_investigate_formatter(
                 "sections": sections or {},
                 "analysis": text_response,
                 "instructions": runbooks or [],
+                "metadata": message.data.get("metadata") or {},
             },
         )
     else:
@@ -82,9 +84,16 @@ def stream_chat_formatter(
                     "analysis": message.data.get("content"),
                     "conversation_history": message.data.get("messages"),
                     "follow_up_actions": followups,
+                    "metadata": message.data.get("metadata") or {},
                 },
             )
         else:
             yield create_sse_message(message.event.value, message.data)
     except litellm.exceptions.RateLimitError as e:
         yield create_rate_limit_error_message(str(e))
+    except Exception as e:
+        logging.error(e)
+        if "Model is getting throttled" in str(e):  # happens for bedrock
+            yield create_rate_limit_error_message(str(e))
+        else:
+            yield create_sse_error_message(description=str(e), error_code=1, msg=str(e))
{holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: holmesgpt
-Version: 0.13.3a0
+Version: 0.14.1
 Summary:
 Author: Natan Yellin
 Author-email: natan@robusta.dev
@@ -8,7 +8,6 @@ Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
-Requires-Dist: aiohttp (>=3.10.2,<4.0.0)
 Requires-Dist: azure-core (>=1.34.0,<2.0.0)
 Requires-Dist: azure-identity (>=1.23.0,<2.0.0)
 Requires-Dist: azure-mgmt-alertsmanagement (>=1.0.0,<2.0.0)
@@ -24,41 +23,30 @@ Requires-Dist: certifi (>=2024.7.4,<2025.0.0)
 Requires-Dist: colorlog (>=6.8.2,<7.0.0)
 Requires-Dist: confluent-kafka (>=2.6.1,<3.0.0)
 Requires-Dist: fastapi (>=0.116,<0.117)
-Requires-Dist: google-api-python-client (>=2.156.0,<3.0.0)
 Requires-Dist: humanize (>=4.9.0,<5.0.0)
 Requires-Dist: jinja2 (>=3.1.2,<4.0.0)
 Requires-Dist: kubernetes (>=32.0.1,<33.0.0)
-Requires-Dist: litellm (>=1.75.4,<2.0.0)
+Requires-Dist: litellm (==1.77.1)
 Requires-Dist: markdown (>=3.6,<4.0)
 Requires-Dist: markdownify (>=1.1.0,<2.0.0)
 Requires-Dist: mcp (==v1.12.2)
 Requires-Dist: openai (>=1.6.1,<1.100.0)
 Requires-Dist: opensearch-py (>=2.8.0,<3.0.0)
 Requires-Dist: postgrest (==0.16.8)
-Requires-Dist: prometrix (==0.2.3)
+Requires-Dist: prometrix (==0.2.5)
 Requires-Dist: prompt-toolkit (>=3.0.51,<4.0.0)
-Requires-Dist: protobuf (>=6.31.1)
 Requires-Dist: pydantic (>=2.7,<3.0)
-Requires-Dist: pydantic-settings (>=2.1.0,<3.0.0)
-Requires-Dist: pydash (>=8.0.1,<9.0.0)
 Requires-Dist: pygments (>=2.18.0,<3.0.0)
 Requires-Dist: pyodbc (>=5.0.1,<6.0.0)
-Requires-Dist: pytest-shared-session-scope (>=0.4.0,<0.5.0)
 Requires-Dist: python-benedict (>=0.33.1,<0.34.0)
-Requires-Dist: python_multipart (>=0.0.18,<0.0.19)
-Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
 Requires-Dist: requests (>=2.32.4,<3.0.0)
 Requires-Dist: requests-aws4auth (>=1.3.1,<2.0.0)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: sentry-sdk[fastapi] (>=2.20.0,<3.0.0)
-Requires-Dist: setuptools (>=80.9.0,<81.0.0)
-Requires-Dist: slack-bolt (>=1.18.1,<2.0.0)
-Requires-Dist: starlette (==0.47.2)
 Requires-Dist: strenum (>=0.4.15,<0.5.0)
 Requires-Dist: supabase (>=2.5,<3.0)
 Requires-Dist: tenacity (>=9.1.2,<10.0.0)
 Requires-Dist: typer (>=0.15.4,<0.16.0)
-Requires-Dist: urllib3 (>=1.26.19,<2.0.0)
 Requires-Dist: uvicorn (>=0.30,<0.31)
 Description-Content-Type: text/markdown
 
@@ -223,6 +211,14 @@ You can save common settings and API Keys in a config file to avoid passing them
 You can save common settings and API keys in a config file for re-use. Place the config file in <code>~/.holmes/config.yaml</code> or pass it using the <code>--config</code> flag.
 
 You can view an example config file with all available settings [here](config.example.yaml).
+
+### Tool Output Transformers
+
+HolmesGPT supports **transformers** to process large tool outputs before sending them to your primary LLM. This feature helps manage context window limits while preserving essential information.
+
+The most common transformer is `llm_summarize`, which uses a fast secondary model to summarize lengthy outputs from tools like `kubectl describe`, log queries, or metrics collection.
+
+📖 **Learn more**: [Tool Output Transformers Documentation](docs/transformers.md)
 </details>
 
 ## 🔐 Data Privacy