holmesgpt 0.13.3a0__py3-none-any.whl → 0.14.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt might be problematic.

Files changed (82)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +10 -2
  3. holmes/common/env_vars.py +8 -1
  4. holmes/config.py +66 -139
  5. holmes/core/investigation.py +1 -2
  6. holmes/core/llm.py +256 -51
  7. holmes/core/models.py +2 -0
  8. holmes/core/safeguards.py +4 -4
  9. holmes/core/supabase_dal.py +14 -8
  10. holmes/core/tool_calling_llm.py +193 -176
  11. holmes/core/tools.py +260 -25
  12. holmes/core/tools_utils/data_types.py +81 -0
  13. holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
  14. holmes/core/tools_utils/tool_executor.py +2 -2
  15. holmes/core/toolset_manager.py +150 -3
  16. holmes/core/tracing.py +6 -1
  17. holmes/core/transformers/__init__.py +23 -0
  18. holmes/core/transformers/base.py +62 -0
  19. holmes/core/transformers/llm_summarize.py +174 -0
  20. holmes/core/transformers/registry.py +122 -0
  21. holmes/core/transformers/transformer.py +31 -0
  22. holmes/main.py +5 -0
  23. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  24. holmes/plugins/toolsets/aks.yaml +64 -0
  25. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
  26. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
  27. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
  28. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
  29. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  30. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
  31. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
  32. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
  33. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
  34. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
  35. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
  36. holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
  37. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  38. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  39. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +16 -17
  40. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +9 -10
  41. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +21 -22
  42. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +8 -8
  43. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +18 -19
  44. holmes/plugins/toolsets/git.py +22 -22
  45. holmes/plugins/toolsets/grafana/common.py +14 -2
  46. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +473 -0
  47. holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
  48. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +3 -3
  49. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  50. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +662 -290
  51. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  52. holmes/plugins/toolsets/internet/internet.py +3 -3
  53. holmes/plugins/toolsets/internet/notion.py +3 -3
  54. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  55. holmes/plugins/toolsets/kafka.py +18 -18
  56. holmes/plugins/toolsets/kubernetes.yaml +58 -0
  57. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  58. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  59. holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
  60. holmes/plugins/toolsets/newrelic.py +8 -8
  61. holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
  62. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  63. holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
  64. holmes/plugins/toolsets/prometheus/prometheus.py +172 -39
  65. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +25 -0
  66. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  67. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
  68. holmes/plugins/toolsets/robusta/robusta.py +10 -10
  69. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
  70. holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
  71. holmes/plugins/toolsets/utils.py +88 -0
  72. holmes/utils/config_utils.py +91 -0
  73. holmes/utils/env.py +7 -0
  74. holmes/utils/holmes_status.py +2 -1
  75. holmes/utils/sentry_helper.py +41 -0
  76. holmes/utils/stream.py +9 -0
  77. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/METADATA +10 -14
  78. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/RECORD +81 -71
  79. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  80. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/LICENSE.txt +0 -0
  81. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/WHEEL +0 -0
  82. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/entry_points.txt +0 -0

holmes/plugins/toolsets/opensearch/opensearch_traces.py

@@ -18,7 +18,7 @@ from holmes.plugins.toolsets.opensearch.opensearch_utils import (
     add_auth_header,
     get_search_url,
 )
-from holmes.core.tools import StructuredToolResult, ToolResultStatus
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
 from holmes.plugins.toolsets.utils import get_param_or_raise, toolset_name_for_one_liner
 
 TRACES_FIELDS_CACHE_KEY = "cached_traces_fields"
@@ -48,7 +48,7 @@ class GetTracesFields(Tool):
         if cached_response:
             logging.debug("traces fields returned from cache")
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=cached_response,
                 params=params,
             )
@@ -81,7 +81,7 @@ class GetTracesFields(Tool):
             if self._cache:
                 self._cache[TRACES_FIELDS_CACHE_KEY] = response
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=response,
                 params=params,
             )
@@ -90,21 +90,21 @@ class GetTracesFields(Tool):
                 "Timeout while fetching opensearch traces fields", exc_info=True
             )
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching opensearch traces fields",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch opensearch traces fields", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while opensearch traces fields: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warning("Failed to process opensearch traces fields", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -157,7 +157,7 @@ class TracesSearchQuery(Tool):
 
             logs_response.raise_for_status()
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=json.dumps(logs_response.json()),
                 params=params,
             )
@@ -166,14 +166,14 @@
                 "Timeout while fetching opensearch traces search", exc_info=True
             )
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Request timed out while fetching opensearch traces search {err_msg}",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch opensearch traces search", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while opensearch traces search {err_msg} : {str(e)}",
                 params=params,
             )
@@ -182,7 +182,7 @@
                 "Failed to process opensearch traces search ", exc_info=True
             )
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error {err_msg}: {str(e)}",
                 params=params,
             )
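
The bulk of this release's churn is a mechanical rename: the status enum exported by `holmes.core.tools` changes from `ToolResultStatus` to `StructuredToolResultStatus`, and every call site is updated accordingly. A minimal before/after sketch of one call site (the surrounding tool class is elided; the `data` and `params` values are placeholders):

```python
# Before (0.13.3a0):
# from holmes.core.tools import StructuredToolResult, ToolResultStatus
# result = StructuredToolResult(status=ToolResultStatus.SUCCESS, ...)

# After (0.14.1a0):
from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus

result = StructuredToolResult(
    status=StructuredToolResultStatus.SUCCESS,  # was ToolResultStatus.SUCCESS
    data="...",
    params={},
)
```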

holmes/plugins/toolsets/prometheus/prometheus.py

@@ -17,11 +17,12 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-    ToolResultStatus,
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
 from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
+from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
 from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
 from holmes.plugins.toolsets.utils import (
     get_param_or_raise,
@@ -55,6 +56,9 @@ class PrometheusConfig(BaseModel):
     rules_cache_duration_seconds: Union[int, None] = 1800  # 30 minutes
     additional_labels: Optional[Dict[str, str]] = None
     prometheus_ssl_enabled: bool = True
+    query_response_size_limit: Optional[int] = (
+        80000  # Limit the max number of characters in a query result to proactively prevent truncation and advise LLM to query less data
+    )
 
     @field_validator("prometheus_url")
     def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
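
The new limit is an ordinary optional Pydantic field, so it can be raised, lowered, or disabled per deployment. A hedged sketch, using only fields visible in this diff (the URL is hypothetical, and other `PrometheusConfig` fields are elided):

```python
# Hypothetical tuning example; only fields shown in this diff are used.
config = PrometheusConfig(
    prometheus_url="http://prometheus.example.svc:9090/",  # hypothetical URL
    query_response_size_limit=40_000,  # tighten the 80,000-char default
)

# Setting the limit to None (or 0) disables the size gate entirely, because
# the guard shown further down reads:
#     if self.toolset.config.query_response_size_limit and data_size_chars > ...
unlimited = PrometheusConfig(
    prometheus_url="http://prometheus.example.svc:9090/",
    query_response_size_limit=None,
)
```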
@@ -284,7 +288,7 @@ def result_has_data(result: Dict) -> bool:
 def adjust_step_for_max_points(
     start_timestamp: str,
     end_timestamp: str,
-    step: float,
+    step: Optional[float] = None,
 ) -> float:
     """
     Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
@@ -293,7 +297,7 @@
     Args:
         start_timestamp: RFC3339 formatted start time
         end_timestamp: RFC3339 formatted end time
-        step: The requested step duration in seconds
+        step: The requested step duration in seconds (None for auto-calculation)
 
     Returns:
         Adjusted step value in seconds that ensures points <= max_points
@@ -304,6 +308,14 @@
 
     time_range_seconds = (end_dt - start_dt).total_seconds()
 
+    # If no step provided, calculate a reasonable default
+    # Aim for ~60 data points across the time range (1 per minute for hourly, etc)
+    if step is None:
+        step = max(1, time_range_seconds / 60)
+        logging.debug(
+            f"No step provided, defaulting to {step}s for {time_range_seconds}s range"
+        )
+
     current_points = time_range_seconds / step
 
     # If current points exceed max, adjust the step
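
To make the new default concrete: when no step is supplied, the function aims for roughly 60 points across the window, so a one-hour range gets a 60-second step and very short ranges are clamped to 1 second. A standalone sketch of just the defaulting branch added above (the max_points clamping that follows it is unchanged and elided):

```python
# Illustration only: mirrors the `if step is None` branch shown in the hunk.
def default_step(time_range_seconds: float) -> float:
    return max(1, time_range_seconds / 60)

assert default_step(3600) == 60.0     # 1h range  -> 60s step (~60 points)
assert default_step(86400) == 1440.0  # 24h range -> 24-minute step
assert default_step(30) == 1          # 30s range -> clamped to 1s
```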
@@ -324,6 +336,79 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
     return results
 
 
+def create_data_summary_for_large_result(
+    result_data: Dict, query: str, data_size_chars: int, is_range_query: bool = False
+) -> Dict[str, Any]:
+    """
+    Create a summary for large Prometheus results instead of returning full data.
+
+    Args:
+        result_data: The Prometheus data result
+        query: The original PromQL query
+        data_size_chars: Size of the data in characters
+        is_range_query: Whether this is a range query (vs instant query)
+
+    Returns:
+        Dictionary with summary information and suggestions
+    """
+    if is_range_query:
+        series_list = result_data.get("result", [])
+        num_items = len(series_list)
+
+        # Calculate statistics for range queries
+        total_points = 0
+        for series in series_list[:10]:  # Sample first 10 series
+            points = len(series.get("values", []))
+            total_points += points
+
+        avg_points_per_series = (
+            total_points / min(10, num_items) if num_items > 0 else 0
+        )
+        estimated_total_points = avg_points_per_series * num_items
+
+        # Create a sample of just the metadata (labels) without values
+        sample_metrics = []
+        for series in series_list[:10]:  # Sample first 10 series
+            sample_metrics.append(series.get("metric", {}))
+
+        sample_json = json.dumps(sample_metrics, indent=2)
+        if len(sample_json) > 2000:
+            sample_json = sample_json[:2000] + "\n... (truncated)"
+
+        return {
+            "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} time series with approximately {estimated_total_points:,.0f} total data points.",
+            "series_count": num_items,
+            "estimated_total_points": int(estimated_total_points),
+            "data_size_characters": data_size_chars,
+            "sample_data": sample_json,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
+        }
+    else:
+        # Instant query
+        result_type = result_data.get("resultType", "")
+        result_list = result_data.get("result", [])
+        num_items = len(result_list)
+
+        # Create a sample of just the metadata (labels) without values
+        sample_metrics = []
+        for item in result_list[:10]:  # Sample first 10 results
+            if isinstance(item, dict):
+                sample_metrics.append(item.get("metric", {}))
+
+        sample_json = json.dumps(sample_metrics, indent=2)
+        if len(sample_json) > 2000:
+            sample_json = sample_json[:2000] + "\n... (truncated)"
+
+        return {
+            "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} results.",
+            "result_count": num_items,
+            "result_type": result_type,
+            "data_size_characters": data_size_chars,
+            "sample_data": sample_json,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
+        }
+
+
 def fetch_metrics_labels_with_series_api(
     prometheus_url: str,
     headers: Dict[str, str],
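
For a sense of what the model receives instead of raw samples, here is a sketch of calling the new helper on a synthetic range result; all labels, the query, and the sizes are made up for illustration:

```python
# Synthetic input: 50 series, 120 samples each.
result_data = {
    "resultType": "matrix",
    "result": [
        {
            "metric": {"pod": f"web-{i}"},
            "values": [[1700000000 + 60 * j, "1"] for j in range(120)],
        }
        for i in range(50)
    ],
}

summary = create_data_summary_for_large_result(
    result_data,
    query="rate(http_requests_total[5m])",  # hypothetical query
    data_size_chars=250_000,
    is_range_query=True,
)
# summary["series_count"] == 50
# summary["estimated_total_points"] == 6000  (120 sampled points x 50 series)
# summary["suggestion"] recommends wrapping the query in topk(5, ...)
```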
@@ -496,13 +581,13 @@ class ListPrometheusRules(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
         if self.toolset.config.is_amp():
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool not supported in AMP",
                 params=params,
             )
@@ -515,7 +600,7 @@
                 logging.debug("rules returned from cache")
 
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=cached_rules,
                     params=params,
                 )
@@ -539,28 +624,28 @@
             if self._cache:
                 self._cache.set(PROMETHEUS_RULES_CACHE_KEY, data)
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=data,
                 params=params,
             )
         except requests.Timeout:
             logging.warning("Timeout while fetching prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching rules",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching rules: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warning("Failed to process prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -595,7 +680,7 @@ class ListAvailableMetrics(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -612,7 +697,7 @@
         name_filter = params.get("name_filter")
         if not name_filter:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Error: cannot run tool 'list_available_metrics'. The param 'name_filter' is required but is missing.",
                 params=params,
             )
@@ -646,7 +731,7 @@
 
             table_output = "\n".join(output)
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=table_output,
                 params=params,
             )
@@ -654,21 +739,21 @@
         except requests.Timeout:
             logging.warn("Timeout while fetching prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching metrics",
                 params=params,
             )
         except RequestException as e:
             logging.warn("Failed to fetch prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching metrics: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warn("Failed to process prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -703,7 +788,7 @@ class ExecuteInstantQuery(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -743,12 +828,39 @@
                     "query": query,
                 }
 
+                # Check if data should be included based on size
                 if self.toolset.config.tool_calls_return_data:
-                    response_data["data"] = data.get("data")
+                    result_data = data.get("data", {})
+
+                    # Estimate the size of the data
+                    data_str_preview = json.dumps(result_data)
+                    data_size_chars = len(data_str_preview)
+
+                    # Provide summary if data is too large
+                    if (
+                        self.toolset.config.query_response_size_limit
+                        and data_size_chars
+                        > self.toolset.config.query_response_size_limit
+                    ):
+                        response_data["data_summary"] = (
+                            create_data_summary_for_large_result(
+                                result_data,
+                                query,
+                                data_size_chars,
+                                is_range_query=False,
+                            )
+                        )
+                        logging.info(
+                            f"Prometheus instant query returned large dataset: "
+                            f"{response_data['data_summary'].get('result_count', 0)} results, "
+                            f"{data_size_chars:,} characters. Returning summary instead of full data."
+                        )
+                    else:
+                        response_data["data"] = result_data
 
                 data_str = json.dumps(response_data, indent=2)
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=data_str,
                     params=params,
                 )
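
The net effect on the tool payload: the JSON handed back to the LLM carries either a `data` key (small results, inlined as before) or a `data_summary` key (large results), never both. A schematic sketch; keys beyond those visible in this diff are elided and the values are illustrative:

```python
# Under the limit: full Prometheus data is inlined.
small_payload = {
    "query": "up",
    "data": {"resultType": "vector", "result": []},
}

# Over the limit: data is replaced by the helper's summary.
large_payload = {
    "query": "rate(container_cpu_usage_seconds_total[5m])",
    "data_summary": {
        "message": "Data too large to return (250,000 characters). Query returned 5000 results.",
        "result_count": 5000,
        "result_type": "vector",
        "data_size_characters": 250000,
        "sample_data": "[ ...first ten label sets... ]",
        "suggestion": "Consider using topk(5, rate(container_cpu_usage_seconds_total[5m])) to limit results. ...",
    },
}
```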
@@ -764,14 +876,14 @@
                 except json.JSONDecodeError:
                     pass
                 return StructuredToolResult(
-                    status=ToolResultStatus.ERROR,
+                    status=StructuredToolResultStatus.ERROR,
                     error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                     params=params,
                 )
 
             # For other status codes, just return the status code and content
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                 params=params,
             )
@@ -779,14 +891,14 @@
         except RequestException as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Connection error to Prometheus: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error executing query: {str(e)}",
                 params=params,
             )
@@ -827,7 +939,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
             "step": ToolParameter(
                 description="Query resolution step width in duration format or float number of seconds",
                 type="number",
-                required=True,
+                required=False,
             ),
             "output_type": ToolParameter(
                 description="Specifies how to interpret the Prometheus result. Use 'Plain' for raw values, 'Bytes' to format byte values, 'Percentage' to scale 0–1 values into 0–100%, or 'CPUUsage' to convert values to cores (e.g., 500 becomes 500m, 2000 becomes 2).",
@@ -843,7 +955,7 @@
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -857,12 +969,13 @@
                 end_timestamp=params.get("end"),
                 default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
             )
-            step = params.get("step", "")
+            step = parse_duration_to_seconds(params.get("step"))
 
+            # adjust_step_for_max_points handles None case and converts to float
             step = adjust_step_for_max_points(
                 start_timestamp=start,
                 end_timestamp=end,
-                step=float(step) if step else MAX_GRAPH_POINTS,
+                step=step,
             )
 
             description = params.get("description", "")
@@ -906,12 +1019,37 @@
                     "output_type": output_type,
                 }
 
+                # Check if data should be included based on size
                 if self.toolset.config.tool_calls_return_data:
-                    response_data["data"] = data.get("data")
+                    result_data = data.get("data", {})
+
+                    # Estimate the size of the data
+                    data_str_preview = json.dumps(result_data)
+                    data_size_chars = len(data_str_preview)
+
+                    # Provide summary if data is too large
+                    if (
+                        self.toolset.config.query_response_size_limit
+                        and data_size_chars
+                        > self.toolset.config.query_response_size_limit
+                    ):
+                        response_data["data_summary"] = (
+                            create_data_summary_for_large_result(
+                                result_data, query, data_size_chars, is_range_query=True
+                            )
+                        )
+                        logging.info(
+                            f"Prometheus range query returned large dataset: "
+                            f"{response_data['data_summary'].get('series_count', 0)} series, "
+                            f"{data_size_chars:,} characters. Returning summary instead of full data."
+                        )
+                    else:
+                        response_data["data"] = result_data
+
                 data_str = json.dumps(response_data, indent=2)
 
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=data_str,
                     params=params,
                 )
@@ -926,13 +1064,13 @@
                 except json.JSONDecodeError:
                     pass
                 return StructuredToolResult(
-                    status=ToolResultStatus.ERROR,
+                    status=StructuredToolResultStatus.ERROR,
                     error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                     params=params,
                 )
 
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                 params=params,
             )
@@ -940,14 +1078,14 @@
         except RequestException as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Connection error to Prometheus: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error executing query: {str(e)}",
                 params=params,
             )
@@ -1060,13 +1198,8 @@ class PrometheusToolset(Toolset):
                 f"Failed to connect to Prometheus at {url}: HTTP {response.status_code}",
             )
 
-        except RequestException:
-            return (
-                False,
-                f"Failed to initialize using url={url}",
-            )
         except Exception as e:
-            logging.exception("Failed to initialize Prometheus")
+            logging.exception("Failed to initialize Prometheus", exc_info=True)
             return (
                 False,
                 f"Failed to initialize using url={url}. Unexpected error: {str(e)}",

holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2

@@ -19,6 +19,31 @@
 * Only generate and execute a prometheus query after checking what metrics are available with the `list_available_metrics` tool
 * Check that any node, service, pod, container, app, namespace, etc. mentioned in the query exist in the kubernetes cluster before making a query. Use any appropriate kubectl tool(s) for this
 * The toolcall will return no data to you. That is expected. You MUST however ensure that the query is successful.
+
+## Handling High-Cardinality Metrics
+* CRITICAL: When querying metrics that may return many time series (>10), ALWAYS use aggregation to limit results
+* ALWAYS use `topk()` or `bottomk()` to limit the number of series returned
+* Standard pattern for high-cardinality queries:
+  - Use `topk(5, <your_query>)` to get the top 5 series
+  - Example: `topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))`
+  - This prevents context overflow and focuses on the most relevant data
+* To also capture the aggregate of remaining series as "other":
+  ```
+  topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))
+  or
+  label_replace(
+    (sum(rate(container_cpu_usage_seconds_total{namespace="default"}[5m])) - sum(topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m])))),
+    "pod", "other", "", ""
+  )
+  ```
+* Common high-cardinality scenarios requiring topk():
+  - Pod-level metrics in namespaces with many pods
+  - Container-level CPU/memory metrics
+  - HTTP metrics with many endpoints or status codes
+  - Any query returning more than 10 time series
+* For initial exploration, use instant queries with `count()` to check cardinality:
+  - Example: `count(count by (pod) (container_cpu_usage_seconds_total{namespace="default"}))`
+  - If count > 10, use topk() in your range query
 * When doing queries, always extend the time range, to 15 min before and after the alert start time
 * ALWAYS embed the execution results into your answer
 * ALWAYS embed a Prometheus graph in the response. The graph should visualize data related to the incident.

holmes/plugins/toolsets/prometheus/utils.py (new file)

@@ -0,0 +1,28 @@
+import re
+from typing import Optional, Union
+
+
+def parse_duration_to_seconds(v: Optional[Union[str, float, int]]) -> Optional[float]:
+    if v is None:
+        return None
+    if isinstance(v, (int, float)):
+        return float(v)
+    s = v.strip().lower()
+    if s.isdigit():
+        return float(int(s))
+
+    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
+
+    # Check for partial time formats (e.g., 1h30m, 5m12s, 1d2h30m)
+    pattern = r"(\d+(?:\.\d+)?)(d|h|m|s)"
+    matches = re.findall(pattern, s)
+
+    if matches:
+        total_seconds = 0.0
+        for value_str, unit in matches:
+            value = float(value_str)
+            total_seconds += value * units[unit]
+        return float(int(total_seconds))
+
+    # fallback: try float seconds
+    return float(s)
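
The helper accepts plain seconds (numbers or digit strings) as well as Prometheus-style duration strings, including compound ones. A few illustrative calls; the expected values follow directly from the code above:

```python
assert parse_duration_to_seconds(None) is None        # step omitted by the model
assert parse_duration_to_seconds(30) == 30.0          # numeric passthrough
assert parse_duration_to_seconds("90") == 90.0        # digit string
assert parse_duration_to_seconds("5m") == 300.0       # single unit
assert parse_duration_to_seconds("1h30m") == 5400.0   # compound duration
assert parse_duration_to_seconds("1d2h") == 93600.0   # days + hours

# Note: unit-suffixed totals are truncated to whole seconds via float(int(...)),
# so fractional values round down.
assert parse_duration_to_seconds("1.5s") == 1.0
```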

holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py

@@ -8,7 +8,7 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-    ToolResultStatus,
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
@@ -79,7 +79,7 @@ class ListConfiguredClusters(BaseRabbitMQTool):
             if c.connection_status == ClusterConnectionStatus.SUCCESS
         ]
         return StructuredToolResult(
-            status=ToolResultStatus.SUCCESS, data=available_clusters
+            status=StructuredToolResultStatus.SUCCESS, data=available_clusters
         )
 
     def get_parameterized_one_liner(self, params) -> str:
@@ -112,12 +112,14 @@ class GetRabbitMQClusterStatus(BaseRabbitMQTool):
                 cluster_id=params.get("cluster_id")
             )
             result = get_cluster_status(cluster_config)
-            return StructuredToolResult(status=ToolResultStatus.SUCCESS, data=result)
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS, data=result
+            )
 
         except Exception as e:
             logging.info("Failed to process RabbitMQ cluster status", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error fetching RabbitMQ cluster status: {str(e)}",
                 data=None,
             )