holmesgpt 0.13.3a0__py3-none-any.whl → 0.14.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt might be problematic.

Files changed (82)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +10 -2
  3. holmes/common/env_vars.py +8 -1
  4. holmes/config.py +66 -139
  5. holmes/core/investigation.py +1 -2
  6. holmes/core/llm.py +256 -51
  7. holmes/core/models.py +2 -0
  8. holmes/core/safeguards.py +4 -4
  9. holmes/core/supabase_dal.py +14 -8
  10. holmes/core/tool_calling_llm.py +193 -176
  11. holmes/core/tools.py +260 -25
  12. holmes/core/tools_utils/data_types.py +81 -0
  13. holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
  14. holmes/core/tools_utils/tool_executor.py +2 -2
  15. holmes/core/toolset_manager.py +150 -3
  16. holmes/core/tracing.py +6 -1
  17. holmes/core/transformers/__init__.py +23 -0
  18. holmes/core/transformers/base.py +62 -0
  19. holmes/core/transformers/llm_summarize.py +174 -0
  20. holmes/core/transformers/registry.py +122 -0
  21. holmes/core/transformers/transformer.py +31 -0
  22. holmes/main.py +5 -0
  23. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  24. holmes/plugins/toolsets/aks.yaml +64 -0
  25. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
  26. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
  27. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
  28. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
  29. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  30. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
  31. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
  32. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
  33. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
  34. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
  35. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
  36. holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
  37. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  38. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  39. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +16 -17
  40. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +9 -10
  41. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +21 -22
  42. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +8 -8
  43. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +18 -19
  44. holmes/plugins/toolsets/git.py +22 -22
  45. holmes/plugins/toolsets/grafana/common.py +14 -2
  46. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +473 -0
  47. holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
  48. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +3 -3
  49. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  50. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +662 -290
  51. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  52. holmes/plugins/toolsets/internet/internet.py +3 -3
  53. holmes/plugins/toolsets/internet/notion.py +3 -3
  54. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  55. holmes/plugins/toolsets/kafka.py +18 -18
  56. holmes/plugins/toolsets/kubernetes.yaml +58 -0
  57. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  58. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  59. holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
  60. holmes/plugins/toolsets/newrelic.py +8 -8
  61. holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
  62. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  63. holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
  64. holmes/plugins/toolsets/prometheus/prometheus.py +172 -39
  65. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +25 -0
  66. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  67. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
  68. holmes/plugins/toolsets/robusta/robusta.py +10 -10
  69. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
  70. holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
  71. holmes/plugins/toolsets/utils.py +88 -0
  72. holmes/utils/config_utils.py +91 -0
  73. holmes/utils/env.py +7 -0
  74. holmes/utils/holmes_status.py +2 -1
  75. holmes/utils/sentry_helper.py +41 -0
  76. holmes/utils/stream.py +9 -0
  77. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/METADATA +10 -14
  78. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/RECORD +81 -71
  79. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  80. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/LICENSE.txt +0 -0
  81. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/WHEEL +0 -0
  82. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/entry_points.txt +0 -0

holmes/plugins/toolsets/opensearch/opensearch_traces.py

@@ -18,7 +18,7 @@ from holmes.plugins.toolsets.opensearch.opensearch_utils import (
     add_auth_header,
     get_search_url,
 )
-from holmes.core.tools import StructuredToolResult, ToolResultStatus
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
 from holmes.plugins.toolsets.utils import get_param_or_raise, toolset_name_for_one_liner
 
 TRACES_FIELDS_CACHE_KEY = "cached_traces_fields"
@@ -48,7 +48,7 @@ class GetTracesFields(Tool):
         if cached_response:
             logging.debug("traces fields returned from cache")
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=cached_response,
                 params=params,
             )
@@ -81,7 +81,7 @@ class GetTracesFields(Tool):
             if self._cache:
                 self._cache[TRACES_FIELDS_CACHE_KEY] = response
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=response,
                 params=params,
             )
@@ -90,21 +90,21 @@ class GetTracesFields(Tool):
                 "Timeout while fetching opensearch traces fields", exc_info=True
             )
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching opensearch traces fields",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch opensearch traces fields", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while opensearch traces fields: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warning("Failed to process opensearch traces fields", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -157,7 +157,7 @@ class TracesSearchQuery(Tool):
 
             logs_response.raise_for_status()
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=json.dumps(logs_response.json()),
                 params=params,
             )
@@ -166,14 +166,14 @@
                 "Timeout while fetching opensearch traces search", exc_info=True
             )
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Request timed out while fetching opensearch traces search {err_msg}",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch opensearch traces search", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while opensearch traces search {err_msg} : {str(e)}",
                 params=params,
             )
@@ -182,7 +182,7 @@
                 "Failed to process opensearch traces search ", exc_info=True
             )
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error {err_msg}: {str(e)}",
                 params=params,
             )
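
The bulk of this release's churn is a mechanical rename: the status enum exported by `holmes.core.tools` changes from `ToolResultStatus` to `StructuredToolResultStatus`, and every call site is updated accordingly. A minimal before/after sketch of one call site (the surrounding tool class is elided; the `data` and `params` values are placeholders):

```python
# Before (0.13.3a0):
# from holmes.core.tools import StructuredToolResult, ToolResultStatus
# result = StructuredToolResult(status=ToolResultStatus.SUCCESS, ...)

# After (0.14.1a0):
from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus

result = StructuredToolResult(
    status=StructuredToolResultStatus.SUCCESS,  # was ToolResultStatus.SUCCESS
    data="...",
    params={},
)
```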

holmes/plugins/toolsets/prometheus/prometheus.py

@@ -17,11 +17,12 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-    ToolResultStatus,
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
 from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
+from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
 from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
 from holmes.plugins.toolsets.utils import (
     get_param_or_raise,
@@ -55,6 +56,9 @@ class PrometheusConfig(BaseModel):
     rules_cache_duration_seconds: Union[int, None] = 1800  # 30 minutes
     additional_labels: Optional[Dict[str, str]] = None
     prometheus_ssl_enabled: bool = True
+    query_response_size_limit: Optional[int] = (
+        80000  # Limit the max number of characters in a query result to proactively prevent truncation and advise LLM to query less data
+    )
 
     @field_validator("prometheus_url")
     def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
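
The new limit is an ordinary optional Pydantic field, so it can be raised, lowered, or disabled per deployment. A hedged sketch, using only fields visible in this diff (the URL is hypothetical, and other `PrometheusConfig` fields are elided):

```python
# Hypothetical tuning example; only fields shown in this diff are used.
config = PrometheusConfig(
    prometheus_url="http://prometheus.example.svc:9090/",  # hypothetical URL
    query_response_size_limit=40_000,  # tighten the 80,000-char default
)

# Setting the limit to None (or 0) disables the size gate entirely, because
# the guard shown further down reads:
#     if self.toolset.config.query_response_size_limit and data_size_chars > ...
unlimited = PrometheusConfig(
    prometheus_url="http://prometheus.example.svc:9090/",
    query_response_size_limit=None,
)
```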
@@ -284,7 +288,7 @@ def result_has_data(result: Dict) -> bool:
 def adjust_step_for_max_points(
     start_timestamp: str,
     end_timestamp: str,
-    step: float,
+    step: Optional[float] = None,
 ) -> float:
     """
     Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
@@ -293,7 +297,7 @@
     Args:
         start_timestamp: RFC3339 formatted start time
         end_timestamp: RFC3339 formatted end time
-        step: The requested step duration in seconds
+        step: The requested step duration in seconds (None for auto-calculation)
 
     Returns:
         Adjusted step value in seconds that ensures points <= max_points
@@ -304,6 +308,14 @@
 
     time_range_seconds = (end_dt - start_dt).total_seconds()
 
+    # If no step provided, calculate a reasonable default
+    # Aim for ~60 data points across the time range (1 per minute for hourly, etc)
+    if step is None:
+        step = max(1, time_range_seconds / 60)
+        logging.debug(
+            f"No step provided, defaulting to {step}s for {time_range_seconds}s range"
+        )
+
     current_points = time_range_seconds / step
 
     # If current points exceed max, adjust the step
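
To make the new default concrete: when no step is supplied, the function aims for roughly 60 points across the window, so a one-hour range gets a 60-second step and very short ranges are clamped to 1 second. A standalone sketch of just the defaulting branch added above (the max_points clamping that follows it is unchanged and elided):

```python
# Illustration only: mirrors the `if step is None` branch shown in the hunk.
def default_step(time_range_seconds: float) -> float:
    return max(1, time_range_seconds / 60)

assert default_step(3600) == 60.0     # 1h range  -> 60s step (~60 points)
assert default_step(86400) == 1440.0  # 24h range -> 24-minute step
assert default_step(30) == 1          # 30s range -> clamped to 1s
```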
@@ -324,6 +336,79 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
     return results
 
 
+def create_data_summary_for_large_result(
+    result_data: Dict, query: str, data_size_chars: int, is_range_query: bool = False
+) -> Dict[str, Any]:
+    """
+    Create a summary for large Prometheus results instead of returning full data.
+
+    Args:
+        result_data: The Prometheus data result
+        query: The original PromQL query
+        data_size_chars: Size of the data in characters
+        is_range_query: Whether this is a range query (vs instant query)
+
+    Returns:
+        Dictionary with summary information and suggestions
+    """
+    if is_range_query:
+        series_list = result_data.get("result", [])
+        num_items = len(series_list)
+
+        # Calculate statistics for range queries
+        total_points = 0
+        for series in series_list[:10]:  # Sample first 10 series
+            points = len(series.get("values", []))
+            total_points += points
+
+        avg_points_per_series = (
+            total_points / min(10, num_items) if num_items > 0 else 0
+        )
+        estimated_total_points = avg_points_per_series * num_items
+
+        # Create a sample of just the metadata (labels) without values
+        sample_metrics = []
+        for series in series_list[:10]:  # Sample first 10 series
+            sample_metrics.append(series.get("metric", {}))
+
+        sample_json = json.dumps(sample_metrics, indent=2)
+        if len(sample_json) > 2000:
+            sample_json = sample_json[:2000] + "\n... (truncated)"
+
+        return {
+            "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} time series with approximately {estimated_total_points:,.0f} total data points.",
+            "series_count": num_items,
+            "estimated_total_points": int(estimated_total_points),
+            "data_size_characters": data_size_chars,
+            "sample_data": sample_json,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
+        }
+    else:
+        # Instant query
+        result_type = result_data.get("resultType", "")
+        result_list = result_data.get("result", [])
+        num_items = len(result_list)
+
+        # Create a sample of just the metadata (labels) without values
+        sample_metrics = []
+        for item in result_list[:10]:  # Sample first 10 results
+            if isinstance(item, dict):
+                sample_metrics.append(item.get("metric", {}))
+
+        sample_json = json.dumps(sample_metrics, indent=2)
+        if len(sample_json) > 2000:
+            sample_json = sample_json[:2000] + "\n... (truncated)"
+
+        return {
+            "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} results.",
+            "result_count": num_items,
+            "result_type": result_type,
+            "data_size_characters": data_size_chars,
+            "sample_data": sample_json,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
+        }
+
+
 def fetch_metrics_labels_with_series_api(
     prometheus_url: str,
     headers: Dict[str, str],
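
For a sense of what the model receives instead of raw samples, here is a sketch of calling the new helper on a synthetic range result; all labels, the query, and the sizes are made up for illustration:

```python
# Synthetic input: 50 series, 120 samples each.
result_data = {
    "resultType": "matrix",
    "result": [
        {
            "metric": {"pod": f"web-{i}"},
            "values": [[1700000000 + 60 * j, "1"] for j in range(120)],
        }
        for i in range(50)
    ],
}

summary = create_data_summary_for_large_result(
    result_data,
    query="rate(http_requests_total[5m])",  # hypothetical query
    data_size_chars=250_000,
    is_range_query=True,
)
# summary["series_count"] == 50
# summary["estimated_total_points"] == 6000  (120 sampled points x 50 series)
# summary["suggestion"] recommends wrapping the query in topk(5, ...)
```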
@@ -496,13 +581,13 @@ class ListPrometheusRules(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
         if self.toolset.config.is_amp():
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool not supported in AMP",
                 params=params,
             )
@@ -515,7 +600,7 @@
                 logging.debug("rules returned from cache")
 
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=cached_rules,
                     params=params,
                 )
@@ -539,28 +624,28 @@
             if self._cache:
                 self._cache.set(PROMETHEUS_RULES_CACHE_KEY, data)
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=data,
                 params=params,
             )
         except requests.Timeout:
             logging.warning("Timeout while fetching prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching rules",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching rules: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warning("Failed to process prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -595,7 +680,7 @@ class ListAvailableMetrics(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -612,7 +697,7 @@
         name_filter = params.get("name_filter")
         if not name_filter:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Error: cannot run tool 'list_available_metrics'. The param 'name_filter' is required but is missing.",
                 params=params,
             )
@@ -646,7 +731,7 @@
 
             table_output = "\n".join(output)
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=table_output,
                 params=params,
             )
@@ -654,21 +739,21 @@
         except requests.Timeout:
             logging.warn("Timeout while fetching prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching metrics",
                 params=params,
             )
         except RequestException as e:
             logging.warn("Failed to fetch prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching metrics: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warn("Failed to process prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -703,7 +788,7 @@ class ExecuteInstantQuery(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -743,12 +828,39 @@
                     "query": query,
                 }
 
+                # Check if data should be included based on size
                 if self.toolset.config.tool_calls_return_data:
-                    response_data["data"] = data.get("data")
+                    result_data = data.get("data", {})
+
+                    # Estimate the size of the data
+                    data_str_preview = json.dumps(result_data)
+                    data_size_chars = len(data_str_preview)
+
+                    # Provide summary if data is too large
+                    if (
+                        self.toolset.config.query_response_size_limit
+                        and data_size_chars
+                        > self.toolset.config.query_response_size_limit
+                    ):
+                        response_data["data_summary"] = (
+                            create_data_summary_for_large_result(
+                                result_data,
+                                query,
+                                data_size_chars,
+                                is_range_query=False,
+                            )
+                        )
+                        logging.info(
+                            f"Prometheus instant query returned large dataset: "
+                            f"{response_data['data_summary'].get('result_count', 0)} results, "
+                            f"{data_size_chars:,} characters. Returning summary instead of full data."
+                        )
+                    else:
+                        response_data["data"] = result_data
 
                 data_str = json.dumps(response_data, indent=2)
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=data_str,
                     params=params,
                 )
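
The net effect on the tool payload: the JSON handed back to the LLM carries either a `data` key (small results, inlined as before) or a `data_summary` key (large results), never both. A schematic sketch; keys beyond those visible in this diff are elided and the values are illustrative:

```python
# Under the limit: full Prometheus data is inlined.
small_payload = {
    "query": "up",
    "data": {"resultType": "vector", "result": []},
}

# Over the limit: data is replaced by the helper's summary.
large_payload = {
    "query": "rate(container_cpu_usage_seconds_total[5m])",
    "data_summary": {
        "message": "Data too large to return (250,000 characters). Query returned 5000 results.",
        "result_count": 5000,
        "result_type": "vector",
        "data_size_characters": 250000,
        "sample_data": "[ ...first ten label sets... ]",
        "suggestion": "Consider using topk(5, rate(container_cpu_usage_seconds_total[5m])) to limit results. ...",
    },
}
```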
@@ -764,14 +876,14 @@
                 except json.JSONDecodeError:
                     pass
                 return StructuredToolResult(
-                    status=ToolResultStatus.ERROR,
+                    status=StructuredToolResultStatus.ERROR,
                     error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                     params=params,
                 )
 
             # For other status codes, just return the status code and content
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                 params=params,
             )
@@ -779,14 +891,14 @@
         except RequestException as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Connection error to Prometheus: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error executing query: {str(e)}",
                 params=params,
             )
@@ -827,7 +939,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
             "step": ToolParameter(
                 description="Query resolution step width in duration format or float number of seconds",
                 type="number",
-                required=True,
+                required=False,
             ),
             "output_type": ToolParameter(
                 description="Specifies how to interpret the Prometheus result. Use 'Plain' for raw values, 'Bytes' to format byte values, 'Percentage' to scale 0–1 values into 0–100%, or 'CPUUsage' to convert values to cores (e.g., 500 becomes 500m, 2000 becomes 2).",
@@ -843,7 +955,7 @@
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -857,12 +969,13 @@
                 end_timestamp=params.get("end"),
                 default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
             )
-            step = params.get("step", "")
+            step = parse_duration_to_seconds(params.get("step"))
 
+            # adjust_step_for_max_points handles None case and converts to float
             step = adjust_step_for_max_points(
                 start_timestamp=start,
                 end_timestamp=end,
-                step=float(step) if step else MAX_GRAPH_POINTS,
+                step=step,
             )
 
             description = params.get("description", "")
@@ -906,12 +1019,37 @@
                     "output_type": output_type,
                 }
 
+                # Check if data should be included based on size
                 if self.toolset.config.tool_calls_return_data:
-                    response_data["data"] = data.get("data")
+                    result_data = data.get("data", {})
+
+                    # Estimate the size of the data
+                    data_str_preview = json.dumps(result_data)
+                    data_size_chars = len(data_str_preview)
+
+                    # Provide summary if data is too large
+                    if (
+                        self.toolset.config.query_response_size_limit
+                        and data_size_chars
+                        > self.toolset.config.query_response_size_limit
+                    ):
+                        response_data["data_summary"] = (
+                            create_data_summary_for_large_result(
+                                result_data, query, data_size_chars, is_range_query=True
+                            )
+                        )
+                        logging.info(
+                            f"Prometheus range query returned large dataset: "
+                            f"{response_data['data_summary'].get('series_count', 0)} series, "
+                            f"{data_size_chars:,} characters. Returning summary instead of full data."
+                        )
+                    else:
+                        response_data["data"] = result_data
+
                 data_str = json.dumps(response_data, indent=2)
 
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=data_str,
                     params=params,
                 )
@@ -926,13 +1064,13 @@
                 except json.JSONDecodeError:
                     pass
                 return StructuredToolResult(
-                    status=ToolResultStatus.ERROR,
+                    status=StructuredToolResultStatus.ERROR,
                     error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                     params=params,
                 )
 
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                 params=params,
             )
@@ -940,14 +1078,14 @@
         except RequestException as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Connection error to Prometheus: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error executing query: {str(e)}",
                 params=params,
             )
@@ -1060,13 +1198,8 @@ class PrometheusToolset(Toolset):
                 f"Failed to connect to Prometheus at {url}: HTTP {response.status_code}",
             )
 
-        except RequestException:
-            return (
-                False,
-                f"Failed to initialize using url={url}",
-            )
         except Exception as e:
-            logging.exception("Failed to initialize Prometheus")
+            logging.exception("Failed to initialize Prometheus", exc_info=True)
             return (
                 False,
                 f"Failed to initialize using url={url}. Unexpected error: {str(e)}",

holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2

@@ -19,6 +19,31 @@
 * Only generate and execute a prometheus query after checking what metrics are available with the `list_available_metrics` tool
 * Check that any node, service, pod, container, app, namespace, etc. mentioned in the query exist in the kubernetes cluster before making a query. Use any appropriate kubectl tool(s) for this
 * The toolcall will return no data to you. That is expected. You MUST however ensure that the query is successful.
+
+## Handling High-Cardinality Metrics
+* CRITICAL: When querying metrics that may return many time series (>10), ALWAYS use aggregation to limit results
+* ALWAYS use `topk()` or `bottomk()` to limit the number of series returned
+* Standard pattern for high-cardinality queries:
+  - Use `topk(5, <your_query>)` to get the top 5 series
+  - Example: `topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))`
+  - This prevents context overflow and focuses on the most relevant data
+* To also capture the aggregate of remaining series as "other":
+  ```
+  topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))
+  or
+  label_replace(
+    (sum(rate(container_cpu_usage_seconds_total{namespace="default"}[5m])) - sum(topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m])))),
+    "pod", "other", "", ""
+  )
+  ```
+* Common high-cardinality scenarios requiring topk():
+  - Pod-level metrics in namespaces with many pods
+  - Container-level CPU/memory metrics
+  - HTTP metrics with many endpoints or status codes
+  - Any query returning more than 10 time series
+* For initial exploration, use instant queries with `count()` to check cardinality:
+  - Example: `count(count by (pod) (container_cpu_usage_seconds_total{namespace="default"}))`
+  - If count > 10, use topk() in your range query
 * When doing queries, always extend the time range, to 15 min before and after the alert start time
 * ALWAYS embed the execution results into your answer
 * ALWAYS embed a Prometheus graph in the response. The graph should visualize data related to the incident.

holmes/plugins/toolsets/prometheus/utils.py (new file)

@@ -0,0 +1,28 @@
+import re
+from typing import Optional, Union
+
+
+def parse_duration_to_seconds(v: Optional[Union[str, float, int]]) -> Optional[float]:
+    if v is None:
+        return None
+    if isinstance(v, (int, float)):
+        return float(v)
+    s = v.strip().lower()
+    if s.isdigit():
+        return float(int(s))
+
+    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
+
+    # Check for partial time formats (e.g., 1h30m, 5m12s, 1d2h30m)
+    pattern = r"(\d+(?:\.\d+)?)(d|h|m|s)"
+    matches = re.findall(pattern, s)
+
+    if matches:
+        total_seconds = 0.0
+        for value_str, unit in matches:
+            value = float(value_str)
+            total_seconds += value * units[unit]
+        return float(int(total_seconds))
+
+    # fallback: try float seconds
+    return float(s)
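
The helper accepts plain seconds (numbers or digit strings) as well as Prometheus-style duration strings, including compound ones. A few illustrative calls; the expected values follow directly from the code above:

```python
assert parse_duration_to_seconds(None) is None        # step omitted by the model
assert parse_duration_to_seconds(30) == 30.0          # numeric passthrough
assert parse_duration_to_seconds("90") == 90.0        # digit string
assert parse_duration_to_seconds("5m") == 300.0       # single unit
assert parse_duration_to_seconds("1h30m") == 5400.0   # compound duration
assert parse_duration_to_seconds("1d2h") == 93600.0   # days + hours

# Note: unit-suffixed totals are truncated to whole seconds via float(int(...)),
# so fractional values round down.
assert parse_duration_to_seconds("1.5s") == 1.0
```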

holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py

@@ -8,7 +8,7 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-    ToolResultStatus,
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
@@ -79,7 +79,7 @@ class ListConfiguredClusters(BaseRabbitMQTool):
             if c.connection_status == ClusterConnectionStatus.SUCCESS
         ]
         return StructuredToolResult(
-            status=ToolResultStatus.SUCCESS, data=available_clusters
+            status=StructuredToolResultStatus.SUCCESS, data=available_clusters
         )
 
     def get_parameterized_one_liner(self, params) -> str:
@@ -112,12 +112,14 @@ class GetRabbitMQClusterStatus(BaseRabbitMQTool):
                 cluster_id=params.get("cluster_id")
             )
             result = get_cluster_status(cluster_config)
-            return StructuredToolResult(status=ToolResultStatus.SUCCESS, data=result)
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS, data=result
+            )
 
         except Exception as e:
             logging.info("Failed to process RabbitMQ cluster status", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error fetching RabbitMQ cluster status: {str(e)}",
                 data=None,
             )