holmesgpt 0.13.3a0__py3-none-any.whl → 0.14.1a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of holmesgpt has been flagged as potentially problematic.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +10 -2
- holmes/common/env_vars.py +8 -1
- holmes/config.py +66 -139
- holmes/core/investigation.py +1 -2
- holmes/core/llm.py +256 -51
- holmes/core/models.py +2 -0
- holmes/core/safeguards.py +4 -4
- holmes/core/supabase_dal.py +14 -8
- holmes/core/tool_calling_llm.py +193 -176
- holmes/core/tools.py +260 -25
- holmes/core/tools_utils/data_types.py +81 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
- holmes/core/tools_utils/tool_executor.py +2 -2
- holmes/core/toolset_manager.py +150 -3
- holmes/core/tracing.py +6 -1
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +62 -0
- holmes/core/transformers/llm_summarize.py +174 -0
- holmes/core/transformers/registry.py +122 -0
- holmes/core/transformers/transformer.py +31 -0
- holmes/main.py +5 -0
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
- holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
- holmes/plugins/toolsets/bash/common/bash.py +7 -7
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +16 -17
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +9 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +21 -22
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +8 -8
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +18 -19
- holmes/plugins/toolsets/git.py +22 -22
- holmes/plugins/toolsets/grafana/common.py +14 -2
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +473 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +3 -3
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +662 -290
- holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
- holmes/plugins/toolsets/internet/internet.py +3 -3
- holmes/plugins/toolsets/internet/notion.py +3 -3
- holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
- holmes/plugins/toolsets/kafka.py +18 -18
- holmes/plugins/toolsets/kubernetes.yaml +58 -0
- holmes/plugins/toolsets/kubernetes_logs.py +6 -6
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
- holmes/plugins/toolsets/newrelic.py +8 -8
- holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
- holmes/plugins/toolsets/prometheus/prometheus.py +172 -39
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +25 -0
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
- holmes/plugins/toolsets/robusta/robusta.py +10 -10
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
- holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/env.py +7 -0
- holmes/utils/holmes_status.py +2 -1
- holmes/utils/sentry_helper.py +41 -0
- holmes/utils/stream.py +9 -0
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/METADATA +10 -14
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/RECORD +81 -71
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/opensearch/opensearch_traces.py

@@ -18,7 +18,7 @@ from holmes.plugins.toolsets.opensearch.opensearch_utils import (
     add_auth_header,
     get_search_url,
 )
-from holmes.core.tools import StructuredToolResult,
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
 from holmes.plugins.toolsets.utils import get_param_or_raise, toolset_name_for_one_liner
 
 TRACES_FIELDS_CACHE_KEY = "cached_traces_fields"
@@ -48,7 +48,7 @@ class GetTracesFields(Tool):
         if cached_response:
             logging.debug("traces fields returned from cache")
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=cached_response,
                 params=params,
             )
@@ -81,7 +81,7 @@ class GetTracesFields(Tool):
         if self._cache:
             self._cache[TRACES_FIELDS_CACHE_KEY] = response
         return StructuredToolResult(
-            status=
+            status=StructuredToolResultStatus.SUCCESS,
             data=response,
             params=params,
         )
@@ -90,21 +90,21 @@ class GetTracesFields(Tool):
                 "Timeout while fetching opensearch traces fields", exc_info=True
             )
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching opensearch traces fields",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch opensearch traces fields", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while opensearch traces fields: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warning("Failed to process opensearch traces fields", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -157,7 +157,7 @@ class TracesSearchQuery(Tool):
 
             logs_response.raise_for_status()
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=json.dumps(logs_response.json()),
                 params=params,
             )
@@ -166,14 +166,14 @@ class TracesSearchQuery(Tool):
                 "Timeout while fetching opensearch traces search", exc_info=True
             )
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Request timed out while fetching opensearch traces search {err_msg}",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch opensearch traces search", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while opensearch traces search {err_msg} : {str(e)}",
                 params=params,
             )
@@ -182,7 +182,7 @@ class TracesSearchQuery(Tool):
                 "Failed to process opensearch traces search ", exc_info=True
             )
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error {err_msg}: {str(e)}",
                 params=params,
             )
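A recurring change across the toolset files in this release: every `StructuredToolResult` now sets `status` explicitly with the `StructuredToolResultStatus` enum imported from `holmes.core.tools` (the registry diff truncates the old right-hand side of each `status=` line, so the previous value is not recoverable here). A minimal sketch of the pattern, using hypothetical stand-ins for the real classes:

```python
# Hypothetical stand-ins mirroring what this diff shows of holmes.core.tools;
# the real classes carry more fields and validation.
from enum import Enum
from dataclasses import dataclass
from typing import Any, Dict, Optional


class StructuredToolResultStatus(str, Enum):
    SUCCESS = "success"
    ERROR = "error"


@dataclass
class StructuredToolResult:
    status: StructuredToolResultStatus  # now always named explicitly
    data: Optional[Any] = None
    error: Optional[str] = None
    params: Optional[Dict[str, Any]] = None


def run_tool(params: Dict[str, Any]) -> StructuredToolResult:
    try:
        data = {"fields": ["traceId", "spanId"]}  # placeholder payload
        return StructuredToolResult(
            status=StructuredToolResultStatus.SUCCESS,
            data=data,
            params=params,
        )
    except Exception as e:
        return StructuredToolResult(
            status=StructuredToolResultStatus.ERROR,
            error=f"Unexpected error: {e}",
            params=params,
        )
```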
holmes/plugins/toolsets/prometheus/prometheus.py

@@ -17,11 +17,12 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
 from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
+from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
 from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
 from holmes.plugins.toolsets.utils import (
     get_param_or_raise,
@@ -55,6 +56,9 @@ class PrometheusConfig(BaseModel):
     rules_cache_duration_seconds: Union[int, None] = 1800  # 30 minutes
     additional_labels: Optional[Dict[str, str]] = None
     prometheus_ssl_enabled: bool = True
+    query_response_size_limit: Optional[int] = (
+        80000  # Limit the max number of characters in a query result to proactively prevent truncation and advise LLM to query less data
+    )
 
     @field_validator("prometheus_url")
     def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
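The new `query_response_size_limit` field defaults to 80,000 characters and gates the summarization path shown further down; a falsy value (`None` or `0`) disables the check. A sketch of overriding it, assuming `PrometheusConfig` is importable from the module path implied by the file list:

```python
# Assumed import path, derived from holmes/plugins/toolsets/prometheus/prometheus.py;
# only fields visible in this diff are set here.
from holmes.plugins.toolsets.prometheus.prometheus import PrometheusConfig

config = PrometheusConfig(
    prometheus_url="http://prometheus.monitoring:9090/",
    query_response_size_limit=40000,  # halve the 80k default
    # query_response_size_limit=None would disable summarization entirely,
    # since the size check requires a truthy limit
)
```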
@@ -284,7 +288,7 @@ def result_has_data(result: Dict) -> bool:
 def adjust_step_for_max_points(
     start_timestamp: str,
     end_timestamp: str,
-    step: float,
+    step: Optional[float] = None,
 ) -> float:
     """
     Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
@@ -293,7 +297,7 @@ def adjust_step_for_max_points(
     Args:
         start_timestamp: RFC3339 formatted start time
         end_timestamp: RFC3339 formatted end time
-        step: The requested step duration in seconds
+        step: The requested step duration in seconds (None for auto-calculation)
 
     Returns:
         Adjusted step value in seconds that ensures points <= max_points
@@ -304,6 +308,14 @@ def adjust_step_for_max_points(
 
     time_range_seconds = (end_dt - start_dt).total_seconds()
 
+    # If no step provided, calculate a reasonable default
+    # Aim for ~60 data points across the time range (1 per minute for hourly, etc)
+    if step is None:
+        step = max(1, time_range_seconds / 60)
+        logging.debug(
+            f"No step provided, defaulting to {step}s for {time_range_seconds}s range"
+        )
+
     current_points = time_range_seconds / step
 
     # If current points exceed max, adjust the step
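`adjust_step_for_max_points` now accepts `step=None` and fills in a default aimed at roughly 60 data points across the range. A quick worked check of the `max(1, time_range_seconds / 60)` rule:

```python
# Worked examples of the new default-step rule from the hunk above.
def default_step(time_range_seconds: float) -> float:
    return max(1, time_range_seconds / 60)

assert default_step(3600) == 60.0     # 1 hour  -> 60s step (~60 points)
assert default_step(86400) == 1440.0  # 1 day   -> 24-minute step
assert default_step(30) == 1          # sub-minute ranges are floored at 1s
```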
@@ -324,6 +336,79 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
     return results
 
 
+def create_data_summary_for_large_result(
+    result_data: Dict, query: str, data_size_chars: int, is_range_query: bool = False
+) -> Dict[str, Any]:
+    """
+    Create a summary for large Prometheus results instead of returning full data.
+
+    Args:
+        result_data: The Prometheus data result
+        query: The original PromQL query
+        data_size_chars: Size of the data in characters
+        is_range_query: Whether this is a range query (vs instant query)
+
+    Returns:
+        Dictionary with summary information and suggestions
+    """
+    if is_range_query:
+        series_list = result_data.get("result", [])
+        num_items = len(series_list)
+
+        # Calculate statistics for range queries
+        total_points = 0
+        for series in series_list[:10]:  # Sample first 10 series
+            points = len(series.get("values", []))
+            total_points += points
+
+        avg_points_per_series = (
+            total_points / min(10, num_items) if num_items > 0 else 0
+        )
+        estimated_total_points = avg_points_per_series * num_items
+
+        # Create a sample of just the metadata (labels) without values
+        sample_metrics = []
+        for series in series_list[:10]:  # Sample first 10 series
+            sample_metrics.append(series.get("metric", {}))
+
+        sample_json = json.dumps(sample_metrics, indent=2)
+        if len(sample_json) > 2000:
+            sample_json = sample_json[:2000] + "\n... (truncated)"
+
+        return {
+            "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} time series with approximately {estimated_total_points:,.0f} total data points.",
+            "series_count": num_items,
+            "estimated_total_points": int(estimated_total_points),
+            "data_size_characters": data_size_chars,
+            "sample_data": sample_json,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
+        }
+    else:
+        # Instant query
+        result_type = result_data.get("resultType", "")
+        result_list = result_data.get("result", [])
+        num_items = len(result_list)
+
+        # Create a sample of just the metadata (labels) without values
+        sample_metrics = []
+        for item in result_list[:10]:  # Sample first 10 results
+            if isinstance(item, dict):
+                sample_metrics.append(item.get("metric", {}))
+
+        sample_json = json.dumps(sample_metrics, indent=2)
+        if len(sample_json) > 2000:
+            sample_json = sample_json[:2000] + "\n... (truncated)"
+
+        return {
+            "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} results.",
+            "result_count": num_items,
+            "result_type": result_type,
+            "data_size_characters": data_size_chars,
+            "sample_data": sample_json,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
+        }
+
+
 def fetch_metrics_labels_with_series_api(
     prometheus_url: str,
     headers: Dict[str, str],
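When a result exceeds the size limit, the LLM receives this summary dict (message, counts, a label-only sample, and a `topk()` suggestion) instead of raw samples. A small driver with invented series data, showing what comes back for a range query; it assumes the holmesgpt package is importable:

```python
# Illustrative driver; the three fake series below are invented for this sketch.
import json
from holmes.plugins.toolsets.prometheus.prometheus import (
    create_data_summary_for_large_result,
)

result_data = {
    "resultType": "matrix",
    "result": [
        {"metric": {"pod": f"web-{i}"}, "values": [[0, "1"], [60, "2"]]}
        for i in range(3)
    ],
}
size = len(json.dumps(result_data))
summary = create_data_summary_for_large_result(
    result_data, "rate(http_requests_total[5m])", size, is_range_query=True
)
# "... Query returned 3 time series with approximately 6 total data points."
print(summary["message"])
# "Consider using topk(3, rate(http_requests_total[5m])) ..."
print(summary["suggestion"])
```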
@@ -496,13 +581,13 @@ class ListPrometheusRules(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
         if self.toolset.config.is_amp():
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool not supported in AMP",
                 params=params,
             )
@@ -515,7 +600,7 @@ class ListPrometheusRules(BasePrometheusTool):
             logging.debug("rules returned from cache")
 
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=cached_rules,
                 params=params,
             )
@@ -539,28 +624,28 @@ class ListPrometheusRules(BasePrometheusTool):
             if self._cache:
                 self._cache.set(PROMETHEUS_RULES_CACHE_KEY, data)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=data,
                 params=params,
             )
         except requests.Timeout:
             logging.warning("Timeout while fetching prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching rules",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching rules: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warning("Failed to process prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -595,7 +680,7 @@ class ListAvailableMetrics(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -612,7 +697,7 @@ class ListAvailableMetrics(BasePrometheusTool):
         name_filter = params.get("name_filter")
         if not name_filter:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Error: cannot run tool 'list_available_metrics'. The param 'name_filter' is required but is missing.",
                 params=params,
             )
@@ -646,7 +731,7 @@ class ListAvailableMetrics(BasePrometheusTool):
 
             table_output = "\n".join(output)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=table_output,
                 params=params,
             )
@@ -654,21 +739,21 @@ class ListAvailableMetrics(BasePrometheusTool):
         except requests.Timeout:
             logging.warn("Timeout while fetching prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching metrics",
                 params=params,
             )
         except RequestException as e:
             logging.warn("Failed to fetch prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching metrics: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warn("Failed to process prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -703,7 +788,7 @@ class ExecuteInstantQuery(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -743,12 +828,39 @@ class ExecuteInstantQuery(BasePrometheusTool):
                     "query": query,
                 }
 
+                # Check if data should be included based on size
                 if self.toolset.config.tool_calls_return_data:
-
+                    result_data = data.get("data", {})
+
+                    # Estimate the size of the data
+                    data_str_preview = json.dumps(result_data)
+                    data_size_chars = len(data_str_preview)
+
+                    # Provide summary if data is too large
+                    if (
+                        self.toolset.config.query_response_size_limit
+                        and data_size_chars
+                        > self.toolset.config.query_response_size_limit
+                    ):
+                        response_data["data_summary"] = (
+                            create_data_summary_for_large_result(
+                                result_data,
+                                query,
+                                data_size_chars,
+                                is_range_query=False,
+                            )
+                        )
+                        logging.info(
+                            f"Prometheus instant query returned large dataset: "
+                            f"{response_data['data_summary'].get('result_count', 0)} results, "
+                            f"{data_size_chars:,} characters. Returning summary instead of full data."
+                        )
+                    else:
+                        response_data["data"] = result_data
 
                 data_str = json.dumps(response_data, indent=2)
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=data_str,
                     params=params,
                 )
@@ -764,14 +876,14 @@ class ExecuteInstantQuery(BasePrometheusTool):
                 except json.JSONDecodeError:
                     pass
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.ERROR,
                     error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                     params=params,
                 )
 
             # For other status codes, just return the status code and content
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                 params=params,
             )
@@ -779,14 +891,14 @@ class ExecuteInstantQuery(BasePrometheusTool):
        except RequestException as e:
            logging.info("Failed to connect to Prometheus", exc_info=True)
            return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                error=f"Connection error to Prometheus: {str(e)}",
                params=params,
            )
        except Exception as e:
            logging.info("Failed to connect to Prometheus", exc_info=True)
            return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                error=f"Unexpected error executing query: {str(e)}",
                params=params,
            )
@@ -827,7 +939,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
             "step": ToolParameter(
                 description="Query resolution step width in duration format or float number of seconds",
                 type="number",
-                required=
+                required=False,
             ),
             "output_type": ToolParameter(
                 description="Specifies how to interpret the Prometheus result. Use 'Plain' for raw values, 'Bytes' to format byte values, 'Percentage' to scale 0–1 values into 0–100%, or 'CPUUsage' to convert values to cores (e.g., 500 becomes 500m, 2000 becomes 2).",
@@ -843,7 +955,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -857,12 +969,13 @@ class ExecuteRangeQuery(BasePrometheusTool):
            end_timestamp=params.get("end"),
            default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
        )
-        step = params.get("step"
+        step = parse_duration_to_seconds(params.get("step"))
 
+        # adjust_step_for_max_points handles None case and converts to float
        step = adjust_step_for_max_points(
            start_timestamp=start,
            end_timestamp=end,
-            step=
+            step=step,
        )
 
        description = params.get("description", "")
@@ -906,12 +1019,37 @@ class ExecuteRangeQuery(BasePrometheusTool):
                    "output_type": output_type,
                }
 
+                # Check if data should be included based on size
                if self.toolset.config.tool_calls_return_data:
-
+                    result_data = data.get("data", {})
+
+                    # Estimate the size of the data
+                    data_str_preview = json.dumps(result_data)
+                    data_size_chars = len(data_str_preview)
+
+                    # Provide summary if data is too large
+                    if (
+                        self.toolset.config.query_response_size_limit
+                        and data_size_chars
+                        > self.toolset.config.query_response_size_limit
+                    ):
+                        response_data["data_summary"] = (
+                            create_data_summary_for_large_result(
+                                result_data, query, data_size_chars, is_range_query=True
+                            )
+                        )
+                        logging.info(
+                            f"Prometheus range query returned large dataset: "
+                            f"{response_data['data_summary'].get('series_count', 0)} series, "
+                            f"{data_size_chars:,} characters. Returning summary instead of full data."
+                        )
+                    else:
+                        response_data["data"] = result_data
 
                data_str = json.dumps(response_data, indent=2)
 
                return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                    data=data_str,
                    params=params,
                )
@@ -926,13 +1064,13 @@ class ExecuteRangeQuery(BasePrometheusTool):
                except json.JSONDecodeError:
                    pass
                return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.ERROR,
                    error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                    params=params,
                )
 
            return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                params=params,
            )
@@ -940,14 +1078,14 @@ class ExecuteRangeQuery(BasePrometheusTool):
        except RequestException as e:
            logging.info("Failed to connect to Prometheus", exc_info=True)
            return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                error=f"Connection error to Prometheus: {str(e)}",
                params=params,
            )
        except Exception as e:
            logging.info("Failed to connect to Prometheus", exc_info=True)
            return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                error=f"Unexpected error executing query: {str(e)}",
                params=params,
            )
@@ -1060,13 +1198,8 @@ class PrometheusToolset(Toolset):
                 f"Failed to connect to Prometheus at {url}: HTTP {response.status_code}",
             )
 
-        except RequestException:
-            return (
-                False,
-                f"Failed to initialize using url={url}",
-            )
         except Exception as e:
-            logging.exception("Failed to initialize Prometheus")
+            logging.exception("Failed to initialize Prometheus", exc_info=True)
             return (
                 False,
                 f"Failed to initialize using url={url}. Unexpected error: {str(e)}",
holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2

@@ -19,6 +19,31 @@
 * Only generate and execute a prometheus query after checking what metrics are available with the `list_available_metrics` tool
 * Check that any node, service, pod, container, app, namespace, etc. mentioned in the query exist in the kubernetes cluster before making a query. Use any appropriate kubectl tool(s) for this
 * The toolcall will return no data to you. That is expected. You MUST however ensure that the query is successful.
+
+## Handling High-Cardinality Metrics
+* CRITICAL: When querying metrics that may return many time series (>10), ALWAYS use aggregation to limit results
+* ALWAYS use `topk()` or `bottomk()` to limit the number of series returned
+* Standard pattern for high-cardinality queries:
+  - Use `topk(5, <your_query>)` to get the top 5 series
+  - Example: `topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))`
+  - This prevents context overflow and focuses on the most relevant data
+* To also capture the aggregate of remaining series as "other":
+  ```
+  topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m]))
+  or
+  label_replace(
+    (sum(rate(container_cpu_usage_seconds_total{namespace="default"}[5m])) - sum(topk(5, rate(container_cpu_usage_seconds_total{namespace="default"}[5m])))),
+    "pod", "other", "", ""
+  )
+  ```
+* Common high-cardinality scenarios requiring topk():
+  - Pod-level metrics in namespaces with many pods
+  - Container-level CPU/memory metrics
+  - HTTP metrics with many endpoints or status codes
+  - Any query returning more than 10 time series
+* For initial exploration, use instant queries with `count()` to check cardinality:
+  - Example: `count(count by (pod) (container_cpu_usage_seconds_total{namespace="default"}))`
+  - If count > 10, use topk() in your range query
 * When doing queries, always extend the time range, to 15 min before and after the alert start time
 * ALWAYS embed the execution results into your answer
 * ALWAYS embed a Prometheus graph in the response. The graph should visualize data related to the incident.
holmes/plugins/toolsets/prometheus/utils.py (new file)

@@ -0,0 +1,28 @@
+import re
+from typing import Optional, Union
+
+
+def parse_duration_to_seconds(v: Optional[Union[str, float, int]]) -> Optional[float]:
+    if v is None:
+        return None
+    if isinstance(v, (int, float)):
+        return float(v)
+    s = v.strip().lower()
+    if s.isdigit():
+        return float(int(s))
+
+    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
+
+    # Check for partial time formats (e.g., 1h30m, 5m12s, 1d2h30m)
+    pattern = r"(\d+(?:\.\d+)?)(d|h|m|s)"
+    matches = re.findall(pattern, s)
+
+    if matches:
+        total_seconds = 0.0
+        for value_str, unit in matches:
+            value = float(value_str)
+            total_seconds += value * units[unit]
+        return float(int(total_seconds))
+
+    # fallback: try float seconds
+    return float(s)
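The parser accepts `None`, plain numbers, bare numeric strings, and Prometheus-style duration strings; combined units are summed and truncated to whole seconds, and anything else falls through to `float()`. Expected behavior, assuming the package is importable:

```python
# Behavior of parse_duration_to_seconds as defined in the new file above.
from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds

assert parse_duration_to_seconds(None) is None       # no step requested
assert parse_duration_to_seconds(15) == 15.0         # numbers pass through
assert parse_duration_to_seconds("30") == 30.0       # bare numeric string
assert parse_duration_to_seconds("5m") == 300.0      # single unit
assert parse_duration_to_seconds("1h30m") == 5400.0  # units are summed
assert parse_duration_to_seconds("0.5") == 0.5       # fallback: float seconds
```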
holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py

@@ -8,7 +8,7 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
@@ -79,7 +79,7 @@ class ListConfiguredClusters(BaseRabbitMQTool):
             if c.connection_status == ClusterConnectionStatus.SUCCESS
         ]
         return StructuredToolResult(
-            status=
+            status=StructuredToolResultStatus.SUCCESS, data=available_clusters
         )
 
     def get_parameterized_one_liner(self, params) -> str:
@@ -112,12 +112,14 @@ class GetRabbitMQClusterStatus(BaseRabbitMQTool):
                 cluster_id=params.get("cluster_id")
             )
             result = get_cluster_status(cluster_config)
-            return StructuredToolResult(
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS, data=result
+            )
 
         except Exception as e:
             logging.info("Failed to process RabbitMQ cluster status", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error fetching RabbitMQ cluster status: {str(e)}",
                 data=None,
             )