holmesgpt 0.12.5__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic; see the registry's advisory page for more details.

Files changed (84)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +19 -1
  3. holmes/common/env_vars.py +13 -0
  4. holmes/config.py +69 -9
  5. holmes/core/conversations.py +11 -0
  6. holmes/core/investigation.py +16 -3
  7. holmes/core/investigation_structured_output.py +12 -0
  8. holmes/core/llm.py +10 -0
  9. holmes/core/models.py +9 -1
  10. holmes/core/openai_formatting.py +72 -12
  11. holmes/core/prompt.py +13 -0
  12. holmes/core/supabase_dal.py +3 -0
  13. holmes/core/todo_manager.py +88 -0
  14. holmes/core/tool_calling_llm.py +121 -149
  15. holmes/core/tools.py +10 -1
  16. holmes/core/tools_utils/tool_executor.py +7 -2
  17. holmes/core/tools_utils/toolset_utils.py +7 -2
  18. holmes/core/tracing.py +3 -2
  19. holmes/interactive.py +1 -0
  20. holmes/main.py +2 -1
  21. holmes/plugins/prompts/__init__.py +7 -1
  22. holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
  23. holmes/plugins/prompts/_default_log_prompt.jinja2 +4 -2
  24. holmes/plugins/prompts/_fetch_logs.jinja2 +6 -1
  25. holmes/plugins/prompts/_general_instructions.jinja2 +14 -0
  26. holmes/plugins/prompts/_permission_errors.jinja2 +1 -1
  27. holmes/plugins/prompts/_toolsets_instructions.jinja2 +4 -4
  28. holmes/plugins/prompts/generic_ask.jinja2 +4 -3
  29. holmes/plugins/prompts/investigation_procedure.jinja2 +210 -0
  30. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -0
  31. holmes/plugins/toolsets/__init__.py +19 -6
  32. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +27 -0
  33. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +2 -2
  34. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +2 -1
  35. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -1
  36. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +2 -1
  37. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +2 -1
  38. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +3 -1
  39. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +2 -1
  40. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +2 -1
  41. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +2 -1
  42. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +2 -1
  43. holmes/plugins/toolsets/coralogix/api.py +6 -6
  44. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +7 -1
  45. holmes/plugins/toolsets/datadog/datadog_api.py +20 -8
  46. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +8 -1
  47. holmes/plugins/toolsets/datadog/datadog_rds_instructions.jinja2 +82 -0
  48. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +12 -5
  49. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +20 -11
  50. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +735 -0
  51. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +18 -11
  52. holmes/plugins/toolsets/git.py +15 -15
  53. holmes/plugins/toolsets/grafana/grafana_api.py +12 -1
  54. holmes/plugins/toolsets/grafana/toolset_grafana.py +5 -1
  55. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +9 -4
  56. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +12 -5
  57. holmes/plugins/toolsets/internet/internet.py +2 -1
  58. holmes/plugins/toolsets/internet/notion.py +2 -1
  59. holmes/plugins/toolsets/investigator/__init__.py +0 -0
  60. holmes/plugins/toolsets/investigator/core_investigation.py +157 -0
  61. holmes/plugins/toolsets/investigator/investigator_instructions.jinja2 +253 -0
  62. holmes/plugins/toolsets/investigator/model.py +15 -0
  63. holmes/plugins/toolsets/kafka.py +14 -7
  64. holmes/plugins/toolsets/kubernetes_logs.py +454 -25
  65. holmes/plugins/toolsets/logging_utils/logging_api.py +115 -55
  66. holmes/plugins/toolsets/mcp/toolset_mcp.py +1 -1
  67. holmes/plugins/toolsets/newrelic.py +8 -3
  68. holmes/plugins/toolsets/opensearch/opensearch.py +8 -4
  69. holmes/plugins/toolsets/opensearch/opensearch_logs.py +9 -2
  70. holmes/plugins/toolsets/opensearch/opensearch_traces.py +6 -2
  71. holmes/plugins/toolsets/prometheus/prometheus.py +149 -44
  72. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +8 -2
  73. holmes/plugins/toolsets/robusta/robusta.py +4 -4
  74. holmes/plugins/toolsets/runbook/runbook_fetcher.py +6 -5
  75. holmes/plugins/toolsets/servicenow/servicenow.py +18 -3
  76. holmes/plugins/toolsets/utils.py +8 -1
  77. holmes/utils/llms.py +20 -0
  78. holmes/utils/stream.py +90 -0
  79. {holmesgpt-0.12.5.dist-info → holmesgpt-0.13.0.dist-info}/METADATA +48 -35
  80. {holmesgpt-0.12.5.dist-info → holmesgpt-0.13.0.dist-info}/RECORD +83 -74
  81. holmes/utils/robusta.py +0 -9
  82. {holmesgpt-0.12.5.dist-info → holmesgpt-0.13.0.dist-info}/LICENSE.txt +0 -0
  83. {holmesgpt-0.12.5.dist-info → holmesgpt-0.13.0.dist-info}/WHEEL +0 -0
  84. {holmesgpt-0.12.5.dist-info → holmesgpt-0.13.0.dist-info}/entry_points.txt +0 -0
@@ -9,6 +9,7 @@ from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
9
9
  )
10
10
  from holmes.plugins.toolsets.azure_sql.apis.azure_sql_api import AzureSQLAPIClient
11
11
  from holmes.plugins.toolsets.azure_sql.utils import format_timing
12
+ from holmes.plugins.toolsets.utils import toolset_name_for_one_liner
12
13
 
13
14
 
14
15
  class GetTopLogIOQueries(BaseAzureSQLTool):
@@ -145,7 +146,7 @@ class GetTopLogIOQueries(BaseAzureSQLTool):
145
146
 
146
147
  def get_parameterized_one_liner(self, params: Dict) -> str:
147
148
  db_config = self.toolset.database_config()
148
- return f"Fetch top log I/O consuming queries for database {db_config.server_name}/{db_config.database_name}"
149
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Top Log I/O Queries ({db_config.server_name}/{db_config.database_name})"
149
150
 
150
151
  @staticmethod
151
152
  def validate_config(
@@ -12,16 +12,16 @@ from holmes.plugins.toolsets.coralogix.utils import (
12
12
  parse_logs,
13
13
  CoralogixLogsMethodology,
14
14
  )
15
- from holmes.plugins.toolsets.logging_utils.logging_api import FetchPodLogsParams
15
+ from holmes.plugins.toolsets.logging_utils.logging_api import (
16
+ FetchPodLogsParams,
17
+ DEFAULT_TIME_SPAN_SECONDS,
18
+ DEFAULT_LOG_LIMIT,
19
+ )
16
20
  from holmes.plugins.toolsets.utils import (
17
21
  process_timestamps_to_rfc3339,
18
22
  )
19
23
 
20
24
 
21
- DEFAULT_TIME_SPAN_SECONDS = 86400
22
- DEFAULT_LOG_COUNT = 2000 # Coralogix's default is 2000
23
-
24
-
25
25
  class CoralogixTier(str, Enum):
26
26
  FREQUENT_SEARCH = "TIER_FREQUENT_SEARCH"
27
27
  ARCHIVE = "TIER_ARCHIVE"
@@ -62,7 +62,7 @@ def build_query_string(config: CoralogixConfig, params: FetchPodLogsParams) -> s
62
62
  query_filters.append(f'{config.labels.log_message}:"{params.filter}"')
63
63
 
64
64
  query_string = " AND ".join(query_filters)
65
- query_string = f"source logs | lucene '{query_string}' | limit {params.limit or DEFAULT_LOG_COUNT}"
65
+ query_string = f"source logs | lucene '{query_string}' | limit {params.limit or DEFAULT_LOG_LIMIT}"
66
66
  return query_string
67
67
 
68
68
 
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional, Tuple
1
+ from typing import Any, Optional, Tuple, Set
2
2
 
3
3
  from holmes.core.tools import (
4
4
  CallablePrerequisite,
@@ -23,11 +23,17 @@ from holmes.plugins.toolsets.coralogix.utils import (
23
23
  from holmes.plugins.toolsets.logging_utils.logging_api import (
24
24
  BasePodLoggingToolset,
25
25
  FetchPodLogsParams,
26
+ LoggingCapability,
26
27
  PodLoggingTool,
27
28
  )
28
29
 
29
30
 
30
31
  class CoralogixLogsToolset(BasePodLoggingToolset):
32
+ @property
33
+ def supported_capabilities(self) -> Set[LoggingCapability]:
34
+ """Coralogix only supports substring matching, not regex or exclude filters"""
35
+ return set() # No regex support, no exclude filter
36
+
31
37
  def __init__(self):
32
38
  super().__init__(
33
39
  name="coralogix/logs",
@@ -126,6 +126,25 @@ class wait_for_retry_after_header(wait_base):
126
126
  ),
127
127
  reraise=True,
128
128
  )
129
+ def execute_paginated_datadog_http_request(
130
+ url: str,
131
+ headers: dict,
132
+ payload_or_params: dict,
133
+ timeout: int,
134
+ method: str = "POST",
135
+ ) -> tuple[Any, Optional[str]]:
136
+ response_data = execute_datadog_http_request(
137
+ url=url,
138
+ headers=headers,
139
+ payload_or_params=payload_or_params,
140
+ timeout=timeout,
141
+ method=method,
142
+ )
143
+ cursor = extract_cursor(response_data)
144
+ data = response_data.get("data", [])
145
+ return data, cursor
146
+
147
+
129
148
  def execute_datadog_http_request(
130
149
  url: str,
131
150
  headers: dict,
@@ -143,14 +162,7 @@ def execute_datadog_http_request(
143
162
  )
144
163
 
145
164
  if response.status_code == 200:
146
- response_data = response.json()
147
-
148
- if method == "POST" and response_data and "data" in response_data:
149
- cursor = extract_cursor(response_data)
150
- data = response_data.get("data", [])
151
- return data, cursor
152
- else:
153
- return response_data
165
+ return response.json()
154
166
 
155
167
  else:
156
168
  raise DataDogRequestError(
@@ -1,5 +1,13 @@
1
1
  ## Datadog Metrics Tools Usage Guide
2
2
 
3
+ Before running metrics queries:
4
+
5
+ ** You are often (but not always) running in a kubernetes environment. So users might ask you questions about kubernetes workloads without explicitly stating their type.
6
+ ** When getting ambiguous questions, use kubectl_find_resource to find the resource you are being asked about!
7
+ ** Find the involved resource name and kind
8
+ ** If you can't figure out what is the type of the resource, ask the user for more information and don't guess
9
+
10
+
3
11
  When investigating metrics-related issues:
4
12
 
5
13
  1. **Start with `list_active_datadog_metrics`** to discover available metrics
@@ -8,7 +16,6 @@ When investigating metrics-related issues:
8
16
 
9
17
  2. **Use `query_datadog_metrics`** to fetch actual metric data
10
18
  - Query syntax: `metric_name{tag:value}`
11
- - Example: `system.cpu.user{host:myhost}`
12
19
  - Returns timeseries data with timestamps and values
13
20
 
14
21
  3. **Use `get_datadog_metric_metadata`** to understand metric properties
@@ -0,0 +1,82 @@
1
+ ## Datadog RDS Performance Analysis Instructions
2
+
3
+ You have access to tools for analyzing RDS database performance and identifying problematic instances using Datadog metrics.
4
+
5
+ ### Available Tools:
6
+
7
+ 1. **datadog_rds_performance_report** - Generate comprehensive performance report for a specific RDS instance
8
+ - Analyzes latency, resource utilization, and storage metrics
9
+ - Identifies performance issues and bottlenecks
10
+ - Provides actionable recommendations
11
+ - Returns formatted report with executive summary
12
+
13
+ 2. **datadog_rds_top_worst_performing** - Get summary of worst performing RDS instances
14
+ - Analyzes all RDS instances in the environment
15
+ - Ranks by latency, CPU, or composite performance score
16
+ - Shows top N worst performers with their key metrics
17
+ - Helps prioritize optimization efforts
18
+
19
+ ### Usage Guidelines:
20
+
21
+ **For investigating a specific RDS instance:**
22
+ ```
23
+ Use datadog_rds_performance_report with:
24
+ - db_instance_identifier: "instance-name"
25
+ - start_time: "-3600" (last hour)
26
+ ```
27
+
28
+ **For finding problematic instances across the fleet:**
29
+ ```
30
+ Use datadog_rds_top_worst_performing with:
31
+ - top_n: 10 (show top 10 worst)
32
+ - sort_by: "latency" (or "cpu", "composite")
33
+ - start_time: "-3600"
34
+ ```
35
+
36
+ ### Key Performance Thresholds:
37
+
38
+ The tools automatically flag issues based on these thresholds:
39
+ - **Latency**: >10ms average (warning), >50ms peak (critical)
40
+ - **CPU**: >70% average (warning), >90% peak (critical)
41
+ - **Memory**: <100MB freeable memory (warning)
42
+ - **Burst Balance**: <30% (warning, indicates I/O constraints)
43
+ - **Disk Queue Depth**: >5 average (indicates I/O bottleneck)
44
+
45
+ ### Common Scenarios:
46
+
47
+ 1. **Application experiencing slow database queries:**
48
+ - Generate performance report for the specific RDS instance
49
+ - Look for latency spikes and resource constraints
50
+ - Follow recommendations for optimization
51
+
52
+ 2. **Proactive performance monitoring:**
53
+ - Use top worst performing to identify problem instances
54
+ - Generate detailed reports for the worst performers
55
+ - Plan capacity upgrades based on findings
56
+
57
+ 3. **Capacity planning:**
58
+ - Analyze resource utilization trends
59
+ - Identify instances approaching limits
60
+ - Plan upgrades before performance degradation
61
+
62
+ ### Interpreting Results:
63
+
64
+ **Performance Report Sections:**
65
+ - **Executive Summary**: High-level assessment and severity
66
+ - **Metrics Tables**: Statistical analysis of each metric
67
+ - **Issues**: Specific problems detected with thresholds exceeded
68
+ - **Recommendations**: Prioritized actions to resolve issues
69
+
70
+ **Top Worst Performing Report:**
71
+ - **Rankings**: Instances sorted by selected metric
72
+ - **Key Metrics**: Latency, CPU, burst balance for each instance
73
+ - **Summary**: Overall patterns across the fleet
74
+
75
+ ### Example Responses:
76
+
77
+ When asked about database performance issues:
78
+ 1. First use `datadog_rds_top_worst_performing` to identify problem instances
79
+ 2. Then use `datadog_rds_performance_report` on the worst performers
80
+ 3. Summarize findings and provide prioritized recommendations
81
+
82
+ Always consider the time range - recent data (last hour) for current issues, longer ranges (last 24 hours) for trends.
@@ -1,7 +1,7 @@
1
1
  from enum import Enum
2
2
  import json
3
3
  import logging
4
- from typing import Any, Optional, Dict, Tuple
4
+ from typing import Any, Optional, Dict, Tuple, Set
5
5
  from holmes.core.tools import (
6
6
  CallablePrerequisite,
7
7
  ToolsetTag,
@@ -12,14 +12,16 @@ from holmes.plugins.toolsets.consts import TOOLSET_CONFIG_MISSING_ERROR
12
12
  from holmes.plugins.toolsets.datadog.datadog_api import (
13
13
  DatadogBaseConfig,
14
14
  DataDogRequestError,
15
- execute_datadog_http_request,
15
+ execute_paginated_datadog_http_request,
16
16
  get_headers,
17
17
  MAX_RETRY_COUNT_ON_RATE_LIMIT,
18
18
  )
19
19
  from holmes.plugins.toolsets.logging_utils.logging_api import (
20
20
  DEFAULT_TIME_SPAN_SECONDS,
21
+ DEFAULT_LOG_LIMIT,
21
22
  BasePodLoggingToolset,
22
23
  FetchPodLogsParams,
24
+ LoggingCapability,
23
25
  PodLoggingTool,
24
26
  )
25
27
  from holmes.plugins.toolsets.utils import process_timestamps_to_rfc3339
@@ -47,7 +49,7 @@ class DatadogLogsConfig(DatadogBaseConfig):
47
49
  )
48
50
  labels: DataDogLabelsMapping = DataDogLabelsMapping()
49
51
  page_size: int = 300
50
- default_limit: int = 1000
52
+ default_limit: int = DEFAULT_LOG_LIMIT
51
53
 
52
54
 
53
55
  def calculate_page_size(
@@ -96,7 +98,7 @@ def fetch_paginated_logs(
96
98
  "page": {"limit": calculate_page_size(params, dd_config, [])},
97
99
  }
98
100
 
99
- logs, cursor = execute_datadog_http_request(
101
+ logs, cursor = execute_paginated_datadog_http_request(
100
102
  url=url,
101
103
  headers=headers,
102
104
  payload_or_params=payload,
@@ -105,7 +107,7 @@ def fetch_paginated_logs(
105
107
 
106
108
  while cursor and len(logs) < limit:
107
109
  payload["page"]["cursor"] = cursor
108
- new_logs, cursor = execute_datadog_http_request(
110
+ new_logs, cursor = execute_paginated_datadog_http_request(
109
111
  url=url,
110
112
  headers=headers,
111
113
  payload_or_params=payload,
@@ -137,6 +139,11 @@ def format_logs(raw_logs: list[dict]) -> str:
137
139
  class DatadogLogsToolset(BasePodLoggingToolset):
138
140
  dd_config: Optional[DatadogLogsConfig] = None
139
141
 
142
+ @property
143
+ def supported_capabilities(self) -> Set[LoggingCapability]:
144
+ """Datadog logs API only supports substring matching, no exclude filter"""
145
+ return set() # No regex support, no exclude filter
146
+
140
147
  def __init__(self):
141
148
  super().__init__(
142
149
  name="datadog/logs",
@@ -26,16 +26,20 @@ from holmes.plugins.toolsets.utils import (
26
26
  get_param_or_raise,
27
27
  process_timestamps_to_int,
28
28
  standard_start_datetime_tool_param_description,
29
+ toolset_name_for_one_liner,
29
30
  )
31
+ from holmes.plugins.toolsets.logging_utils.logging_api import (
32
+ DEFAULT_TIME_SPAN_SECONDS,
33
+ DEFAULT_LOG_LIMIT,
34
+ )
35
+
30
36
  from datetime import datetime
31
37
 
32
38
  from holmes.utils.keygen_utils import generate_random_key
33
39
 
34
- DEFAULT_TIME_SPAN_SECONDS = 3600
35
-
36
40
 
37
41
  class DatadogMetricsConfig(DatadogBaseConfig):
38
- default_limit: int = 1000
42
+ default_limit: int = DEFAULT_LOG_LIMIT
39
43
 
40
44
 
41
45
  class BaseDatadogMetricsTool(Tool):
@@ -63,7 +67,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
63
67
  required=False,
64
68
  ),
65
69
  "tag_filter": ToolParameter(
66
- description="Filter metrics by tags in the format tag:value. pod tag is pod_name. namespace tag is kube_namespace.",
70
+ description="Filter metrics by tags in the format tag:value.",
67
71
  type="string",
68
72
  required=False,
69
73
  ),
@@ -113,6 +117,12 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
113
117
  )
114
118
 
115
119
  metrics = data.get("metrics", [])
120
+ if not metrics:
121
+ return StructuredToolResult(
122
+ status=ToolResultStatus.ERROR,
123
+ data="Your filter returned no metrics. Change your filter and try again",
124
+ params=params,
125
+ )
116
126
 
117
127
  output = ["Metric Name"]
118
128
  output.append("-" * 50)
@@ -164,8 +174,8 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
164
174
  filters.append(f"host={params['host']}")
165
175
  if params.get("tag_filter"):
166
176
  filters.append(f"tag_filter={params['tag_filter']}")
167
- filter_str = f" with filters: {', '.join(filters)}" if filters else ""
168
- return f"List active Datadog metrics{filter_str}"
177
+ filter_str = f"{', '.join(filters)}" if filters else "all"
178
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: List Active Metrics ({filter_str})"
169
179
 
170
180
 
171
181
  class QueryMetrics(BaseDatadogMetricsTool):
@@ -342,9 +352,8 @@ class QueryMetrics(BaseDatadogMetricsTool):
342
352
  )
343
353
 
344
354
  def get_parameterized_one_liner(self, params) -> str:
345
- query = params.get("query", "<no query>")
346
355
  description = params.get("description", "")
347
- return f"Query Datadog metrics: query='{query}', description='{description}'"
356
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Query Metrics ({description})"
348
357
 
349
358
 
350
359
  class QueryMetricsMetadata(BaseDatadogMetricsTool):
@@ -455,10 +464,10 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
455
464
  metric_names = params.get("metric_names", [])
456
465
  if isinstance(metric_names, list):
457
466
  if len(metric_names) == 1:
458
- return f"Get Datadog metric metadata for: {metric_names[0]}"
467
+ return f"Get Metric Metadata ({metric_names[0]})"
459
468
  elif len(metric_names) > 1:
460
- return f"Get Datadog metric metadata for {len(metric_names)} metrics"
461
- return "Get Datadog metric metadata"
469
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Datadog metric metadata for {len(metric_names)} metrics"
470
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Datadog metric metadata"
462
471
 
463
472
 
464
473
  class ListMetricTags(BaseDatadogMetricsTool):