holmesgpt 0.16.2a0__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +4 -3
  3. holmes/common/env_vars.py +18 -2
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +11 -6
  6. holmes/core/conversations.py +30 -13
  7. holmes/core/investigation.py +21 -25
  8. holmes/core/investigation_structured_output.py +3 -3
  9. holmes/core/issue.py +1 -1
  10. holmes/core/llm.py +50 -31
  11. holmes/core/models.py +19 -17
  12. holmes/core/openai_formatting.py +1 -1
  13. holmes/core/prompt.py +47 -2
  14. holmes/core/runbooks.py +1 -0
  15. holmes/core/safeguards.py +4 -2
  16. holmes/core/supabase_dal.py +4 -2
  17. holmes/core/tool_calling_llm.py +102 -141
  18. holmes/core/tools.py +19 -28
  19. holmes/core/tools_utils/token_counting.py +9 -2
  20. holmes/core/tools_utils/tool_context_window_limiter.py +13 -30
  21. holmes/core/tools_utils/tool_executor.py +0 -18
  22. holmes/core/tools_utils/toolset_utils.py +1 -0
  23. holmes/core/toolset_manager.py +37 -2
  24. holmes/core/tracing.py +13 -2
  25. holmes/core/transformers/__init__.py +1 -1
  26. holmes/core/transformers/base.py +1 -0
  27. holmes/core/transformers/llm_summarize.py +3 -2
  28. holmes/core/transformers/registry.py +2 -1
  29. holmes/core/transformers/transformer.py +1 -0
  30. holmes/core/truncation/compaction.py +37 -2
  31. holmes/core/truncation/input_context_window_limiter.py +3 -2
  32. holmes/interactive.py +52 -8
  33. holmes/main.py +17 -37
  34. holmes/plugins/interfaces.py +2 -1
  35. holmes/plugins/prompts/__init__.py +2 -1
  36. holmes/plugins/prompts/_fetch_logs.jinja2 +5 -5
  37. holmes/plugins/prompts/_runbook_instructions.jinja2 +2 -1
  38. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  39. holmes/plugins/prompts/conversation_history_compaction.jinja2 +2 -1
  40. holmes/plugins/prompts/generic_ask.jinja2 +0 -2
  41. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -2
  42. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -2
  43. holmes/plugins/prompts/generic_investigation.jinja2 +0 -2
  44. holmes/plugins/prompts/investigation_procedure.jinja2 +2 -1
  45. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -2
  46. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -2
  47. holmes/plugins/runbooks/__init__.py +32 -3
  48. holmes/plugins/sources/github/__init__.py +4 -2
  49. holmes/plugins/sources/prometheus/models.py +1 -0
  50. holmes/plugins/toolsets/__init__.py +30 -26
  51. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +13 -12
  52. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  53. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  54. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  55. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  56. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  57. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -12
  58. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +7 -7
  59. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -7
  60. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -5
  61. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  62. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -7
  63. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +6 -8
  64. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -3
  65. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -3
  66. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -3
  67. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -3
  68. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  69. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  70. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  71. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  72. holmes/plugins/toolsets/bash/bash_toolset.py +2 -3
  73. holmes/plugins/toolsets/bash/common/bash.py +19 -9
  74. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  75. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  76. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  77. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  78. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  79. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  80. holmes/plugins/toolsets/connectivity_check.py +124 -0
  81. holmes/plugins/toolsets/coralogix/api.py +132 -119
  82. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  83. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  84. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  85. holmes/plugins/toolsets/datadog/datadog_api.py +36 -3
  86. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +34 -1
  87. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  88. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  89. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  90. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  91. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +71 -28
  92. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +224 -375
  93. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +67 -36
  94. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +360 -343
  95. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  96. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  97. holmes/plugins/toolsets/git.py +7 -8
  98. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  99. holmes/plugins/toolsets/grafana/common.py +2 -30
  100. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +2 -1
  101. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +18 -2
  102. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +92 -18
  103. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  104. holmes/plugins/toolsets/grafana/toolset_grafana.py +109 -25
  105. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +22 -0
  106. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +201 -33
  107. holmes/plugins/toolsets/grafana/trace_parser.py +3 -2
  108. holmes/plugins/toolsets/internet/internet.py +10 -10
  109. holmes/plugins/toolsets/internet/notion.py +5 -6
  110. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  111. holmes/plugins/toolsets/investigator/model.py +3 -1
  112. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  113. holmes/plugins/toolsets/kafka.py +12 -7
  114. holmes/plugins/toolsets/kubernetes.yaml +260 -30
  115. holmes/plugins/toolsets/kubernetes_logs.py +3 -3
  116. holmes/plugins/toolsets/logging_utils/logging_api.py +16 -6
  117. holmes/plugins/toolsets/mcp/toolset_mcp.py +88 -60
  118. holmes/plugins/toolsets/newrelic/new_relic_api.py +41 -1
  119. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +24 -0
  120. holmes/plugins/toolsets/newrelic/newrelic.py +212 -55
  121. holmes/plugins/toolsets/prometheus/prometheus.py +358 -102
  122. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +11 -3
  123. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  124. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +5 -5
  125. holmes/plugins/toolsets/robusta/robusta.py +5 -5
  126. holmes/plugins/toolsets/runbook/runbook_fetcher.py +25 -6
  127. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +1 -1
  128. holmes/plugins/toolsets/utils.py +1 -1
  129. holmes/utils/config_utils.py +1 -1
  130. holmes/utils/connection_utils.py +31 -0
  131. holmes/utils/console/result.py +10 -0
  132. holmes/utils/file_utils.py +2 -1
  133. holmes/utils/global_instructions.py +10 -26
  134. holmes/utils/holmes_status.py +4 -3
  135. holmes/utils/log.py +15 -0
  136. holmes/utils/markdown_utils.py +2 -3
  137. holmes/utils/memory_limit.py +58 -0
  138. holmes/utils/sentry_helper.py +23 -0
  139. holmes/utils/stream.py +12 -5
  140. holmes/utils/tags.py +4 -3
  141. holmes/version.py +3 -1
  142. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +12 -10
  143. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  144. holmes/plugins/toolsets/aws.yaml +0 -80
  145. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -114
  146. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  147. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -736
  148. holmes/plugins/toolsets/grafana/grafana_api.py +0 -64
  149. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  150. holmes/plugins/toolsets/opensearch/opensearch.py +0 -250
  151. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  152. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -215
  153. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  154. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  155. holmes/utils/keygen_utils.py +0 -6
  156. holmesgpt-0.16.2a0.dist-info/RECORD +0 -258
  157. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_ppl_query_docs.jinja2 +0 -0
  158. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist.py +2 -2
  159. /holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist_instructions.jinja2 +0 -0
  160. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/LICENSE +0 -0
  161. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  162. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 CHANGED
@@ -1,6 +1,14 @@
 
  # Prometheus/PromQL queries
 
+ {%- if config and config.prometheus_url and "coralogix" in config.prometheus_url %}
+ You are using Coralogix Prometheus.
+ * Metrics/labels may differ; discover names with `get_metric_names` first.
+ * For high-cardinality, wrap with `topk(5, <query>)`.
+ * Always include explicit time ranges for range queries.
+ * Example: `container_cpu_utilization{namespace="test-173"}` can fail if the label is named differently (e.g., `k8s_namespace_name`) or the metric is named differently. Do not assume names—only use labels you have seen returned or been told exist.
+ {%- endif %}
+
  ## Efficient Metric Discovery (when needed)
  * When you need to discover metrics, use `get_metric_names` with filters - it's the fastest method
  * Combine multiple patterns with regex OR (|) to reduce API calls:
@@ -26,7 +34,7 @@
  * Use prometheus to execute promql queries with the tools `execute_prometheus_instant_query` and `execute_prometheus_range_query`
  * To create queries, use 'start_timestamp' and 'end_timestamp' as graphs start and end times
  * ALWAYS embed the execution results into your answer
- * You only need to embed the partial result in your response. Include the "tool_name" and "random_key". For example: << {"type": "promql", "tool_name": "execute_prometheus_range_query", "random_key": "92jf2hf"} >>
+ * You only need to embed the partial result in your response. Include the "tool_name" and "tool_call_id". For example: << {"type": "promql", "tool_name": "execute_prometheus_range_query", "tool_call_id": "92jf2hf"} >>
  * Use these tools to generate charts that users can see. Here are standard metrics but you can use different ones:
  ** For memory consumption: `container_memory_working_set_bytes`
  ** For CPU usage: `container_cpu_usage_seconds_total`
@@ -67,9 +75,9 @@
  * When embedding multiple graphs, always add line spacing between them
  For example:
 
- <<{"type": "promql", "tool_name": "execute_prometheus_range_query", "random_key": "lBaA"}>>
+ <<{"type": "promql", "tool_name": "execute_prometheus_range_query", "tool_call_id": "lBaA"}>>
 
- <<{"type": "promql", "tool_name": "execute_prometheus_range_query", "random_key": "IKtq"}>>
+ <<{"type": "promql", "tool_name": "execute_prometheus_range_query", "tool_call_id": "IKtq"}>>
 
  {%- if config and config.additional_labels and config.additional_labels.keys()|list|length > 0 %}
  * ALWAYS add the following additional labels to ALL PromQL queries:
holmes/plugins/toolsets/rabbitmq/api.py CHANGED
@@ -1,11 +1,11 @@
- from enum import Enum
  import logging
+ from enum import Enum
  from typing import Any, Dict, List, Optional, Set
  from urllib.parse import urljoin, urlparse
 
  import backoff
- from pydantic import BaseModel
  import requests # type: ignore
+ from pydantic import BaseModel, ConfigDict, model_validator
  from requests.auth import HTTPBasicAuth # type: ignore
 
  # --- Enums and Pydantic Models (Mostly Unchanged) ---
@@ -17,12 +17,31 @@ class ClusterConnectionStatus(str, Enum):
 
 
  class RabbitMQClusterConfig(BaseModel):
+ model_config = ConfigDict(extra="allow")
+
  id: str = "rabbitmq" # must be unique
  management_url: str # e.g., http://rabbitmq-service:15672
  username: Optional[str] = None
  password: Optional[str] = None
  request_timeout_seconds: int = 30
- verify_certs: bool = True
+ verify_ssl: bool = True
+
+ @model_validator(mode="after")
+ def handle_deprecated_fields(self):
+ extra = self.model_extra or {}
+ deprecated = []
+
+ # Map old name to new name
+ if "verify_certs" in extra:
+ self.verify_ssl = extra["verify_certs"]
+ deprecated.append("verify_certs -> verify_ssl")
+
+ if deprecated:
+ logging.warning(
+ f"RabbitMQ config uses deprecated field names: {', '.join(deprecated)}. "
+ "Please update your configuration."
+ )
+ return self
 
  # For internal use
  connection_status: Optional[ClusterConnectionStatus] = None
@@ -111,7 +130,7 @@ def make_request(
  params=params,
  json=data,
  timeout=config.request_timeout_seconds,
- verify=config.verify_certs,
+ verify=config.verify_ssl,
  )
  response.raise_for_status()
  return response.json()
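
The `verify_certs` → `verify_ssl` rename above is backward compatible: the model allows extra fields, so the post-validator copies a legacy `verify_certs` value onto `verify_ssl` and logs a deprecation warning. A minimal usage sketch (illustrative, not package code; the constructor values are made up):

```python
# Illustrative only: exercises the deprecated-field mapping shown in the diff above.
from holmes.plugins.toolsets.rabbitmq.api import RabbitMQClusterConfig

# Old-style config: `verify_certs` is accepted via ConfigDict(extra="allow") ...
legacy = RabbitMQClusterConfig(
    management_url="http://rabbitmq-service:15672",
    verify_certs=False,  # deprecated spelling, triggers a logged warning
)
assert legacy.verify_ssl is False  # ... and is mapped onto the new field

# New-style config sets verify_ssl directly.
current = RabbitMQClusterConfig(
    management_url="http://rabbitmq-service:15672",
    verify_ssl=False,
)
```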
holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py CHANGED
@@ -1,21 +1,21 @@
- import os
  import logging
+ import os
  from typing import Any, List, Optional, Tuple
+ from urllib.parse import urljoin
 
  from pydantic import BaseModel
+ from requests import RequestException # type: ignore
+
  from holmes.core.tools import (
  CallablePrerequisite,
  StructuredToolResult,
+ StructuredToolResultStatus,
  Tool,
  ToolInvokeContext,
  ToolParameter,
- StructuredToolResultStatus,
  Toolset,
  ToolsetTag,
  )
- from requests import RequestException # type: ignore
- from urllib.parse import urljoin
-
  from holmes.plugins.toolsets.rabbitmq.api import (
  ClusterConnectionStatus,
  RabbitMQClusterConfig,
holmes/plugins/toolsets/robusta/robusta.py CHANGED
@@ -1,19 +1,19 @@
- import os
-
  import logging
+ import os
+ from typing import Any, Dict, List, Optional
 
- from typing import Optional, Dict, Any, List
  from holmes.common.env_vars import load_bool
- from holmes.core.supabase_dal import SupabaseDal, FindingType
+ from holmes.core.supabase_dal import FindingType, SupabaseDal
  from holmes.core.tools import (
  StaticPrerequisite,
+ StructuredToolResult,
+ StructuredToolResultStatus,
  Tool,
  ToolInvokeContext,
  ToolParameter,
  Toolset,
  ToolsetTag,
  )
- from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
 
  PULL_EXTERNAL_FINDINGS = load_bool("PULL_EXTERNAL_FINDINGS", False)
 
holmes/plugins/toolsets/runbook/runbook_fetcher.py CHANGED
@@ -1,22 +1,23 @@
  import logging
  import os
  import textwrap
- from typing import Any, Dict, List, Optional
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Union, cast
+
  from holmes.core.supabase_dal import SupabaseDal
  from holmes.core.tools import (
  StructuredToolResult,
+ StructuredToolResultStatus,
  Tool,
  ToolInvokeContext,
  ToolParameter,
- StructuredToolResultStatus,
  Toolset,
  ToolsetTag,
  )
-
  from holmes.plugins.runbooks import (
+ DEFAULT_RUNBOOK_SEARCH_PATH,
  get_runbook_by_path,
  load_runbook_catalog,
- DEFAULT_RUNBOOK_SEARCH_PATH,
  )
  from holmes.plugins.toolsets.utils import toolset_name_for_one_liner
 
@@ -32,8 +33,11 @@ class RunbookFetcher(Tool):
  toolset: "RunbookToolset",
  additional_search_paths: Optional[List[str]] = None,
  dal: Optional[SupabaseDal] = None,
+ custom_catalog_paths: Optional[List[Union[str, Path]]] = None,
  ):
- catalog = load_runbook_catalog(dal=dal)
+ catalog = load_runbook_catalog(
+ dal=dal, custom_catalog_paths=custom_catalog_paths
+ )
  available_runbooks = []
  if catalog:
  available_runbooks = catalog.list_available_runbooks()
@@ -232,12 +236,26 @@ class RunbookToolset(Toolset):
  if additional_search_paths:
  config["additional_search_paths"] = additional_search_paths
 
+ # Compute custom catalog paths from additional search paths
+ custom_catalog_paths = None
+ if additional_search_paths:
+ custom_catalog_paths = [
+ os.path.join(search_path, "catalog.json")
+ for search_path in additional_search_paths
+ if os.path.isfile(os.path.join(search_path, "catalog.json"))
+ ]
+
  super().__init__(
  name="runbook",
  description="Fetch runbooks",
  icon_url="https://platform.robusta.dev/demos/runbook.svg",
  tools=[
- RunbookFetcher(self, additional_search_paths, dal),
+ RunbookFetcher(
+ self,
+ additional_search_paths,
+ dal,
+ cast(Optional[List[Union[str, Path]]], custom_catalog_paths),
+ ),
  ],
  docs_url="https://holmesgpt.dev/data-sources/",
  tags=[
@@ -245,6 +263,7 @@ class RunbookToolset(Toolset):
  ],
  is_default=True,
  config=config,
+ enabled=True,
  )
 
  def get_example_config(self) -> Dict[str, Any]:
holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py CHANGED
@@ -1,6 +1,6 @@
  import os
  from abc import ABC
- from typing import Dict, Optional, cast, Type, ClassVar, Tuple, Any
+ from typing import Any, ClassVar, Dict, Optional, Tuple, Type, cast
  from urllib.parse import urljoin
 
  import requests # type: ignore
holmes/plugins/toolsets/utils.py CHANGED
@@ -1,7 +1,7 @@
  import datetime
  import math
- import time
  import re
+ import time
  from typing import Dict, Optional, Tuple, Union
 
  from dateutil import parser
holmes/utils/config_utils.py CHANGED
@@ -2,7 +2,7 @@
  Configuration utility functions for HolmesGPT.
  """
 
- from typing import List, Optional, TYPE_CHECKING
+ from typing import TYPE_CHECKING, List, Optional
 
  if TYPE_CHECKING:
  from holmes.core.transformers import Transformer
holmes/utils/connection_utils.py ADDED
@@ -0,0 +1,31 @@
+ import logging
+ import socket
+
+ from holmes.common.env_vars import KEEPALIVE_CNT, KEEPALIVE_IDLE, KEEPALIVE_INTVL
+
+
+ def patch_socket_create_connection(
+ idle: int = KEEPALIVE_IDLE,
+ intvl: int = KEEPALIVE_INTVL,
+ cnt: int = KEEPALIVE_CNT,
+ ) -> None:
+ orig = socket.create_connection
+
+ def new_create_connection(address, timeout=None, source_address=None, **kwargs):
+ logging.debug(
+ f"Creating patched connection to {address} with timeout {timeout} and source address {source_address}"
+ )
+ s = orig(address, timeout=timeout, source_address=source_address, **kwargs)
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
+
+ # Linux-only tuning (these attrs won't exist on macOS/Windows)
+ if hasattr(socket, "TCP_KEEPIDLE"):
+ s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, int(idle))
+ if hasattr(socket, "TCP_KEEPINTVL"):
+ s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, int(intvl))
+ if hasattr(socket, "TCP_KEEPCNT"):
+ s.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, int(cnt))
+ return s
+
+ logging.info("Patching socket.create_connection to force keepalive")
+ socket.create_connection = new_create_connection
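
Because the helper monkey-patches the module-level `socket.create_connection`, it is meant to be called once, early in process startup; every library built on the standard socket factory then gets TCP keepalive probes. A hedged sketch of that wiring, with illustrative values rather than the package's defaults:

```python
# Sketch only: apply the patch once at startup, before HTTP clients open sockets.
from holmes.utils.connection_utils import patch_socket_create_connection

patch_socket_create_connection(idle=60, intvl=10, cnt=6)  # values are illustrative

import requests  # any library built on socket.create_connection is affected

resp = requests.get("https://example.com", timeout=10)
print(resp.status_code)  # this connection now carries TCP keepalive options
```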
holmes/utils/console/result.py CHANGED
@@ -17,6 +17,7 @@ def handle_result(
  issue: Issue,
  show_tool_output: bool,
  add_separator: bool,
+ log_costs: bool = False,
  ):
  if destination == DestinationType.CLI:
  if show_tool_output and result.tool_calls:
@@ -30,6 +31,15 @@
 
  console.print(f"[bold {AI_COLOR}]AI:[/bold {AI_COLOR}]", end=" ")
  console.print(Markdown(result.result)) # type: ignore
+
+ if log_costs and result.total_cost > 0:
+ console.print(
+ f"\n[bold yellow]💰 Total Cost:[/bold yellow] ${result.total_cost:.6f}"
+ )
+ console.print(
+ f"[dim]Tokens: {result.prompt_tokens:,} prompt + {result.completion_tokens:,} completion = {result.total_tokens:,} total[/dim]"
+ )
+
  if add_separator:
  console.print(Rule())
 
holmes/utils/file_utils.py CHANGED
@@ -1,6 +1,7 @@
  import json
- import os
  import logging
+ import os
+
  import yaml # type: ignore
 
 
holmes/utils/global_instructions.py CHANGED
@@ -1,6 +1,7 @@
- from typing import Optional, List, TYPE_CHECKING
+ from typing import TYPE_CHECKING, Dict, List, Optional
+
  from pydantic import BaseModel
- from holmes.plugins.prompts import load_and_render_prompt
+
  from holmes.plugins.runbooks import RunbookCatalog
 
  if TYPE_CHECKING:
@@ -34,24 +35,14 @@ def _format_resource_instructions(
  return items
 
 
- def add_runbooks_to_user_prompt(
- user_prompt: str,
+ def generate_runbooks_args(
  runbook_catalog: Optional[RunbookCatalog],
  global_instructions: Optional[Instructions] = None,
  issue_instructions: Optional[List[str]] = None,
  resource_instructions: Optional["ResourceInstructions"] = None, # type: ignore
- ) -> str:
- if (
- not runbook_catalog
- and not issue_instructions
- and not resource_instructions
- and not global_instructions
- ):
- return user_prompt
-
+ ) -> Dict[str, str]:
  catalog_str = runbook_catalog.to_prompt_string() if runbook_catalog else ""
 
- # Combine and format all instructions
  combined_instructions = []
  if issue_instructions:
  combined_instructions.extend(issue_instructions)
@@ -71,15 +62,8 @@
  else ""
  )
 
- rendered = load_and_render_prompt(
- "builtin://_runbook_instructions.jinja2",
- context={
- "runbook_catalog": catalog_str,
- "custom_instructions": issue_block,
- "global_instructions": global_block,
- },
- )
-
- if user_prompt and not user_prompt.endswith("\n"):
- user_prompt += "\n"
- return f"{user_prompt}\n{rendered}"
+ return {
+ "runbook_catalog": catalog_str,
+ "custom_instructions": issue_block,
+ "global_instructions": global_block,
+ }
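
The refactor turns `add_runbooks_to_user_prompt` (which rendered `_runbook_instructions.jinja2` and appended it to the user prompt) into `generate_runbooks_args`, which only returns the template arguments. A hedged sketch of how a caller might consume the new return value; the render call mirrors the removed code and is illustrative, not the package's actual call site:

```python
# Sketch under assumptions: the real callers live in the prompt-building code.
from holmes.plugins.prompts import load_and_render_prompt
from holmes.utils.global_instructions import generate_runbooks_args

template_args = generate_runbooks_args(
    runbook_catalog=None,
    global_instructions=None,
    issue_instructions=["Check recent deploys before restarting pods."],  # hypothetical
)
# template_args has the keys "runbook_catalog", "custom_instructions", "global_instructions"

rendered = load_and_render_prompt(
    "builtin://_runbook_instructions.jinja2",
    context=template_args,
)
```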
holmes/utils/holmes_status.py CHANGED
@@ -1,9 +1,10 @@
  import json
- from holmes.core.supabase_dal import SupabaseDal
- from holmes.config import Config
- from holmes import get_version # type: ignore
  import logging
 
+ from holmes import get_version # type: ignore
+ from holmes.config import Config
+ from holmes.core.supabase_dal import SupabaseDal
+
 
  def update_holmes_status_in_db(dal: SupabaseDal, config: Config):
  logging.info("Updating status of holmes")
holmes/utils/log.py ADDED
@@ -0,0 +1,15 @@
+ """Logging utilities for Holmes."""
+
+ import logging
+ from typing import Any
+
+
+ class EndpointFilter(logging.Filter):
+ """Filter out log records for specific endpoint paths."""
+
+ def __init__(self, path: str, *args: Any, **kwargs: Any):
+ super().__init__(*args, **kwargs)
+ self._path = path
+
+ def filter(self, record: logging.LogRecord) -> bool:
+ return record.getMessage().find(self._path) == -1
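
`EndpointFilter` drops any log record whose message contains the configured path. A typical use (an assumption about the wiring, not shown in this diff) is to attach it to uvicorn's access logger so health-check requests stop flooding the logs:

```python
# Illustrative wiring only; the logger name and path below are assumptions.
import logging

from holmes.utils.log import EndpointFilter

logging.getLogger("uvicorn.access").addFilter(EndpointFilter(path="/healthz"))
```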
holmes/utils/markdown_utils.py CHANGED
@@ -1,9 +1,8 @@
  # based on https://github.com/kostyachum/python-markdown-plain-text/blob/main/markdown_plain_text/extention.py
  # MIT licensed
- from markdown import Extension, Markdown # type: ignore
+ from xml.etree.ElementTree import Comment, ElementTree, ProcessingInstruction
 
- from xml.etree.ElementTree import ProcessingInstruction
- from xml.etree.ElementTree import Comment, ElementTree
+ from markdown import Extension, Markdown # type: ignore
 
 
  def _serialize_plain_text(write, elem):
holmes/utils/memory_limit.py ADDED
@@ -0,0 +1,58 @@
+ """
+ Memory limit utilities for tool subprocess execution.
+ """
+
+ import logging
+
+ from holmes.common.env_vars import TOOL_MEMORY_LIMIT_MB
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_ulimit_prefix() -> str:
+ """
+ Get the ulimit command prefix for memory protection.
+
+ Returns a shell command prefix that sets virtual memory limit.
+ The '|| true' ensures we continue even if ulimit is not supported.
+ """
+ memory_limit_kb = TOOL_MEMORY_LIMIT_MB * 1024
+ return f"ulimit -v {memory_limit_kb} || true; "
+
+
+ def check_oom_and_append_hint(output: str, return_code: int) -> str:
+ """
+ Check if a command was OOM killed and append a helpful hint.
+
+ Args:
+ output: The command output
+ return_code: The command's return code
+
+ Returns:
+ Output with OOM hint appended if OOM was detected
+ """
+ # Common OOM indicators:
+ # - Return code 137 (128 + 9 = SIGKILL, commonly OOM)
+ # - Return code -9 (SIGKILL on some systems)
+ # - "Killed" in output (Linux OOM killer message)
+ # - "MemoryError" (Python)
+ # - "Cannot allocate memory" (various tools)
+ is_oom = (
+ return_code in (137, -9)
+ or "Killed" in output
+ or "MemoryError" in output
+ or "Cannot allocate memory" in output
+ or "bad_alloc" in output
+ )
+
+ if is_oom:
+ hint = (
+ f"\n\n[OOM] Command was killed due to memory limits (current limit: {TOOL_MEMORY_LIMIT_MB} MB). "
+ f"Try querying the data differently to reduce memory usage - add filters to narrow the results, "
+ f"use smaller time ranges, or try alternative tools that may be more memory-efficient. "
+ f"If you cannot succeed with a modified query, you may recommend the user increase the limit "
+ f"by setting the TOOL_MEMORY_LIMIT_MB environment variable (Tool memory limit, MB)."
+ )
+ return output + hint
+
+ return output
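
The two helpers are designed to bracket a shell command: `get_ulimit_prefix()` caps the subprocess's virtual memory before it runs, and `check_oom_and_append_hint()` annotates the output if the command appears to have been OOM-killed. A sketch of that composition (assumed wiring; the tool-runner integration lives elsewhere in the package):

```python
# Sketch under assumptions: the command string and subprocess wiring are illustrative.
import subprocess

from holmes.utils.memory_limit import check_oom_and_append_hint, get_ulimit_prefix

cmd = "kubectl logs my-pod --since=1h"  # hypothetical tool command
proc = subprocess.run(
    get_ulimit_prefix() + cmd,  # e.g. "ulimit -v <kb> || true; kubectl logs ..."
    shell=True,
    capture_output=True,
    text=True,
)
output = check_oom_and_append_hint(proc.stdout + proc.stderr, proc.returncode)
print(output)  # includes an [OOM] hint if the command hit the memory limit
```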
holmes/utils/sentry_helper.py CHANGED
@@ -1,4 +1,7 @@
+ from typing import Optional
+
  import sentry_sdk
+
  from holmes.core.models import ToolCallResult, TruncationMetadata
 
 
@@ -39,3 +42,23 @@ def capture_structured_output_incorrect_tool_call():
  "Structured output incorrect tool call",
  level="warning",
  )
+
+
+ def capture_sections_none(content: Optional[str]):
+ # Limit display length to avoid sending huge payloads to Sentry
+ _MAX_DISPLAY_LENGTH = 1500
+ display_content = ""
+ if content:
+ if len(content) > _MAX_DISPLAY_LENGTH * 2:
+ # Show first and last portions of content
+ display_content = f"{content[:_MAX_DISPLAY_LENGTH]}...\n\n...{content[-_MAX_DISPLAY_LENGTH:]}"
+ else:
+ display_content = content
+
+ with sentry_sdk.push_scope() as scope:
+ scope.set_extra("content", display_content)
+ scope.set_extra("content_length", len(content) if content else 0)
+ sentry_sdk.capture_message(
+ "Holmes answer couldn't be parsed into sections",
+ level="warning",
+ )
holmes/utils/stream.py CHANGED
@@ -1,15 +1,17 @@
  import json
+ import logging
  from enum import Enum
- from typing import Generator, Optional, List, Union
- import litellm
- from pydantic import BaseModel, Field
- from holmes.core.investigation_structured_output import process_response_into_sections
  from functools import partial
- import logging
+ from typing import Generator, List, Optional, Union
+
+ import litellm
  from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
  from litellm.types.utils import ModelResponse, TextCompletionResponse
+ from pydantic import BaseModel, Field
 
+ from holmes.core.investigation_structured_output import process_response_into_sections
  from holmes.core.llm import TokenCountMetadata, get_llm_usage
+ from holmes.utils import sentry_helper
 
 
  class StreamEvents(str, Enum):
@@ -63,6 +65,11 @@ def stream_investigate_formatter(
  message.data.get("content")
  )
 
+ if sections is None:
+ sentry_helper.capture_sections_none(
+ content=message.data.get("content"),
+ )
+
  yield create_sse_message(
  StreamEvents.ANSWER_END.value,
  {
holmes/utils/tags.py CHANGED
@@ -1,9 +1,10 @@
+ import json
  import logging
- from typing import Optional
- from typing_extensions import Dict, List
  import re
- import json
  from copy import deepcopy
+ from typing import Optional
+
+ from typing_extensions import Dict, List
 
 
  def stringify_tag(tag: Dict[str, str]) -> Optional[str]:
holmes/version.py CHANGED
@@ -8,10 +8,12 @@ import os
  import subprocess
  import sys
  import threading
- from typing import Optional, NamedTuple
  from functools import cache
+ from typing import NamedTuple, Optional
+
  import requests # type: ignore
  from pydantic import BaseModel, ConfigDict
+
  from holmes.common.env_vars import ROBUSTA_API_ENDPOINT
 
  # For relative imports to work in Python 3.6 - see https://stackoverflow.com/a/49375740
{holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: holmesgpt
- Version: 0.16.2a0
+ Version: 0.18.4
  Summary:
  Author: Natan Yellin
  Author-email: natan@robusta.dev
@@ -28,6 +28,7 @@ Requires-Dist: google-cloud-aiplatform (>=1.38)
  Requires-Dist: httpx[socks] (<0.28)
  Requires-Dist: humanize (>=4.9.0,<5.0.0)
  Requires-Dist: jinja2 (>=3.1.2,<4.0.0)
+ Requires-Dist: jq (>=1.10.0,<2.0.0)
  Requires-Dist: kubernetes (>=32.0.1,<33.0.0)
  Requires-Dist: litellm (==1.77.1)
  Requires-Dist: markdown (>=3.6,<4.0)
@@ -58,6 +59,9 @@ Description-Content-Type: text/markdown
 
  HolmesGPT is an AI agent for investigating problems in your cloud, finding the root cause, and suggesting remediations. It has dozens of built-in integrations for cloud providers, observability tools, and on-call systems.
 
+ [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/11586/badge)](https://www.bestpractices.dev/projects/11586)
+ [![OpenSSF Scorecard](https://api.scorecard.dev/projects/github.com/HolmesGPT/holmesgpt/badge)](https://scorecard.dev/viewer/?uri=github.com/HolmesGPT/holmesgpt)
+
  >🎉 **HolmesGPT is now a CNCF Sandbox Project!**
  HolmesGPT was originally created by [Robusta.Dev](https://home.robusta.dev/) and is a CNCF sandbox project.
 
@@ -70,7 +74,7 @@ Find more about HolmesGPT's maintainers and adopters [here](./ADOPTERS.md).
  <a href="#installation"><strong>Installation</strong></a> |
  <a href="#supported-llm-providers"><strong>LLM Providers</strong></a> |
  <a href="https://www.youtube.com/watch?v=TfQfx65LsDQ"><strong>YouTube Demo</strong></a> |
- <a href="https://deepwiki.com/robusta-dev/holmesgpt"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki"></a>
+ <a href="https://deepwiki.com/HolmesGPT/holmesgpt"><img src="https://deepwiki.com/badge.svg" alt="Ask DeepWiki"></a>
  </p>
  </div>
 
@@ -248,27 +252,25 @@ Because HolmesGPT relies on LLMs, it relies on [a suite of pytest based evaluati
 
 
  ## License
- Distributed under the Apache 2.0 License. See [LICENSE](https://github.com/robusta-dev/holmesgpt/blob/master/LICENSE) for more information.
+ Distributed under the Apache 2.0 License. See [LICENSE](https://github.com/HolmesGPT/holmesgpt/blob/master/LICENSE) for more information.
  <!-- Change License -->
 
  ## Community
 
  Join our community to discuss the HolmesGPT roadmap and share feedback:
 
- 📹 **First Community Meetup Recording:** [Watch on YouTube](https://youtu.be/slQRc6nlFQU)
- - **Topics:** Roadmap discussion, community feedback, and Q&A
- - **Resources:** [📝 Meeting Notes](https://docs.google.com/document/d/1sIHCcTivyzrF5XNvos7ZT_UcxEOqgwfawsTbb9wMJe4/edit?tab=t.0) | [📋 Community Page](https://holmesgpt.dev/community/)
+ - [Community Meetups](https://docs.google.com/document/d/1q3L2iUd8tNu-NmZ6QIVOJcCLHrile9CC5QguOGTn_tg/edit?tab=t.0#heading=h.ihdnrt5bstrv)
 
  ## Support
 
- If you have any questions, feel free to message us on [robustacommunity.slack.com](https://bit.ly/robusta-slack)
+ If you have any questions, feel free to message us on [HolmesGPT Slack Channel](https://cloud-native.slack.com/archives/C0A1SPQM5PZ)
 
  ## How to Contribute
 
  Please read our [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines and instructions.
 
- For help, contact us on [Slack](https://bit.ly/robusta-slack) or ask [DeepWiki AI](https://deepwiki.com/robusta-dev/holmesgpt) your questions.
+ For help, contact us on [Slack](https://cloud-native.slack.com/archives/C0A1SPQM5PZ) or ask [DeepWiki AI](https://deepwiki.com/HolmesGPT/holmesgpt) your questions.
 
- Please make sure to follow the CNCF code of conduct - [details here](https://github.com/robusta-dev/holmesgpt/blob/master/CODE_OF_CONDUCT.md).
- [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/robusta-dev/holmesgpt)
+ Please make sure to follow the CNCF code of conduct - [details here](https://github.com/HolmesGPT/holmesgpt/blob/master/CODE_OF_CONDUCT.md).
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/HolmesGPT/holmesgpt)
 