holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
@@ -1,21 +1,4 @@
1
- {% if enabled %}
2
- {% if is_default %}
3
- This integration is enabled by default.
4
-
5
- If you would like to disable this toolset (not recommended), you need to update the `generated_values.yaml` configuration.
6
- {% else %}
7
- To disable this integration, you need to update the `generated_values.yaml` configuration.
8
- {% endif %}
9
-
10
- ```yaml
11
- holmes:
12
- toolsets:
13
- {{toolset_name}}:
14
- enabled: false
15
- ```
16
-
17
- {% else %}
18
- To enable this integration, update the Helm values for Robusta (`generated_values.yaml`):
1
+ To enable/disable this integration, update the Helm values for Robusta (`generated_values.yaml`):
19
2
 
20
3
  ```yaml
21
4
  holmes:
@@ -34,11 +17,7 @@ holmes:
34
17
  {{ example_config | indent(8) }}
35
18
  {% endif %}
36
19
  ```
37
-
38
- {% endif %}
39
-
40
20
  And deploy the updated configuration using Helm:
41
-
42
21
  ```bash
43
22
  helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>
44
23
  ```
holmes/utils/env.py CHANGED
@@ -6,6 +6,13 @@ from typing import Any, Optional
6
6
  from pydantic import SecretStr
7
7
 
8
8
 
9
def environ_get_safe_int(env_var: str, default: str = "0") -> int:
    """Read a non-negative integer setting from the environment.

    Args:
        env_var: Name of the environment variable to read.
        default: Value used when the variable is unset or does not parse as
            an integer. Kept as a string for backward compatibility.

    Returns:
        The parsed value clamped to a minimum of 0. If neither the variable
        nor ``default`` parses as an integer, 0 is returned instead of
        raising.
    """
    # Try the environment first, then the default; both go through the same
    # clamp so a negative or malformed default can never leak out.
    for candidate in (os.environ.get(env_var, default), default):
        try:
            return max(int(candidate), 0)
        except (TypeError, ValueError):
            continue
    return 0
14
+
15
+
9
16
  def get_env_replacement(value: str) -> Optional[str]:
10
17
  env_patterns = re.findall(r"{{\s*env\.([^}]*)\s*}}", value)
11
18
 
@@ -1,6 +1,7 @@
1
1
  import json
2
- import os
3
2
  import logging
3
+ import os
4
+
4
5
  import yaml # type: ignore
5
6
 
6
7
 
@@ -1,20 +1,69 @@
1
- from typing import List, Optional
1
+ from typing import TYPE_CHECKING, Dict, List, Optional
2
2
 
3
3
  from pydantic import BaseModel
4
4
 
5
+ from holmes.plugins.runbooks import RunbookCatalog
6
+
7
+ if TYPE_CHECKING:
8
+ from holmes.core.resource_instruction import ResourceInstructions
9
+
5
10
 
6
11
  class Instructions(BaseModel):
7
12
  instructions: List[str] = []
8
13
 
9
14
 
10
- def add_global_instructions_to_user_prompt(
11
- user_prompt: str, global_instructions: Optional[Instructions]
15
+ def _format_instructions_block(
16
+ items: List[str], header: str = "My instructions to check:"
12
17
  ) -> str:
13
- if (
14
- global_instructions
15
- and global_instructions.instructions
16
- and len(global_instructions.instructions[0]) > 0
17
- ):
18
- instructions = "\n\n".join(global_instructions.instructions)
19
- user_prompt += f"\n\nGlobal Instructions (use if relevant): {instructions}\n"
20
- return user_prompt
18
+ lines = [f"* {s}" for s in items if isinstance(s, str) and s.strip()]
19
+ if not lines:
20
+ return ""
21
+ bullets = "\n".join(lines) + "\n"
22
+ return f"{header}\n{bullets}"
23
+
24
+
25
+ def _format_resource_instructions(
26
+ resource_instructions: Optional["ResourceInstructions"],
27
+ ) -> List[str]: # type: ignore
28
+ items = []
29
+ if resource_instructions is not None:
30
+ if getattr(resource_instructions, "instructions", None):
31
+ items.extend(resource_instructions.instructions)
32
+ if getattr(resource_instructions, "documents", None):
33
+ for document in resource_instructions.documents:
34
+ items.append(f"fetch information from this URL: {document.url}")
35
+ return items
36
+
37
+
38
def generate_runbooks_args(
    runbook_catalog: Optional[RunbookCatalog],
    global_instructions: Optional[Instructions] = None,
    issue_instructions: Optional[List[str]] = None,
    resource_instructions: Optional["ResourceInstructions"] = None,  # type: ignore
) -> Dict[str, str]:
    """Build the prompt arguments describing runbooks and instructions.

    Returns a dict with three string values:
        - ``runbook_catalog``: the catalog rendered via ``to_prompt_string()``,
          or "" when no catalog is given.
        - ``custom_instructions``: issue instructions followed by resource
          instructions, formatted as a bulleted block (or "").
        - ``global_instructions``: the string entries of the global
          instructions, formatted as a bulleted block with an empty header
          (or "").
    """
    catalog_str = ""
    if runbook_catalog:
        catalog_str = runbook_catalog.to_prompt_string()

    # Issue-level instructions first, then anything attached to the resource.
    per_issue = []
    if issue_instructions:
        per_issue.extend(issue_instructions)
    per_issue.extend(_format_resource_instructions(resource_instructions))
    issue_block = _format_instructions_block(per_issue) if per_issue else ""

    global_items = getattr(global_instructions, "instructions", None) or []
    global_block = ""
    if global_items:
        global_block = _format_instructions_block(
            [entry for entry in global_items if isinstance(entry, str)], header=""
        )

    return {
        "runbook_catalog": catalog_str,
        "custom_instructions": issue_block,
        "global_instructions": global_block,
    }
@@ -1,8 +1,10 @@
1
- from holmes.core.supabase_dal import SupabaseDal
2
- from holmes.config import Config
3
- from holmes import get_version # type: ignore
1
+ import json
4
2
  import logging
5
3
 
4
+ from holmes import get_version # type: ignore
5
+ from holmes.config import Config
6
+ from holmes.core.supabase_dal import SupabaseDal
7
+
6
8
 
7
9
  def update_holmes_status_in_db(dal: SupabaseDal, config: Config):
8
10
  logging.info("Updating status of holmes")
@@ -16,7 +18,7 @@ def update_holmes_status_in_db(dal: SupabaseDal, config: Config):
16
18
  dal.upsert_holmes_status(
17
19
  {
18
20
  "cluster_id": config.cluster_name,
19
- "model": config.get_models_list(),
21
+ "model": json.dumps(config.get_models_list()),
20
22
  "version": get_version(),
21
23
  }
22
24
  )
@@ -66,8 +66,6 @@ def render_default_installation_instructions_for_toolset(toolset: Toolset) -> st
66
66
  context: dict[str, Any] = {
67
67
  "env_vars": env_vars if env_vars else [],
68
68
  "toolset_name": toolset.name,
69
- "is_default": toolset.is_default,
70
- "enabled": toolset.enabled,
71
69
  }
72
70
 
73
71
  example_config = toolset.get_example_config()
@@ -0,0 +1,188 @@
1
+ """Utilities for KRR (Kubernetes Resource Recommendations) data processing."""
2
+
3
+ import logging
4
+ from typing import Any, Dict
5
+
6
+
7
def parse_cpu(cpu_value: Any) -> float:
    """Parse a Kubernetes CPU quantity into cores.

    Handles:
    - Numeric values (0.1, 1, etc.), taken as cores
    - Strings with a sub-unit suffix: 'm' (milli, 100m = 0.1 cores),
      'u' (micro) and 'n' (nano)
    - Plain numeric strings ("0.5")

    Args:
        cpu_value: CPU value to parse (int, float, str, or None)

    Returns:
        CPU value in cores as float, or 0.0 if the value is missing,
        unparsable, or not a finite number.
    """
    import math

    if cpu_value is None or cpu_value == "" or cpu_value == "?":
        return 0.0
    try:
        if isinstance(cpu_value, (int, float)):
            cores = float(cpu_value)
        else:
            cpu_str = str(cpu_value).strip()
            for suffix, divisor in (("n", 1e9), ("u", 1e6), ("m", 1e3)):
                if cpu_str.endswith(suffix):
                    cores = float(cpu_str[: -len(suffix)]) / divisor
                    break
            else:
                cores = float(cpu_str)
    except (ValueError, TypeError, OverflowError):
        return 0.0
    # NaN/inf would poison any sort or sum built on top of this value.
    return cores if math.isfinite(cores) else 0.0
33
+
34
+
35
# Kubernetes quantity suffixes mapped to their multiplier in bytes. The
# two-letter binary suffixes are listed first so that e.g. "Mi" is matched
# before the decimal "M". Uppercase "K" is not a standard suffix but is kept
# because earlier versions of this parser accepted it.
_MEMORY_UNIT_MULTIPLIERS = (
    ("Ki", 1024),
    ("Mi", 1024**2),
    ("Gi", 1024**3),
    ("Ti", 1024**4),
    ("Pi", 1024**5),
    ("Ei", 1024**6),
    ("k", 1000),
    ("K", 1000),
    ("M", 1000**2),
    ("G", 1000**3),
    ("T", 1000**4),
    ("P", 1000**5),
    ("E", 1000**6),
)


def parse_memory(memory_value: Any) -> float:
    """Parse a Kubernetes memory quantity into bytes.

    Handles:
    - Numeric values, taken as bytes
    - Strings with binary units (Ki, Mi, Gi, Ti, Pi, Ei)
    - Strings with decimal units (k, M, G, T, P, E) and the milli suffix 'm'
    - Plain numeric strings ("1048576")

    Args:
        memory_value: Memory value to parse (int, float, str, or None)

    Returns:
        Memory value in bytes as float, or 0.0 if the value is missing,
        unparsable, or not a finite number.
    """
    import math

    if memory_value is None or memory_value == "" or memory_value == "?":
        return 0.0
    try:
        if isinstance(memory_value, (int, float)):
            size = float(memory_value)
        else:
            memory_str = str(memory_value).strip()
            if memory_str.endswith("m"):
                # Milli-bytes; divide rather than multiply by 0.001 to keep
                # whole-byte values exact.
                size = float(memory_str[:-1]) / 1000.0
            else:
                for unit, multiplier in _MEMORY_UNIT_MULTIPLIERS:
                    if memory_str.endswith(unit):
                        size = float(memory_str[: -len(unit)]) * multiplier
                        break
                else:
                    size = float(memory_str)
    except (ValueError, TypeError, OverflowError):
        return 0.0
    return size if math.isfinite(size) else 0.0
72
+
73
+
74
def get_value(data: Dict, field: str, subfield: str) -> Any:
    """Return ``data[field][subfield]``, or 0.0 when it is unavailable.

    Used to read the allocated/recommended values of a KRR entry. The result
    is 0.0 when ``data`` is empty, when ``field`` is missing, null or not a
    mapping, or when the stored value is ``None`` or the "?" placeholder.
    Any other stored value is returned unchanged (it may still be a string
    such as "100m").
    """
    if not data:
        return 0.0
    section = data.get(field)
    # A key that is present but null (or a bare "?" placeholder) must not
    # raise; treat it the same as a missing section.
    if not hasattr(section, "get"):
        return 0.0
    val = section.get(subfield)
    if val is None or val == "?":
        return 0.0
    return val
82
+
83
+
84
def calculate_krr_savings(result: Dict, sort_by: str) -> float:
    """Calculate potential savings from KRR recommendation data.

    The KRR data structure has a 'content' field that contains a list of
    resource recommendations. Each item represents either CPU or memory, with:
    - resource: "cpu" or "memory"
    - allocated: {request: value, limit: value} - current allocation
    - recommended: {request: value, limit: value} - recommended allocation

    Args:
        result: KRR scan result dictionary with 'content' field
        sort_by: Sorting criteria, one of:
            - "cpu_total": Total CPU savings (requests + limits)
            - "memory_total": Total memory savings (requests + limits)
            - "cpu_requests": CPU requests savings only
            - "memory_requests": Memory requests savings only
            - "cpu_limits": CPU limits savings only
            - "memory_limits": Memory limits savings only

    Returns:
        Calculated savings as a float (>= 0.0). Returns 0.0 for invalid data,
        for an unknown ``sort_by``, or when the recommended values add up to
        more than the allocated ones. For the "*_total" criteria the request
        and limit differences are netted before the lower bound is applied.
    """
    # sort_by -> (resource name, quantity parser, sub-fields to add up)
    criteria = {
        "cpu_total": ("cpu", parse_cpu, ("request", "limit")),
        "memory_total": ("memory", parse_memory, ("request", "limit")),
        "cpu_requests": ("cpu", parse_cpu, ("request",)),
        "memory_requests": ("memory", parse_memory, ("request",)),
        "cpu_limits": ("cpu", parse_cpu, ("limit",)),
        "memory_limits": ("memory", parse_memory, ("limit",)),
    }
    try:
        content_list = result.get("content", [])
        if not content_list or not isinstance(content_list, list):
            return 0.0
        if sort_by not in criteria:
            return 0.0
        resource, parse, subfields = criteria[sort_by]

        # If a resource appears more than once, the last entry wins.
        resource_data = None
        for item in content_list:
            if item.get("resource") == resource:
                resource_data = item
        if not resource_data:
            return 0.0

        savings = sum(
            parse(get_value(resource_data, "allocated", subfield))
            - parse(get_value(resource_data, "recommended", subfield))
            for subfield in subfields
        )
        # A recommendation above the current allocation is not a saving.
        return max(float(savings), 0.0)
    except Exception as e:
        # Best-effort: malformed scan data must not break the caller.
        logging.debug("Error calculating savings for result: %s", e)
        return 0.0
holmes/utils/log.py ADDED
@@ -0,0 +1,15 @@
1
+ """Logging utilities for Holmes."""
2
+
3
+ import logging
4
+ from typing import Any
5
+
6
+
7
class EndpointFilter(logging.Filter):
    """Logging filter that drops records mentioning a given endpoint path."""

    def __init__(self, path: str, *args: Any, **kwargs: Any):
        """Remember ``path``; remaining arguments go to ``logging.Filter``."""
        super().__init__(*args, **kwargs)
        self._path = path

    def filter(self, record: logging.LogRecord) -> bool:
        """Return False (drop) when the formatted message contains the path."""
        message = record.getMessage()
        return self._path not in message
@@ -1,9 +1,8 @@
1
1
  # based on https://github.com/kostyachum/python-markdown-plain-text/blob/main/markdown_plain_text/extention.py
2
2
  # MIT licensed
3
- from markdown import Extension, Markdown # type: ignore
3
+ from xml.etree.ElementTree import Comment, ElementTree, ProcessingInstruction
4
4
 
5
- from xml.etree.ElementTree import ProcessingInstruction
6
- from xml.etree.ElementTree import Comment, ElementTree
5
+ from markdown import Extension, Markdown # type: ignore
7
6
 
8
7
 
9
8
  def _serialize_plain_text(write, elem):
@@ -0,0 +1,58 @@
1
+ """
2
+ Memory limit utilities for tool subprocess execution.
3
+ """
4
+
5
+ import logging
6
+
7
+ from holmes.common.env_vars import TOOL_MEMORY_LIMIT_MB
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def get_ulimit_prefix() -> str:
    """
    Build the shell prefix that caps virtual memory for a tool command.

    The limit is TOOL_MEMORY_LIMIT_MB converted to kilobytes and passed to
    'ulimit -v'. The trailing '|| true' keeps the command going when ulimit
    fails.
    """
    limit_kb = TOOL_MEMORY_LIMIT_MB * 1024
    return "ulimit -v " + str(limit_kb) + " || true; "
21
+
22
+
23
def check_oom_and_append_hint(output: str, return_code: int) -> str:
    """
    Check if a command was OOM killed and append a helpful hint.

    Args:
        output: The command output
        return_code: The command's return code

    Returns:
        Output with OOM hint appended if OOM was detected, otherwise the
        output unchanged
    """
    import re

    # Common OOM indicators:
    # - Return code 137 (128 + 9 = SIGKILL, commonly OOM)
    # - Return code -9 (SIGKILL on some systems)
    # - "Killed" as its own word in output (shell message for a killed job).
    #   A plain substring test would also fire on data that merely mentions
    #   "OOMKilled" (e.g. a pod status), producing a misleading hint.
    # - "MemoryError" (Python)
    # - "Cannot allocate memory" (various tools)
    # - "bad_alloc" (C++)
    is_oom = (
        return_code in (137, -9)
        or re.search(r"\bKilled", output) is not None
        or "MemoryError" in output
        or "Cannot allocate memory" in output
        or "bad_alloc" in output
    )
    if not is_oom:
        return output

    hint = (
        f"\n\n[OOM] Command was killed due to memory limits (current limit: {TOOL_MEMORY_LIMIT_MB} MB). "
        f"Try querying the data differently to reduce memory usage - add filters to narrow the results, "
        f"use smaller time ranges, or try alternative tools that may be more memory-efficient. "
        f"If you cannot succeed with a modified query, you may recommend the user increase the limit "
        f"by setting the TOOL_MEMORY_LIMIT_MB environment variable (Tool memory limit, MB)."
    )
    return output + hint
@@ -0,0 +1,64 @@
1
+ from typing import Optional
2
+
3
+ import sentry_sdk
4
+
5
+ from holmes.core.models import ToolCallResult, TruncationMetadata
6
+
7
+
8
def capture_tool_truncations(truncations: list[TruncationMetadata]):
    """Report each entry of ``truncations`` to Sentry, one message per entry."""
    for entry in truncations:
        _capture_tool_truncation(entry)
11
+
12
+
13
def _capture_tool_truncation(truncation: TruncationMetadata):
    """Send a warning-level Sentry message for one truncated tool output."""
    tool = truncation.tool_name
    tags = {
        "tool_name": tool,
        "tool_original_token_count": truncation.original_token_count,
        # NOTE(review): end_index is reported under a token-count tag;
        # confirm it is measured in tokens.
        "tool_new_token_count": truncation.end_index,
    }
    sentry_sdk.capture_message(
        f"Tool {tool} was truncated",
        level="warning",
        tags=tags,
    )
23
+
24
+
25
def capture_toolcall_contains_too_many_tokens(
    tool_call_result: ToolCallResult, token_count: int, max_allowed_token_count: int
):
    """Send a warning-level Sentry message for an oversized tool call result.

    The tool name, its description, the measured token count and the allowed
    maximum are attached as tags.
    """
    tool = tool_call_result.tool_name
    tags = {
        "tool_name": tool,
        "tool_original_token_count": token_count,
        "tool_max_allowed_token_count": max_allowed_token_count,
        "tool_description": tool_call_result.description,
    }
    sentry_sdk.capture_message(
        f"Tool call {tool} contains too many tokens",
        level="warning",
        tags=tags,
    )
38
+
39
+
40
def capture_structured_output_incorrect_tool_call():
    """Send a warning-level Sentry message about an incorrect structured-output tool call."""
    message = "Structured output incorrect tool call"
    sentry_sdk.capture_message(message, level="warning")
45
+
46
+
47
def capture_sections_none(content: Optional[str]):
    """Report to Sentry that an answer could not be parsed into sections.

    The content is attached as an extra. When it is longer than twice the
    display limit, only its first and last 1500 characters are kept so the
    payload stays small. The full length is attached separately.
    """
    max_part = 1500

    if not content:
        preview = ""
        length = 0
    else:
        length = len(content)
        if length > max_part * 2:
            # Keep the head and the tail; the middle is elided.
            preview = content[:max_part] + "...\n\n..." + content[-max_part:]
        else:
            preview = content

    with sentry_sdk.push_scope() as scope:
        scope.set_extra("content", preview)
        scope.set_extra("content_length", length)
        sentry_sdk.capture_message(
            "Holmes answer couldn't be parsed into sections",
            level="warning",
        )
holmes/utils/stream.py CHANGED
@@ -1,10 +1,17 @@
1
1
  import json
2
+ import logging
2
3
  from enum import Enum
3
- from typing import Generator, Optional, List
4
+ from functools import partial
5
+ from typing import Generator, List, Optional, Union
6
+
4
7
  import litellm
8
+ from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
9
+ from litellm.types.utils import ModelResponse, TextCompletionResponse
5
10
  from pydantic import BaseModel, Field
11
+
6
12
  from holmes.core.investigation_structured_output import process_response_into_sections
7
- from functools import partial
13
+ from holmes.core.llm import TokenCountMetadata, get_llm_usage
14
+ from holmes.utils import sentry_helper
8
15
 
9
16
 
10
17
  class StreamEvents(str, Enum):
@@ -13,6 +20,9 @@ class StreamEvents(str, Enum):
13
20
  TOOL_RESULT = "tool_calling_result"
14
21
  ERROR = "error"
15
22
  AI_MESSAGE = "ai_message"
23
+ APPROVAL_REQUIRED = "approval_required"
24
+ TOKEN_COUNT = "token_count"
25
+ CONVERSATION_HISTORY_COMPACTED = "conversation_history_compacted"
16
26
 
17
27
 
18
28
  class StreamMessage(BaseModel):
@@ -55,12 +65,18 @@ def stream_investigate_formatter(
55
65
  message.data.get("content")
56
66
  )
57
67
 
68
+ if sections is None:
69
+ sentry_helper.capture_sections_none(
70
+ content=message.data.get("content"),
71
+ )
72
+
58
73
  yield create_sse_message(
59
74
  StreamEvents.ANSWER_END.value,
60
75
  {
61
76
  "sections": sections or {},
62
77
  "analysis": text_response,
63
78
  "instructions": runbooks or [],
79
+ "metadata": message.data.get("metadata") or {},
64
80
  },
65
81
  )
66
82
  else:
@@ -76,15 +92,60 @@ def stream_chat_formatter(
76
92
  try:
77
93
  for message in call_stream:
78
94
  if message.event == StreamEvents.ANSWER_END:
95
+ response_data = {
96
+ "analysis": message.data.get("content"),
97
+ "conversation_history": message.data.get("messages"),
98
+ "follow_up_actions": followups,
99
+ "metadata": message.data.get("metadata") or {},
100
+ }
101
+
102
+ yield create_sse_message(StreamEvents.ANSWER_END.value, response_data)
103
+ elif message.event == StreamEvents.APPROVAL_REQUIRED:
104
+ response_data = {
105
+ "analysis": message.data.get("content"),
106
+ "conversation_history": message.data.get("messages"),
107
+ "follow_up_actions": followups,
108
+ }
109
+
110
+ response_data["requires_approval"] = True
111
+ response_data["pending_approvals"] = message.data.get(
112
+ "pending_approvals", []
113
+ )
114
+
79
115
  yield create_sse_message(
80
- StreamEvents.ANSWER_END.value,
81
- {
82
- "analysis": message.data.get("content"),
83
- "conversation_history": message.data.get("messages"),
84
- "follow_up_actions": followups,
85
- },
116
+ StreamEvents.APPROVAL_REQUIRED.value, response_data
86
117
  )
87
118
  else:
88
119
  yield create_sse_message(message.event.value, message.data)
89
120
  except litellm.exceptions.RateLimitError as e:
90
121
  yield create_rate_limit_error_message(str(e))
122
+ except Exception as e:
123
+ logging.error(e)
124
+ if "Model is getting throttled" in str(e): # happens for bedrock
125
+ yield create_rate_limit_error_message(str(e))
126
+ else:
127
+ yield create_sse_error_message(description=str(e), error_code=1, msg=str(e))
128
+
129
+
130
def add_token_count_to_metadata(
    tokens: TokenCountMetadata,
    metadata: dict,
    max_context_size: int,
    maximum_output_token: int,
    full_llm_response: Union[
        ModelResponse, CustomStreamWrapper, TextCompletionResponse
    ],
):
    """Record token accounting for an LLM response in ``metadata`` (in place).

    Sets four keys: ``usage`` (from ``get_llm_usage``), ``tokens`` (the
    dumped ``tokens`` model), ``max_tokens`` and ``max_output_tokens``.
    """
    usage = get_llm_usage(full_llm_response)
    metadata["usage"] = usage
    token_breakdown = tokens.model_dump()
    metadata["tokens"] = token_breakdown
    metadata["max_tokens"] = max_context_size
    metadata["max_output_tokens"] = maximum_output_token
143
+
144
+
145
def build_stream_event_token_count(metadata: dict) -> StreamMessage:
    """Wrap ``metadata`` in a TOKEN_COUNT stream message."""
    payload = {"metadata": metadata}
    return StreamMessage(event=StreamEvents.TOKEN_COUNT, data=payload)
holmes/utils/tags.py CHANGED
@@ -1,9 +1,10 @@
1
+ import json
1
2
  import logging
2
- from typing import Optional
3
- from typing_extensions import Dict, List
4
3
  import re
5
- import json
6
4
  from copy import deepcopy
5
+ from typing import Optional
6
+
7
+ from typing_extensions import Dict, List
7
8
 
8
9
 
9
10
  def stringify_tag(tag: Dict[str, str]) -> Optional[str]: