holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/core/tool_calling_llm.py

@@ -2,8 +2,7 @@ import concurrent.futures
  import json
  import logging
  import textwrap
- from typing import Dict, List, Optional, Type, Union, Callable
-
+ from typing import Any, Callable, Dict, List, Optional, Type, Union

  import sentry_sdk
  from openai import BadRequestError
@@ -14,11 +13,10 @@ from pydantic import BaseModel, Field
  from rich.console import Console

  from holmes.common.env_vars import (
- TEMPERATURE,
- MAX_OUTPUT_TOKEN_RESERVATION,
  LOG_LLM_USAGE_RESPONSE,
+ RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
+ TEMPERATURE,
  )
-
  from holmes.core.investigation_structured_output import (
  DEFAULT_SECTIONS,
  REQUEST_STRUCTURED_OUTPUT_FROM_LLM,
@@ -28,21 +26,42 @@ from holmes.core.investigation_structured_output import (
  )
  from holmes.core.issue import Issue
  from holmes.core.llm import LLM
- from holmes.core.performance_timing import PerformanceTiming
- from holmes.core.resource_instruction import ResourceInstructions
+ from holmes.core.models import (
+ PendingToolApproval,
+ ToolApprovalDecision,
+ ToolCallResult,
+ )
+ from holmes.core.prompt import generate_user_prompt
  from holmes.core.runbooks import RunbookManager
  from holmes.core.safeguards import prevent_overly_repeated_tool_call
- from holmes.core.tools import StructuredToolResult, ToolResultStatus
- from holmes.plugins.prompts import load_and_render_prompt
- from holmes.utils.global_instructions import (
- Instructions,
- add_global_instructions_to_user_prompt,
+ from holmes.core.tools import (
+ StructuredToolResult,
+ StructuredToolResultStatus,
+ ToolInvokeContext,
+ )
+ from holmes.core.tools_utils.tool_context_window_limiter import (
+ prevent_overly_big_tool_response,
  )
- from holmes.utils.tags import format_tags_in_string, parse_messages_tags
  from holmes.core.tools_utils.tool_executor import ToolExecutor
  from holmes.core.tracing import DummySpan
+ from holmes.core.truncation.input_context_window_limiter import (
+ limit_input_context_window,
+ )
+ from holmes.plugins.prompts import load_and_render_prompt
+ from holmes.plugins.runbooks import RunbookCatalog
+ from holmes.utils import sentry_helper
  from holmes.utils.colors import AI_COLOR
- from holmes.utils.stream import StreamEvents, StreamMessage
+ from holmes.utils.global_instructions import (
+ Instructions,
+ generate_runbooks_args,
+ )
+ from holmes.utils.stream import (
+ StreamEvents,
+ StreamMessage,
+ add_token_count_to_metadata,
+ build_stream_event_token_count,
+ )
+ from holmes.utils.tags import parse_messages_tags

  # Create a named logger for cost tracking
  cost_logger = logging.getLogger("holmes.costs")
@@ -119,156 +138,16 @@ def _process_cost_info(
  logging.debug(f"Could not extract cost information: {e}")


- def format_tool_result_data(tool_result: StructuredToolResult) -> str:
- tool_response = tool_result.data
- if isinstance(tool_result.data, str):
- tool_response = tool_result.data
- else:
- try:
- if isinstance(tool_result.data, BaseModel):
- tool_response = tool_result.data.model_dump_json(indent=2)
- else:
- tool_response = json.dumps(tool_result.data, indent=2)
- except Exception:
- tool_response = str(tool_result.data)
- if tool_result.status == ToolResultStatus.ERROR:
- tool_response = f"{tool_result.error or 'Tool execution failed'}:\n\n{tool_result.data or ''}".strip()
- return tool_response
-
-
- # TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
- # However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
- # We should fix this in the future
- # TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
- # token truncation and not character truncation
- def truncate_messages_to_fit_context(
- messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
- ) -> list:
- """
- Helper function to truncate tool messages to fit within context limits.
-
- Args:
- messages: List of message dictionaries with roles and content
- max_context_size: Maximum context window size for the model
- maximum_output_token: Maximum tokens reserved for model output
- count_tokens_fn: Function to count tokens for a list of messages
-
- Returns:
- Modified list of messages with truncated tool responses
-
- Raises:
- Exception: If non-tool messages exceed available context space
- """
- messages_except_tools = [
- message for message in messages if message["role"] != "tool"
- ]
- message_size_without_tools = count_tokens_fn(messages_except_tools)
-
- tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
- reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
- if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
- logging.error(
- f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
- )
- raise Exception(
- f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
- )
-
- if len(tool_call_messages) == 0:
- return messages
-
- available_space = (
- max_context_size - message_size_without_tools - maximum_output_token
- )
- remaining_space = available_space
- tool_call_messages.sort(key=lambda x: len(x["content"]))
-
- # Allocate space starting with small tools and going to larger tools, while maintaining fairness
- # Small tools can often get exactly what they need, while larger tools may need to be truncated
- # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
- for i, msg in enumerate(tool_call_messages):
- remaining_tools = len(tool_call_messages) - i
- max_allocation = remaining_space // remaining_tools
- needed_space = len(msg["content"])
- allocated_space = min(needed_space, max_allocation)
-
- if needed_space > allocated_space:
- truncation_notice = "\n\n[TRUNCATED]"
- # Ensure the indicator fits in the allocated space
- if allocated_space > len(truncation_notice):
- msg["content"] = (
- msg["content"][: allocated_space - len(truncation_notice)]
- + truncation_notice
- )
- logging.info(
- f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space-len(truncation_notice)} tokens"
- )
- else:
- msg["content"] = truncation_notice[:allocated_space]
- logging.info(
- f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space} tokens"
- )
- msg.pop("token_count", None) # Remove token_count if present
-
- remaining_space -= allocated_space
- return messages
-
-
- class ToolCallResult(BaseModel):
- tool_call_id: str
- tool_name: str
- description: str
- result: StructuredToolResult
- size: Optional[int] = None
-
- def as_tool_call_message(self):
- content = format_tool_result_data(self.result)
- if self.result.params:
- content = (
- f"Params used for the tool call: {json.dumps(self.result.params)}. The tool call output follows on the next line.\n"
- + content
- )
- return {
- "tool_call_id": self.tool_call_id,
- "role": "tool",
- "name": self.tool_name,
- "content": content,
- }
-
- def as_tool_result_response(self):
- result_dump = self.result.model_dump()
- result_dump["data"] = self.result.get_stringified_data()
-
- return {
- "tool_call_id": self.tool_call_id,
- "tool_name": self.tool_name,
- "description": self.description,
- "role": "tool",
- "result": result_dump,
- }
-
- def as_streaming_tool_result_response(self):
- result_dump = self.result.model_dump()
- result_dump["data"] = self.result.get_stringified_data()
-
- return {
- "tool_call_id": self.tool_call_id,
- "role": "tool",
- "description": self.description,
- "name": self.tool_name,
- "result": result_dump,
- }
-
-
  class LLMResult(LLMCosts):
  tool_calls: Optional[List[ToolCallResult]] = None
+ num_llm_calls: Optional[int] = None # Number of LLM API calls (turns)
  result: Optional[str] = None
  unprocessed_result: Optional[str] = None
  instructions: List[str] = Field(default_factory=list)
  # TODO: clean up these two
  prompt: Optional[str] = None
  messages: Optional[List[dict]] = None
+ metadata: Optional[Dict[Any, Any]] = None

  def get_tool_usage_summary(self):
  return "AI used info from issue and " + ",".join(
@@ -276,6 +155,12 @@ class LLMResult(LLMCosts):
  )


+ class ToolCallWithDecision(BaseModel):
+ message_index: int
+ tool_call: ChatCompletionMessageToolCall
+ decision: Optional[ToolApprovalDecision]
+
+
  class ToolCallingLLM:
  llm: LLM

@@ -290,11 +175,99 @@ class ToolCallingLLM:
  Callable[[StructuredToolResult], tuple[bool, Optional[str]]]
  ] = None

+ def process_tool_decisions(
+ self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
+ ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
+ """
+ Process tool approval decisions and execute approved tools.
+
+ Args:
+ messages: Current conversation messages
+ tool_decisions: List of ToolApprovalDecision objects
+
+ Returns:
+ Updated messages list with tool execution results
+ """
+ events: list[StreamMessage] = []
+ if not tool_decisions:
+ return messages, events
+
+ # Create decision lookup
+ decisions_by_tool_call_id = {
+ decision.tool_call_id: decision for decision in tool_decisions
+ }
+
+ pending_tool_calls: list[ToolCallWithDecision] = []
+
+ for i in reversed(range(len(messages))):
+ msg = messages[i]
+ if msg.get("role") == "assistant" and msg.get("tool_calls"):
+ message_tool_calls = msg.get("tool_calls", [])
+ for tool_call in message_tool_calls:
+ decision = decisions_by_tool_call_id.get(tool_call.get("id"), None)
+ if tool_call.get("pending_approval"):
+ del tool_call[
+ "pending_approval"
+ ] # Cleanup so that a pending approval is not tagged on message in a future response
+ pending_tool_calls.append(
+ ToolCallWithDecision(
+ tool_call=ChatCompletionMessageToolCall(**tool_call),
+ decision=decision,
+ message_index=i,
+ )
+ )
+
+ if not pending_tool_calls:
+ error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
+ logging.error(error_message)
+ raise Exception(error_message)
+ for tool_call_with_decision in pending_tool_calls:
+ tool_call_message: dict
+ tool_call = tool_call_with_decision.tool_call
+ decision = tool_call_with_decision.decision
+ tool_result: Optional[ToolCallResult] = None
+ if decision and decision.approved:
+ tool_result = self._invoke_llm_tool_call(
+ tool_to_call=tool_call,
+ previous_tool_calls=[],
+ trace_span=DummySpan(), # TODO: replace with proper span
+ tool_number=None,
+ user_approved=True,
+ )
+ else:
+ # Tool was rejected or no decision found, add rejection message
+ tool_result = ToolCallResult(
+ tool_call_id=tool_call.id,
+ tool_name=tool_call.function.name,
+ description=tool_call.function.name,
+ result=StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Tool execution was denied by the user.",
+ ),
+ )
+
+ events.append(
+ StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_result.as_streaming_tool_result_response(),
+ )
+ )
+
+ tool_call_message = tool_result.as_tool_call_message()
+
+ # It is expected that the tool call result directly follows the tool call request from the LLM
+ # The API call may contain a user ask which is appended to the messages so we can't just append
+ # tool call results; they need to be inserted right after the llm's message requesting tool calls
+ messages.insert(
+ tool_call_with_decision.message_index + 1, tool_call_message
+ )
+
+ return messages, events
+
  def prompt_call(
  self,
  system_prompt: str,
  user_prompt: str,
- post_process_prompt: Optional[str] = None,
  response_format: Optional[Union[dict, Type[BaseModel]]] = None,
  sections: Optional[InputSectionsDataType] = None,
  trace_span=DummySpan(),
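A minimal sketch of how a client might resume a conversation after the user decides on pending tool calls, based on the process_tool_decisions flow added above. The ToolApprovalDecision fields (tool_call_id, approved) are inferred from how the method uses them; `llm` and `saved_messages` are assumed stand-ins for a configured ToolCallingLLM and the messages returned alongside an APPROVAL_REQUIRED event.

    # Hedged sketch, not part of the package: resume after approvals.
    from holmes.core.models import ToolApprovalDecision

    decisions = [
        ToolApprovalDecision(tool_call_id="call_123", approved=True),   # run this tool
        ToolApprovalDecision(tool_call_id="call_456", approved=False),  # reject this one
    ]

    for event in llm.call_stream(
        system_prompt=None,            # assumed optional here
        user_prompt=None,
        msgs=saved_messages,           # conversation returned with APPROVAL_REQUIRED
        enable_tool_approval=True,
        tool_decisions=decisions,
    ):
        print(event.event, event.data)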
@@ -305,8 +278,7 @@ class ToolCallingLLM:
  ]
  return self.call(
  messages,
- post_process_prompt,
- response_format,
+ response_format=response_format,
  user_prompt=user_prompt,
  sections=sections,
  trace_span=trace_span,
@@ -315,55 +287,52 @@
  def messages_call(
  self,
  messages: List[Dict[str, str]],
- post_process_prompt: Optional[str] = None,
  response_format: Optional[Union[dict, Type[BaseModel]]] = None,
  trace_span=DummySpan(),
  ) -> LLMResult:
  return self.call(
- messages, post_process_prompt, response_format, trace_span=trace_span
+ messages, response_format=response_format, trace_span=trace_span
  )

  @sentry_sdk.trace
  def call( # type: ignore
  self,
  messages: List[Dict[str, str]],
- post_process_prompt: Optional[str] = None,
  response_format: Optional[Union[dict, Type[BaseModel]]] = None,
  user_prompt: Optional[str] = None,
  sections: Optional[InputSectionsDataType] = None,
  trace_span=DummySpan(),
  tool_number_offset: int = 0,
  ) -> LLMResult:
- perf_timing = PerformanceTiming("tool_calling_llm.call")
- tool_calls = [] # type: ignore
+ tool_calls: list[
+ dict
+ ] = [] # Used for preventing repeated tool calls. potentially reset after compaction
+ all_tool_calls = [] # type: ignore
  costs = LLMCosts()
-
  tools = self.tool_executor.get_all_tools_openai_format(
  target_model=self.llm.model
  )
- perf_timing.measure("get_all_tools_openai_format")
  max_steps = self.max_steps
  i = 0
-
+ metadata: Dict[Any, Any] = {}
  while i < max_steps:
  i += 1
- perf_timing.measure(f"start iteration {i}")
  logging.debug(f"running iteration {i}")
  # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
  tools = None if i == max_steps else tools
  tool_choice = "auto" if tools else None

- total_tokens = self.llm.count_tokens_for_message(messages)
- max_context_size = self.llm.get_context_window_size()
- maximum_output_token = self.llm.get_maximum_output_token()
- perf_timing.measure("count tokens")
+ limit_result = limit_input_context_window(
+ llm=self.llm, messages=messages, tools=tools
+ )
+ messages = limit_result.messages
+ metadata = metadata | limit_result.metadata

- if (total_tokens + maximum_output_token) > max_context_size:
- logging.warning("Token limit exceeded. Truncating tool responses.")
- messages = self.truncate_messages_to_fit_context(
- messages, max_context_size, maximum_output_token
- )
- perf_timing.measure("truncate_messages_to_fit_context")
+ if (
+ limit_result.conversation_history_compacted
+ and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+ ):
+ tool_calls = []

  logging.debug(f"sending messages={messages}\n\ntools={tools}")

@@ -381,7 +350,6 @@
  # Extract and accumulate cost information
  _process_cost_info(full_response, costs, "LLM call")

- perf_timing.measure("llm.completion")
  # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
  except BadRequestError as e:
  if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -405,9 +373,10 @@

  if incorrect_tool_call:
  logging.warning(
- "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
+ "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
  )
  # disable structured output going forward and and retry
+ sentry_helper.capture_structured_output_incorrect_tool_call()
  response_format = None
  max_steps = max_steps + 1
  continue
@@ -424,42 +393,29 @@
  hasattr(response_message, "reasoning_content")
  and response_message.reasoning_content
  ):
- logging.debug(
- f"[bold {AI_COLOR}]AI (reasoning) 🤔:[/bold {AI_COLOR}] {response_message.reasoning_content}\n"
+ logging.info(
+ f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
  )

  if not tools_to_call:
- # For chatty models post process and summarize the result
- # this only works for calls where user prompt is explicitly passed through
- if post_process_prompt and user_prompt:
- logging.info("Running post processing on investigation.")
- raw_response = text_response
- post_processed_response, post_processing_cost = (
- self._post_processing_call(
- prompt=user_prompt,
- investigation=raw_response,
- user_prompt=post_process_prompt,
- )
- )
- costs.total_cost += post_processing_cost
-
- perf_timing.end(f"- completed in {i} iterations -")
- return LLMResult(
- result=post_processed_response,
- unprocessed_result=raw_response,
- tool_calls=tool_calls,
- prompt=json.dumps(messages, indent=2),
- messages=messages,
- **costs.model_dump(), # Include all cost fields
- )
+ tokens = self.llm.count_tokens(messages=messages, tools=tools)
+
+ add_token_count_to_metadata(
+ tokens=tokens,
+ full_llm_response=full_response,
+ max_context_size=limit_result.max_context_size,
+ maximum_output_token=limit_result.maximum_output_token,
+ metadata=metadata,
+ )

- perf_timing.end(f"- completed in {i} iterations -")
  return LLMResult(
  result=text_response,
- tool_calls=tool_calls,
+ tool_calls=all_tool_calls,
+ num_llm_calls=i,
  prompt=json.dumps(messages, indent=2),
  messages=messages,
  **costs.model_dump(), # Include all cost fields
+ metadata=metadata,
  )

  if text_response and text_response.strip():
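The loop above replaces the old manual truncation with limit_input_context_window. A hedged sketch of the result object it appears to return, with field names inferred only from the attribute accesses in this diff (the real definition lives in the new holmes/core/truncation/input_context_window_limiter.py, which is not shown here and may differ):

    # Inferred shape only; class name and defaults are assumptions.
    from dataclasses import dataclass, field
    from typing import Any, Dict, List

    @dataclass
    class InputContextWindowLimitResult:  # hypothetical name
        messages: List[dict]                                    # possibly truncated/compacted history
        metadata: Dict[Any, Any] = field(default_factory=dict)  # merged into LLMResult.metadata
        events: List[Any] = field(default_factory=list)         # StreamMessages yielded by call_stream()
        conversation_history_compacted: bool = False            # triggers the repeated-tool-call reset
        max_context_size: int = 0                                # fed to add_token_count_to_metadata
        maximum_output_token: int = 0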
@@ -467,7 +423,6 @@
  logging.info(
  f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
  )
- perf_timing.measure("pre-tool-calls")
  with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
  futures = []
  futures_tool_numbers: dict[
@@ -477,6 +432,7 @@
  for tool_index, t in enumerate(tools_to_call, 1):
  logging.debug(f"Tool to call: {t}")
  tool_number = tool_number_offset + tool_index
+
  future = executor.submit(
  self._invoke_llm_tool_call,
  tool_to_call=t,
@@ -495,14 +451,24 @@
  if future in futures_tool_numbers
  else None
  )
- tool_call_result = self.handle_tool_call_approval(
- tool_call_result=tool_call_result, tool_number=tool_number
- )

- tool_calls.append(tool_call_result.as_tool_result_response())
- messages.append(tool_call_result.as_tool_call_message())
+ if (
+ tool_call_result.result.status
+ == StructuredToolResultStatus.APPROVAL_REQUIRED
+ ):
+ tool_call_result = self._handle_tool_call_approval(
+ tool_call_result=tool_call_result,
+ tool_number=tool_number,
+ trace_span=trace_span,
+ )

- perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+ tool_result_response_dict = (
+ tool_call_result.as_tool_result_response()
+ )
+ tool_calls.append(tool_result_response_dict)
+ all_tool_calls.append(tool_result_response_dict)
+ messages.append(tool_call_result.as_tool_call_message())
+ tokens = self.llm.count_tokens(messages=messages, tools=tools)

  # Update the tool number offset for the next iteration
  tool_number_offset += len(tools_to_call)
@@ -513,91 +479,55 @@

  raise Exception(f"Too many LLM calls - exceeded max_steps: {i}/{max_steps}")

- def _directly_invoke_tool(
+ def _directly_invoke_tool_call(
  self,
  tool_name: str,
  tool_params: dict,
  user_approved: bool,
- trace_span=DummySpan(),
+ tool_call_id: str,
  tool_number: Optional[int] = None,
  ) -> StructuredToolResult:
- tool_span = trace_span.start_span(name=tool_name, type="tool")
  tool = self.tool_executor.get_tool_by_name(tool_name)
- tool_response = None
+ if not tool:
+ logging.warning(
+ f"Skipping tool execution for {tool_name}: args: {tool_params}"
+ )
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=f"Failed to find tool {tool_name}",
+ params=tool_params,
+ )
+
  try:
- if (not tool) or (tool_params is None):
- logging.warning(
- f"Skipping tool execution for {tool_name}: args: {tool_params}"
- )
- tool_response = StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error=f"Failed to find tool {tool_name}",
- params=tool_params,
- )
- else:
- tool_response = tool.invoke(
- tool_params, tool_number=tool_number, user_approved=user_approved
- )
+ invoke_context = ToolInvokeContext(
+ tool_number=tool_number,
+ user_approved=user_approved,
+ llm=self.llm,
+ max_token_count=self.llm.get_max_token_count_for_single_tool(),
+ tool_name=tool_name,
+ tool_call_id=tool_call_id,
+ )
+ tool_response = tool.invoke(tool_params, context=invoke_context)
  except Exception as e:
  logging.error(
  f"Tool call to {tool_name} failed with an Exception", exc_info=True
  )
  tool_response = StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Tool call failed: {e}",
  params=tool_params,
  )
-
- # Log error to trace span
- tool_span.log(
- input=tool_params, output=str(e), metadata={"status": "ERROR"}
- )
-
- tool_span.log(
- input=tool_params,
- output=tool_response.data,
- metadata={
- "status": tool_response.status.value,
- "error": tool_response.error,
- "description": tool.get_parameterized_one_liner(tool_params)
- if tool
- else "",
- "structured_tool_result": tool_response,
- },
- )
- tool_span.end()
-
  return tool_response

- def _invoke_llm_tool_call(
+ def _get_tool_call_result(
  self,
- tool_to_call: ChatCompletionMessageToolCall,
+ tool_call_id: str,
+ tool_name: str,
+ tool_arguments: str,
+ user_approved: bool,
  previous_tool_calls: list[dict],
- trace_span=DummySpan(),
- tool_number=None,
+ tool_number: Optional[int] = None,
  ) -> ToolCallResult:
- # Handle the union type - ChatCompletionMessageToolCall can be either
- # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
- # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
- # We use hasattr to check for the 'function' attribute as it's more flexible
- # and doesn't require importing the specific type.
- if hasattr(tool_to_call, "function"):
- tool_name = tool_to_call.function.name
- tool_arguments = tool_to_call.function.arguments
- else:
- # This is a custom tool call - we don't support these currently
- logging.error(f"Unsupported custom tool call: {tool_to_call}")
- return ToolCallResult(
- tool_call_id=tool_to_call.id,
- tool_name="unknown",
- description="NA",
- result=StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error="Custom tool calls are not supported",
- params=None,
- ),
- )
-
  tool_params = {}
  try:
  tool_params = json.loads(tool_arguments)
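The hunk above shows tools now being invoked with a single ToolInvokeContext rather than loose keyword arguments. A hedged sketch of a tool consuming that context; the EchoTool class is hypothetical and the Tool interface details are assumptions based only on the constructor call above:

    # Illustrative only; not part of the package.
    from holmes.core.tools import (
        StructuredToolResult,
        StructuredToolResultStatus,
        ToolInvokeContext,
    )

    class EchoTool:  # hypothetical tool, for illustration
        def invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
            # context is assumed to expose llm, tool_name, tool_call_id, tool_number,
            # user_approved and max_token_count, per the ToolInvokeContext(...) call above
            return StructuredToolResult(
                status=StructuredToolResultStatus.SUCCESS,
                data={"echo": params, "tool_call_id": context.tool_call_id},
                params=params,
            )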
@@ -606,21 +536,21 @@
  f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
  )

- tool_call_id = tool_to_call.id
-
- tool_response = prevent_overly_repeated_tool_call(
- tool_name=tool_name,
- tool_params=tool_params,
- tool_calls=previous_tool_calls,
- )
+ tool_response = None
+ if not user_approved:
+ tool_response = prevent_overly_repeated_tool_call(
+ tool_name=tool_name,
+ tool_params=tool_params,
+ tool_calls=previous_tool_calls,
+ )

  if not tool_response:
- tool_response = self._directly_invoke_tool(
+ tool_response = self._directly_invoke_tool_call(
  tool_name=tool_name,
  tool_params=tool_params,
- user_approved=False,
- trace_span=trace_span,
+ user_approved=user_approved,
  tool_number=tool_number,
+ tool_call_id=tool_call_id,
  )

  if not isinstance(tool_response, StructuredToolResult):
@@ -629,124 +559,165 @@
  f"Tool {tool_name} return type is not StructuredToolResult. Nesting the tool result into StructuredToolResult..."
  )
  tool_response = StructuredToolResult(
- status=ToolResultStatus.SUCCESS,
+ status=StructuredToolResultStatus.SUCCESS,
  data=tool_response,
  params=tool_params,
  )

  tool = self.tool_executor.get_tool_by_name(tool_name)
+
  return ToolCallResult(
  tool_call_id=tool_call_id,
  tool_name=tool_name,
- description=tool.get_parameterized_one_liner(tool_params) if tool else "",
+ description=str(tool.get_parameterized_one_liner(tool_params))
+ if tool
+ else "",
  result=tool_response,
  )

- def handle_tool_call_approval(
- self, tool_call_result: ToolCallResult, tool_number: Optional[int]
+ @staticmethod
+ def _log_tool_call_result(
+ tool_span,
+ tool_call_result: ToolCallResult,
+ approval_possible=True,
+ original_token_count=None,
+ ):
+ tool_span.set_attributes(name=tool_call_result.tool_name)
+ status = tool_call_result.result.status
+
+ if (
+ status == StructuredToolResultStatus.APPROVAL_REQUIRED
+ and not approval_possible
+ ):
+ status = StructuredToolResultStatus.ERROR
+
+ if status == StructuredToolResultStatus.ERROR:
+ error = (
+ tool_call_result.result.error
+ if tool_call_result.result.error
+ else "Unspecified error"
+ )
+ else:
+ error = None
+ tool_span.log(
+ input=tool_call_result.result.params,
+ output=tool_call_result.result.data,
+ error=error,
+ metadata={
+ "status": status,
+ "description": tool_call_result.description,
+ "return_code": tool_call_result.result.return_code,
+ "error": tool_call_result.result.error,
+ "original_token_count": original_token_count,
+ },
+ )
+
+ def _invoke_llm_tool_call(
+ self,
+ tool_to_call: ChatCompletionMessageToolCall,
+ previous_tool_calls: list[dict],
+ trace_span=None,
+ tool_number=None,
+ user_approved: bool = False,
+ ) -> ToolCallResult:
+ if trace_span is None:
+ trace_span = DummySpan()
+ with trace_span.start_span(type="tool") as tool_span:
+ if not hasattr(tool_to_call, "function"):
+ # Handle the union type - ChatCompletionMessageToolCall can be either
+ # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
+ # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
+ # We use hasattr to check for the 'function' attribute as it's more flexible
+ # and doesn't require importing the specific type.
+ tool_name = "Unknown_Custom_Tool"
+ logging.error(f"Unsupported custom tool call: {tool_to_call}")
+ tool_call_result = ToolCallResult(
+ tool_call_id=tool_to_call.id,
+ tool_name=tool_name,
+ description="NA",
+ result=StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Custom tool calls are not supported",
+ params=None,
+ ),
+ )
+ else:
+ tool_name = tool_to_call.function.name
+ tool_arguments = tool_to_call.function.arguments
+ tool_id = tool_to_call.id
+ tool_call_result = self._get_tool_call_result(
+ tool_id,
+ tool_name,
+ tool_arguments,
+ previous_tool_calls=previous_tool_calls,
+ tool_number=tool_number,
+ user_approved=user_approved,
+ )
+
+ original_token_count = prevent_overly_big_tool_response(
+ tool_call_result=tool_call_result, llm=self.llm
+ )
+
+ ToolCallingLLM._log_tool_call_result(
+ tool_span,
+ tool_call_result,
+ self.approval_callback is not None,
+ original_token_count,
+ )
+ return tool_call_result
+
+ def _handle_tool_call_approval(
+ self,
+ tool_call_result: ToolCallResult,
+ tool_number: Optional[int],
+ trace_span: Any,
  ) -> ToolCallResult:
  """
  Handle approval for a single tool call if required.

  Args:
  tool_call_result: A single tool call result that may require approval
+ tool_number: The tool call number

  Returns:
  Updated tool call result with approved/denied status
  """

- if tool_call_result.result.status != ToolResultStatus.APPROVAL_REQUIRED:
- return tool_call_result
-
  # If no approval callback, convert to ERROR because it is assumed the client may not be able to handle approvals
  if not self.approval_callback:
- tool_call_result.result.status = ToolResultStatus.ERROR
+ tool_call_result.result.status = StructuredToolResultStatus.ERROR
  return tool_call_result

  # Get approval from user
- approved, feedback = self.approval_callback(tool_call_result.result)
-
- if approved:
- logging.debug(
- f"User approved command: {tool_call_result.result.invocation}"
- )
-
- new_response = self._directly_invoke_tool(
- tool_name=tool_call_result.tool_name,
- tool_params=tool_call_result.result.params or {},
- user_approved=True,
- trace_span=DummySpan(),
- tool_number=tool_number,
- )
- tool_call_result.result = new_response
- else:
- # User denied - update to error
- feedback_text = f" User feedback: {feedback}" if feedback else ""
- tool_call_result.result.status = ToolResultStatus.ERROR
- tool_call_result.result.error = (
- f"User denied command execution.{feedback_text}"
- )
-
- return tool_call_result
-
- @staticmethod
- def __load_post_processing_user_prompt(
- input_prompt, investigation, user_prompt: Optional[str] = None
- ) -> str:
- if not user_prompt:
- user_prompt = "builtin://generic_post_processing.jinja2"
- return load_and_render_prompt(
- user_prompt, {"investigation": investigation, "prompt": input_prompt}
- )
-
- def _post_processing_call(
- self,
- prompt,
- investigation,
- user_prompt: Optional[str] = None,
- system_prompt: str = "You are an AI assistant summarizing Kubernetes issues.",
- ) -> tuple[Optional[str], float]:
- try:
- user_prompt = ToolCallingLLM.__load_post_processing_user_prompt(
- prompt, investigation, user_prompt
- )
-
- logging.debug(f'Post processing prompt:\n"""\n{user_prompt}\n"""')
- messages = [
- {
- "role": "system",
- "content": system_prompt,
- },
- {
- "role": "user",
- "content": format_tags_in_string(user_prompt),
- },
- ]
- full_response = self.llm.completion(messages=messages, temperature=0)
- logging.debug(f"Post processing response {full_response}")
-
- # Extract and log cost information for post-processing
- post_processing_cost = _extract_cost_from_response(full_response)
- if post_processing_cost > 0:
- cost_logger.debug(
- f"Post-processing LLM cost: ${post_processing_cost:.6f}"
+ with trace_span.start_span(
+ type="task", name=f"Ask approval for {tool_call_result.tool_name}"
+ ):
+ approved, feedback = self.approval_callback(tool_call_result.result)
+
+ # Note - Tool calls are currently logged twice, once when returning APPROVAL_REQUIRED and once here
+ with trace_span.start_span(type="tool") as tool_span:
+ if approved:
+ logging.debug(
+ f"User approved command: {tool_call_result.result.invocation}"
  )
+ new_response = self._directly_invoke_tool_call(
+ tool_name=tool_call_result.tool_name,
+ tool_params=tool_call_result.result.params or {},
+ user_approved=True,
+ tool_number=tool_number,
+ tool_call_id=tool_call_result.tool_call_id,
+ )
+ tool_call_result.result = new_response
+ else:
+ # User denied - update to error
+ feedback_text = f" User feedback: {feedback}" if feedback else ""
+ tool_call_result.result.status = StructuredToolResultStatus.ERROR
+ tool_call_result.result.error = (
+ f"User denied command execution.{feedback_text}"
+ )
+ ToolCallingLLM._log_tool_call_result(tool_span, tool_call_result)

- return full_response.choices[0].message.content, post_processing_cost # type: ignore
- except Exception:
- logging.exception("Failed to run post processing", exc_info=True)
- return investigation, 0.0
-
- @sentry_sdk.trace
- def truncate_messages_to_fit_context(
- self, messages: list, max_context_size: int, maximum_output_token: int
- ) -> list:
- return truncate_messages_to_fit_context(
- messages,
- max_context_size,
- maximum_output_token,
- self.llm.count_tokens_for_message,
- )
+ return tool_call_result

  def call_stream(
  self,
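_handle_tool_call_approval above relies on self.approval_callback, whose signature Callable[[StructuredToolResult], tuple[bool, Optional[str]]] appears earlier in this file. A hedged sketch of an interactive callback a host application might register; the CLI prompt and the attribute assignment are illustrative assumptions:

    # Minimal sketch, not part of the package.
    from typing import Optional, Tuple
    from holmes.core.tools import StructuredToolResult

    def cli_approval_callback(result: StructuredToolResult) -> Tuple[bool, Optional[str]]:
        # result.invocation is the command awaiting approval (see _handle_tool_call_approval)
        answer = input(f"Run `{result.invocation}`? [y/N] ").strip().lower()
        if answer == "y":
            return True, None
        return False, "rejected from the CLI"

    # llm.approval_callback = cli_approval_callback  # assumed wiring done by the host app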
@@ -755,47 +726,55 @@
  response_format: Optional[Union[dict, Type[BaseModel]]] = None,
  sections: Optional[InputSectionsDataType] = None,
  msgs: Optional[list[dict]] = None,
+ enable_tool_approval: bool = False,
+ tool_decisions: List[ToolApprovalDecision] | None = None,
  ):
  """
  This function DOES NOT call llm.completion(stream=true).
  This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
  """
- messages = []
+
+ # Process tool decisions if provided
+ if msgs and tool_decisions:
+ logging.info(f"Processing {len(tool_decisions)} tool decisions")
+ msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+ yield from events
+
+ messages: list[dict] = []
  if system_prompt:
  messages.append({"role": "system", "content": system_prompt})
  if user_prompt:
  messages.append({"role": "user", "content": user_prompt})
  if msgs:
  messages.extend(msgs)
- perf_timing = PerformanceTiming("tool_calling_llm.call")
  tool_calls: list[dict] = []
  tools = self.tool_executor.get_all_tools_openai_format(
  target_model=self.llm.model
  )
- perf_timing.measure("get_all_tools_openai_format")
  max_steps = self.max_steps
+ metadata: Dict[Any, Any] = {}
  i = 0
  tool_number_offset = 0

  while i < max_steps:
  i += 1
- perf_timing.measure(f"start iteration {i}")
  logging.debug(f"running iteration {i}")

  tools = None if i == max_steps else tools
  tool_choice = "auto" if tools else None

- total_tokens = self.llm.count_tokens_for_message(messages) # type: ignore
- max_context_size = self.llm.get_context_window_size()
- maximum_output_token = self.llm.get_maximum_output_token()
- perf_timing.measure("count tokens")
+ limit_result = limit_input_context_window(
+ llm=self.llm, messages=messages, tools=tools
+ )
+ yield from limit_result.events
+ messages = limit_result.messages
+ metadata = metadata | limit_result.metadata

- if (total_tokens + maximum_output_token) > max_context_size:
- logging.warning("Token limit exceeded. Truncating tool responses.")
- messages = self.truncate_messages_to_fit_context(
- messages, max_context_size, maximum_output_token
- )
- perf_timing.measure("truncate_messages_to_fit_context")
+ if (
+ limit_result.conversation_history_compacted
+ and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+ ):
+ tool_calls = []

  logging.debug(f"sending messages={messages}\n\ntools={tools}")
  try:
@@ -812,7 +791,6 @@
  # Log cost information for this iteration (no accumulation in streaming)
  _process_cost_info(full_response, log_prefix="LLM iteration")

- perf_timing.measure("llm.completion")
  # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
  except BadRequestError as e:
  if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -834,9 +812,10 @@

  if incorrect_tool_call:
  logging.warning(
- "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
+ "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
  )
  # disable structured output going forward and and retry
+ sentry_helper.capture_structured_output_incorrect_tool_call()
  response_format = None
  max_steps = max_steps + 1
  continue
@@ -847,11 +826,25 @@
  )
  )

+ tokens = self.llm.count_tokens(messages=messages, tools=tools)
+ add_token_count_to_metadata(
+ tokens=tokens,
+ full_llm_response=full_response,
+ max_context_size=limit_result.max_context_size,
+ maximum_output_token=limit_result.maximum_output_token,
+ metadata=metadata,
+ )
+ yield build_stream_event_token_count(metadata=metadata)
+
  tools_to_call = getattr(response_message, "tool_calls", None)
  if not tools_to_call:
  yield StreamMessage(
  event=StreamEvents.ANSWER_END,
- data={"content": response_message.content, "messages": messages},
+ data={
+ "content": response_message.content,
+ "messages": messages,
+ "metadata": metadata,
+ },
  )
  return
@@ -860,14 +853,22 @@
  if reasoning or message:
  yield StreamMessage(
  event=StreamEvents.AI_MESSAGE,
- data={"content": message, "reasoning": reasoning},
+ data={
+ "content": message,
+ "reasoning": reasoning,
+ "metadata": metadata,
+ },
  )

- perf_timing.measure("pre-tool-calls")
+ # Check if any tools require approval first
+ pending_approvals = []
+ approval_required_tools = []
+
  with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
  futures = []
  for tool_index, t in enumerate(tools_to_call, 1): # type: ignore
  tool_number = tool_number_offset + tool_index
+
  future = executor.submit(
  self._invoke_llm_tool_call,
  tool_to_call=t, # type: ignore
@@ -884,15 +885,72 @@
  for future in concurrent.futures.as_completed(futures):
  tool_call_result: ToolCallResult = future.result()

- tool_calls.append(tool_call_result.as_tool_result_response())
- messages.append(tool_call_result.as_tool_call_message())
+ if (
+ tool_call_result.result.status
+ == StructuredToolResultStatus.APPROVAL_REQUIRED
+ ):
+ if enable_tool_approval:
+ pending_approvals.append(
+ PendingToolApproval(
+ tool_call_id=tool_call_result.tool_call_id,
+ tool_name=tool_call_result.tool_name,
+ description=tool_call_result.description,
+ params=tool_call_result.result.params or {},
+ )
+ )
+ approval_required_tools.append(tool_call_result)
+
+ yield StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_call_result.as_streaming_tool_result_response(),
+ )
+ else:
+ tool_call_result.result.status = (
+ StructuredToolResultStatus.ERROR
+ )
+ tool_call_result.result.error = f"Tool call rejected for security reasons: {tool_call_result.result.error}"
+
+ tool_calls.append(
+ tool_call_result.as_tool_result_response()
+ )
+ messages.append(tool_call_result.as_tool_call_message())
+
+ yield StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_call_result.as_streaming_tool_result_response(),
+ )
+
+ else:
+ tool_calls.append(tool_call_result.as_tool_result_response())
+ messages.append(tool_call_result.as_tool_call_message())
+
+ yield StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_call_result.as_streaming_tool_result_response(),
+ )

- perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+ # If we have approval required tools, end the stream with pending approvals
+ if pending_approvals:
+ # Add assistant message with pending tool calls
+ for result in approval_required_tools:
+ tool_call = self.find_assistant_tool_call_request(
+ tool_call_id=result.tool_call_id, messages=messages
+ )
+ tool_call["pending_approval"] = True

+ # End stream with approvals required
  yield StreamMessage(
- event=StreamEvents.TOOL_RESULT,
- data=tool_call_result.as_streaming_tool_result_response(),
+ event=StreamEvents.APPROVAL_REQUIRED,
+ data={
+ "content": None,
+ "messages": messages,
+ "pending_approvals": [
+ approval.model_dump() for approval in pending_approvals
+ ],
+ "requires_approval": True,
+ },
  )
+ return

  # Update the tool number offset for the next iteration
  tool_number_offset += len(tools_to_call)
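A hedged sketch of a client consuming call_stream with the approval flow added above. The StreamEvents members used (AI_MESSAGE, TOOL_RESULT, APPROVAL_REQUIRED, ANSWER_END) and the data keys all appear in the code above; `llm`, `system_prompt`, and the exact client-side handling are assumptions:

    # Illustrative consumer; not part of the package.
    from holmes.utils.stream import StreamEvents

    pending = None
    for msg in llm.call_stream(
        system_prompt=system_prompt,
        user_prompt="why is my pod crashing?",
        enable_tool_approval=True,
    ):
        if msg.event == StreamEvents.TOOL_RESULT:
            print("tool:", msg.data["name"], msg.data["result"]["status"])
        elif msg.event == StreamEvents.APPROVAL_REQUIRED:
            pending = msg.data  # carries "messages" and "pending_approvals" for the resume call
            break
        elif msg.event == StreamEvents.ANSWER_END:
            print(msg.data["content"])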
@@ -901,6 +959,21 @@
  f"Too many LLM calls - exceeded max_steps: {i}/{self.max_steps}"
  )

+ def find_assistant_tool_call_request(
+ self, tool_call_id: str, messages: list[dict[str, Any]]
+ ) -> dict[str, Any]:
+ for message in messages:
+ if message.get("role") == "assistant":
+ for tool_call in message.get("tool_calls", []):
+ if tool_call.get("id") == tool_call_id:
+ return tool_call
+
+ # Should not happen unless there is a bug.
+ # If we are here
+ raise Exception(
+ f"Failed to find assistant request for a tool_call in conversation history. tool_call_id={tool_call_id}"
+ )
+

  # TODO: consider getting rid of this entirely and moving templating into the cmds in holmes_cli.py
  class IssueInvestigator(ToolCallingLLM):
@@ -927,14 +1000,13 @@
  self,
  issue: Issue,
  prompt: str,
- instructions: Optional[ResourceInstructions],
  console: Optional[Console] = None,
  global_instructions: Optional[Instructions] = None,
- post_processing_prompt: Optional[str] = None,
  sections: Optional[InputSectionsDataType] = None,
  trace_span=DummySpan(),
+ runbooks: Optional[RunbookCatalog] = None,
  ) -> LLMResult:
- runbooks = self.runbook_manager.get_instructions_for_issue(issue)
+ issue_runbooks = self.runbook_manager.get_instructions_for_issue(issue)

  request_structured_output_from_llm = True
  response_format = None
@@ -962,12 +1034,9 @@
  else:
  logging.info("Structured output is disabled for this request")

- if instructions is not None and instructions.instructions:
- runbooks.extend(instructions.instructions)
-
  if console and runbooks:
  console.print(
- f"[bold]Analyzing with {len(runbooks)} runbooks: {runbooks}[/bold]"
+ f"[bold]Analyzing with {len(issue_runbooks)} runbooks: {issue_runbooks}[/bold]"
  )
  elif console:
  console.print(
@@ -982,29 +1051,22 @@
  "structured_output": request_structured_output_from_llm,
  "toolsets": self.tool_executor.toolsets,
  "cluster_name": self.cluster_name,
+ "runbooks_enabled": True if runbooks else False,
  },
  )

- if instructions is not None and len(instructions.documents) > 0:
- docPrompts = []
- for document in instructions.documents:
- docPrompts.append(
- f"* fetch information from this URL: {document.url}\n"
- )
- runbooks.extend(docPrompts)
-
- user_prompt = ""
- if runbooks:
- for runbook_str in runbooks:
- user_prompt += f"* {runbook_str}\n"
+ base_user = ""
+ base_user = f"{base_user}\n #This is context from the issue:\n{issue.raw}"

- user_prompt = f'My instructions to check \n"""{user_prompt}"""'
-
- user_prompt = add_global_instructions_to_user_prompt(
- user_prompt, global_instructions
+ runbooks_ctx = generate_runbooks_args(
+ runbook_catalog=runbooks,
+ global_instructions=global_instructions,
+ issue_instructions=issue_runbooks,
+ )
+ user_prompt = generate_user_prompt(
+ base_user,
+ runbooks_ctx,
  )
- user_prompt = f"{user_prompt}\n This is context from the issue {issue.raw}"
-
  logging.debug(
  "Rendered system prompt:\n%s", textwrap.indent(system_prompt, " ")
  )
@@ -1013,10 +1075,9 @@
  res = self.prompt_call(
  system_prompt,
  user_prompt,
- post_processing_prompt,
  response_format=response_format,
  sections=sections,
  trace_span=trace_span,
  )
- res.instructions = runbooks
+ res.instructions = issue_runbooks
  return res
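A hedged migration sketch for callers of IssueInvestigator.investigate(), reflecting only the signature changes visible in the hunks above: the `instructions` and `post_processing_prompt` parameters are gone and an optional RunbookCatalog is passed instead. The `investigator`, `issue`, `console`, `global_instructions`, and `runbook_catalog` variables, and the prompt path value, are illustrative assumptions:

    # Assumed caller-side usage in 0.18.x; not taken from the package itself.
    from holmes.plugins.runbooks import RunbookCatalog

    result = investigator.investigate(
        issue=issue,
        prompt="builtin://generic_investigation.jinja2",  # assumed prompt reference
        console=console,
        global_instructions=global_instructions,
        sections=None,
        runbooks=runbook_catalog,  # Optional[RunbookCatalog], new in this version
    )
    print(result.result, result.num_llm_calls)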