holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +17 -4
  3. holmes/common/env_vars.py +40 -1
  4. holmes/config.py +114 -144
  5. holmes/core/conversations.py +53 -14
  6. holmes/core/feedback.py +191 -0
  7. holmes/core/investigation.py +18 -22
  8. holmes/core/llm.py +489 -88
  9. holmes/core/models.py +103 -1
  10. holmes/core/openai_formatting.py +13 -0
  11. holmes/core/prompt.py +1 -1
  12. holmes/core/safeguards.py +4 -4
  13. holmes/core/supabase_dal.py +293 -100
  14. holmes/core/tool_calling_llm.py +423 -323
  15. holmes/core/tools.py +311 -33
  16. holmes/core/tools_utils/token_counting.py +14 -0
  17. holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
  18. holmes/core/tools_utils/tool_executor.py +13 -8
  19. holmes/core/toolset_manager.py +155 -4
  20. holmes/core/tracing.py +6 -1
  21. holmes/core/transformers/__init__.py +23 -0
  22. holmes/core/transformers/base.py +62 -0
  23. holmes/core/transformers/llm_summarize.py +174 -0
  24. holmes/core/transformers/registry.py +122 -0
  25. holmes/core/transformers/transformer.py +31 -0
  26. holmes/core/truncation/compaction.py +59 -0
  27. holmes/core/truncation/dal_truncation_utils.py +23 -0
  28. holmes/core/truncation/input_context_window_limiter.py +218 -0
  29. holmes/interactive.py +177 -24
  30. holmes/main.py +7 -4
  31. holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
  32. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  33. holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
  34. holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
  35. holmes/plugins/prompts/generic_ask.jinja2 +2 -4
  36. holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
  37. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
  38. holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
  39. holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
  40. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
  41. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
  42. holmes/plugins/runbooks/__init__.py +117 -18
  43. holmes/plugins/runbooks/catalog.json +2 -0
  44. holmes/plugins/toolsets/__init__.py +21 -8
  45. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  46. holmes/plugins/toolsets/aks.yaml +64 -0
  47. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
  48. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
  49. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
  50. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
  51. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
  52. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
  53. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
  54. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
  55. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
  56. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
  57. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
  58. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
  59. holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
  60. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  61. holmes/plugins/toolsets/cilium.yaml +284 -0
  62. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  63. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  64. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  65. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
  66. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  67. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
  68. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
  69. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
  70. holmes/plugins/toolsets/git.py +51 -46
  71. holmes/plugins/toolsets/grafana/common.py +15 -3
  72. holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
  73. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
  74. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
  75. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
  76. holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
  77. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
  78. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  79. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
  80. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  81. holmes/plugins/toolsets/internet/internet.py +6 -7
  82. holmes/plugins/toolsets/internet/notion.py +5 -6
  83. holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
  84. holmes/plugins/toolsets/kafka.py +25 -36
  85. holmes/plugins/toolsets/kubernetes.yaml +58 -84
  86. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  87. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  88. holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
  89. holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
  90. holmes/plugins/toolsets/newrelic/__init__.py +0 -0
  91. holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
  92. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
  93. holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
  94. holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
  95. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  96. holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  97. holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
  98. holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
  99. holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
  100. holmes/plugins/toolsets/openshift.yaml +283 -0
  101. holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
  102. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
  103. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  104. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
  105. holmes/plugins/toolsets/robusta/robusta.py +236 -65
  106. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  107. holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
  108. holmes/plugins/toolsets/service_discovery.py +1 -1
  109. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  110. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  111. holmes/plugins/toolsets/utils.py +88 -0
  112. holmes/utils/config_utils.py +91 -0
  113. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  114. holmes/utils/env.py +7 -0
  115. holmes/utils/global_instructions.py +75 -10
  116. holmes/utils/holmes_status.py +2 -1
  117. holmes/utils/holmes_sync_toolsets.py +0 -2
  118. holmes/utils/krr_utils.py +188 -0
  119. holmes/utils/sentry_helper.py +41 -0
  120. holmes/utils/stream.py +61 -7
  121. holmes/version.py +34 -14
  122. holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
  123. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
  124. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
  125. holmes/core/performance_timing.py +0 -72
  126. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  127. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  128. holmes/plugins/toolsets/newrelic.py +0 -231
  129. holmes/plugins/toolsets/servicenow/install.md +0 -37
  130. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  131. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  132. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  133. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
  134. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
@@ -2,8 +2,13 @@ import concurrent.futures
  import json
  import logging
  import textwrap
- from typing import Dict, List, Optional, Type, Union, Callable
+ from typing import Dict, List, Optional, Type, Union, Callable, Any

+ from holmes.core.models import (
+ ToolApprovalDecision,
+ ToolCallResult,
+ PendingToolApproval,
+ )

  import sentry_sdk
  from openai import BadRequestError
@@ -14,8 +19,8 @@ from pydantic import BaseModel, Field
  from rich.console import Console

  from holmes.common.env_vars import (
+ RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
  TEMPERATURE,
- MAX_OUTPUT_TOKEN_RESERVATION,
  LOG_LLM_USAGE_RESPONSE,
  )

@@ -28,21 +33,37 @@ from holmes.core.investigation_structured_output import (
  )
  from holmes.core.issue import Issue
  from holmes.core.llm import LLM
- from holmes.core.performance_timing import PerformanceTiming
  from holmes.core.resource_instruction import ResourceInstructions
  from holmes.core.runbooks import RunbookManager
  from holmes.core.safeguards import prevent_overly_repeated_tool_call
- from holmes.core.tools import StructuredToolResult, ToolResultStatus
+ from holmes.core.tools import (
+ StructuredToolResult,
+ StructuredToolResultStatus,
+ ToolInvokeContext,
+ )
+ from holmes.core.tools_utils.tool_context_window_limiter import (
+ prevent_overly_big_tool_response,
+ )
+ from holmes.core.truncation.input_context_window_limiter import (
+ limit_input_context_window,
+ )
  from holmes.plugins.prompts import load_and_render_prompt
+ from holmes.plugins.runbooks import RunbookCatalog
+ from holmes.utils import sentry_helper
  from holmes.utils.global_instructions import (
  Instructions,
- add_global_instructions_to_user_prompt,
+ add_runbooks_to_user_prompt,
  )
  from holmes.utils.tags import format_tags_in_string, parse_messages_tags
  from holmes.core.tools_utils.tool_executor import ToolExecutor
  from holmes.core.tracing import DummySpan
  from holmes.utils.colors import AI_COLOR
- from holmes.utils.stream import StreamEvents, StreamMessage
+ from holmes.utils.stream import (
+ StreamEvents,
+ StreamMessage,
+ add_token_count_to_metadata,
+ build_stream_event_token_count,
+ )

  # Create a named logger for cost tracking
  cost_logger = logging.getLogger("holmes.costs")
@@ -119,148 +140,6 @@ def _process_cost_info(
  logging.debug(f"Could not extract cost information: {e}")


- def format_tool_result_data(tool_result: StructuredToolResult) -> str:
- tool_response = tool_result.data
- if isinstance(tool_result.data, str):
- tool_response = tool_result.data
- else:
- try:
- if isinstance(tool_result.data, BaseModel):
- tool_response = tool_result.data.model_dump_json(indent=2)
- else:
- tool_response = json.dumps(tool_result.data, indent=2)
- except Exception:
- tool_response = str(tool_result.data)
- if tool_result.status == ToolResultStatus.ERROR:
- tool_response = f"{tool_result.error or 'Tool execution failed'}:\n\n{tool_result.data or ''}".strip()
- return tool_response
-
-
- # TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
- # However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
- # We should fix this in the future
- # TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
- # token truncation and not character truncation
- def truncate_messages_to_fit_context(
- messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
- ) -> list:
- """
- Helper function to truncate tool messages to fit within context limits.
-
- Args:
- messages: List of message dictionaries with roles and content
- max_context_size: Maximum context window size for the model
- maximum_output_token: Maximum tokens reserved for model output
- count_tokens_fn: Function to count tokens for a list of messages
-
- Returns:
- Modified list of messages with truncated tool responses
-
- Raises:
- Exception: If non-tool messages exceed available context space
- """
- messages_except_tools = [
- message for message in messages if message["role"] != "tool"
- ]
- message_size_without_tools = count_tokens_fn(messages_except_tools)
-
- tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
- reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
- if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
- logging.error(
- f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
- )
- raise Exception(
- f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
- )
-
- if len(tool_call_messages) == 0:
- return messages
-
- available_space = (
- max_context_size - message_size_without_tools - maximum_output_token
- )
- remaining_space = available_space
- tool_call_messages.sort(key=lambda x: len(x["content"]))
-
- # Allocate space starting with small tools and going to larger tools, while maintaining fairness
- # Small tools can often get exactly what they need, while larger tools may need to be truncated
- # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
- for i, msg in enumerate(tool_call_messages):
- remaining_tools = len(tool_call_messages) - i
- max_allocation = remaining_space // remaining_tools
- needed_space = len(msg["content"])
- allocated_space = min(needed_space, max_allocation)
-
- if needed_space > allocated_space:
- truncation_notice = "\n\n[TRUNCATED]"
- # Ensure the indicator fits in the allocated space
- if allocated_space > len(truncation_notice):
- msg["content"] = (
- msg["content"][: allocated_space - len(truncation_notice)]
- + truncation_notice
- )
- logging.info(
- f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space-len(truncation_notice)} tokens"
- )
- else:
- msg["content"] = truncation_notice[:allocated_space]
- logging.info(
- f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space} tokens"
- )
- msg.pop("token_count", None) # Remove token_count if present
-
- remaining_space -= allocated_space
- return messages
-
-
- class ToolCallResult(BaseModel):
- tool_call_id: str
- tool_name: str
- description: str
- result: StructuredToolResult
- size: Optional[int] = None
-
- def as_tool_call_message(self):
- content = format_tool_result_data(self.result)
- if self.result.params:
- content = (
- f"Params used for the tool call: {json.dumps(self.result.params)}. The tool call output follows on the next line.\n"
- + content
- )
- return {
- "tool_call_id": self.tool_call_id,
- "role": "tool",
- "name": self.tool_name,
- "content": content,
- }
-
- def as_tool_result_response(self):
- result_dump = self.result.model_dump()
- result_dump["data"] = self.result.get_stringified_data()
-
- return {
- "tool_call_id": self.tool_call_id,
- "tool_name": self.tool_name,
- "description": self.description,
- "role": "tool",
- "result": result_dump,
- }
-
- def as_streaming_tool_result_response(self):
- result_dump = self.result.model_dump()
- result_dump["data"] = self.result.get_stringified_data()
-
- return {
- "tool_call_id": self.tool_call_id,
- "role": "tool",
- "description": self.description,
- "name": self.tool_name,
- "result": result_dump,
- }
-
-
  class LLMResult(LLMCosts):
  tool_calls: Optional[List[ToolCallResult]] = None
  result: Optional[str] = None
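The truncate_messages_to_fit_context helper removed above is superseded by limit_input_context_window, which later hunks call once per iteration. A minimal sketch of that call shape, assuming only the names visible at its call sites in this diff; the wrapper function itself is illustrative:

from holmes.core.truncation.input_context_window_limiter import (
    limit_input_context_window,
)


def limit_messages_each_iteration(llm, messages, tools, metadata):
    # Ask the limiter to truncate/compact the history so it fits the model's context window
    limit_result = limit_input_context_window(llm=llm, messages=messages, tools=tools)
    messages = limit_result.messages            # possibly truncated or compacted history
    metadata = metadata | limit_result.metadata # token accounting merged into the caller's metadata
    # Callers can react to compaction, e.g. reset repeated-tool-call tracking
    return messages, metadata, limit_result.conversation_history_compacted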
@@ -269,6 +148,7 @@ class LLMResult(LLMCosts):
  # TODO: clean up these two
  prompt: Optional[str] = None
  messages: Optional[List[dict]] = None
+ metadata: Optional[Dict[Any, Any]] = None

  def get_tool_usage_summary(self):
  return "AI used info from issue and " + ",".join(
@@ -276,6 +156,12 @@ class LLMResult(LLMCosts):
  )


+ class ToolCallWithDecision(BaseModel):
+ message_index: int
+ tool_call: ChatCompletionMessageToolCall
+ decision: Optional[ToolApprovalDecision]
+
+
  class ToolCallingLLM:
  llm: LLM

@@ -290,6 +176,95 @@ class ToolCallingLLM:
  Callable[[StructuredToolResult], tuple[bool, Optional[str]]]
  ] = None

+ def process_tool_decisions(
+ self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
+ ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
+ """
+ Process tool approval decisions and execute approved tools.
+
+ Args:
+ messages: Current conversation messages
+ tool_decisions: List of ToolApprovalDecision objects
+
+ Returns:
+ Updated messages list with tool execution results
+ """
+ events: list[StreamMessage] = []
+ if not tool_decisions:
+ return messages, events
+
+ # Create decision lookup
+ decisions_by_tool_call_id = {
+ decision.tool_call_id: decision for decision in tool_decisions
+ }
+
+ pending_tool_calls: list[ToolCallWithDecision] = []
+
+ for i in reversed(range(len(messages))):
+ msg = messages[i]
+ if msg.get("role") == "assistant" and msg.get("tool_calls"):
+ message_tool_calls = msg.get("tool_calls", [])
+ for tool_call in message_tool_calls:
+ decision = decisions_by_tool_call_id.get(tool_call.get("id"), None)
+ if tool_call.get("pending_approval"):
+ del tool_call[
+ "pending_approval"
+ ] # Cleanup so that a pending approval is not tagged on message in a future response
+ pending_tool_calls.append(
+ ToolCallWithDecision(
+ tool_call=ChatCompletionMessageToolCall(**tool_call),
+ decision=decision,
+ message_index=i,
+ )
+ )
+
+ if not pending_tool_calls:
+ error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
+ logging.error(error_message)
+ raise Exception(error_message)
+ for tool_call_with_decision in pending_tool_calls:
+ tool_call_message: dict
+ tool_call = tool_call_with_decision.tool_call
+ decision = tool_call_with_decision.decision
+ tool_result: Optional[ToolCallResult] = None
+ if decision and decision.approved:
+ tool_result = self._invoke_llm_tool_call(
+ tool_to_call=tool_call,
+ previous_tool_calls=[],
+ trace_span=DummySpan(), # TODO: replace with proper span
+ tool_number=None,
+ user_approved=True,
+ )
+ else:
+ # Tool was rejected or no decision found, add rejection message
+ tool_result = ToolCallResult(
+ tool_call_id=tool_call.id,
+ tool_name=tool_call.function.name,
+ description=tool_call.function.name,
+ result=StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Tool execution was denied by the user.",
+ ),
+ )
+
+ events.append(
+ StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_result.as_streaming_tool_result_response(),
+ )
+ )
+
+ tool_call_message = tool_result.as_tool_call_message()
+
+ # It is expected that the tool call result directly follows the tool call request from the LLM
+ # The API call may contain a user ask which is appended to the messages so we can't just append
+ # tool call results; they need to be inserted right after the llm's message requesting tool calls
+ messages.insert(
+ tool_call_with_decision.message_index + 1, tool_call_message
+ )
+
+ return messages, events
+
  def prompt_call(
  self,
  system_prompt: str,
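A small illustration of the ordering invariant process_tool_decisions maintains above: each tool result is inserted directly after the assistant message that requested the call, even when the client appended a later user message. The message contents below are invented, and ToolApprovalDecision is assumed to accept just tool_call_id and approved:

from holmes.core.models import ToolApprovalDecision

messages = [
    {"role": "user", "content": "why is my pod crash looping?"},
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "call_1",
                "type": "function",
                "pending_approval": True,
                "function": {"name": "kubectl_describe", "arguments": "{}"},
            }
        ],
    },
    {"role": "user", "content": "also check the other namespace"},
]
decisions = [ToolApprovalDecision(tool_call_id="call_1", approved=False)]
# After calling process_tool_decisions(messages, decisions) on a ToolCallingLLM
# instance, the denied tool result ("Tool execution was denied by the user.")
# sits at index 2, between the tool-call request and the later user message.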
@@ -334,36 +309,35 @@ class ToolCallingLLM:
  trace_span=DummySpan(),
  tool_number_offset: int = 0,
  ) -> LLMResult:
- perf_timing = PerformanceTiming("tool_calling_llm.call")
- tool_calls = [] # type: ignore
+ tool_calls: list[
+ dict
+ ] = [] # Used for preventing repeated tool calls. potentially reset after compaction
+ all_tool_calls = [] # type: ignore
  costs = LLMCosts()
-
  tools = self.tool_executor.get_all_tools_openai_format(
  target_model=self.llm.model
  )
- perf_timing.measure("get_all_tools_openai_format")
  max_steps = self.max_steps
  i = 0
-
+ metadata: Dict[Any, Any] = {}
  while i < max_steps:
  i += 1
- perf_timing.measure(f"start iteration {i}")
  logging.debug(f"running iteration {i}")
  # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
  tools = None if i == max_steps else tools
  tool_choice = "auto" if tools else None

- total_tokens = self.llm.count_tokens_for_message(messages)
- max_context_size = self.llm.get_context_window_size()
- maximum_output_token = self.llm.get_maximum_output_token()
- perf_timing.measure("count tokens")
+ limit_result = limit_input_context_window(
+ llm=self.llm, messages=messages, tools=tools
+ )
+ messages = limit_result.messages
+ metadata = metadata | limit_result.metadata

- if (total_tokens + maximum_output_token) > max_context_size:
- logging.warning("Token limit exceeded. Truncating tool responses.")
- messages = self.truncate_messages_to_fit_context(
- messages, max_context_size, maximum_output_token
- )
- perf_timing.measure("truncate_messages_to_fit_context")
+ if (
+ limit_result.conversation_history_compacted
+ and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+ ):
+ tool_calls = []

  logging.debug(f"sending messages={messages}\n\ntools={tools}")

@@ -381,7 +355,6 @@ class ToolCallingLLM:
  # Extract and accumulate cost information
  _process_cost_info(full_response, costs, "LLM call")

- perf_timing.measure("llm.completion")
  # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
  except BadRequestError as e:
  if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -405,9 +378,10 @@ class ToolCallingLLM:

  if incorrect_tool_call:
  logging.warning(
- "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
+ "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
  )
  # disable structured output going forward and and retry
+ sentry_helper.capture_structured_output_incorrect_tool_call()
  response_format = None
  max_steps = max_steps + 1
  continue
@@ -424,8 +398,8 @@ class ToolCallingLLM:
  hasattr(response_message, "reasoning_content")
  and response_message.reasoning_content
  ):
- logging.debug(
- f"[bold {AI_COLOR}]AI (reasoning) 🤔:[/bold {AI_COLOR}] {response_message.reasoning_content}\n"
+ logging.info(
+ f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
  )

  if not tools_to_call:
@@ -443,23 +417,33 @@ class ToolCallingLLM:
  )
  costs.total_cost += post_processing_cost

- perf_timing.end(f"- completed in {i} iterations -")
+ tokens = self.llm.count_tokens(messages=messages, tools=tools)
+
+ add_token_count_to_metadata(
+ tokens=tokens,
+ full_llm_response=full_response,
+ max_context_size=limit_result.max_context_size,
+ maximum_output_token=limit_result.maximum_output_token,
+ metadata=metadata,
+ )
+
  return LLMResult(
  result=post_processed_response,
  unprocessed_result=raw_response,
- tool_calls=tool_calls,
+ tool_calls=all_tool_calls,
  prompt=json.dumps(messages, indent=2),
  messages=messages,
  **costs.model_dump(), # Include all cost fields
+ metadata=metadata,
  )

- perf_timing.end(f"- completed in {i} iterations -")
  return LLMResult(
  result=text_response,
- tool_calls=tool_calls,
+ tool_calls=all_tool_calls,
  prompt=json.dumps(messages, indent=2),
  messages=messages,
  **costs.model_dump(), # Include all cost fields
+ metadata=metadata,
  )

  if text_response and text_response.strip():
@@ -467,7 +451,6 @@ class ToolCallingLLM:
  logging.info(
  f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
  )
- perf_timing.measure("pre-tool-calls")
  with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
  futures = []
  futures_tool_numbers: dict[
@@ -477,6 +460,7 @@ class ToolCallingLLM:
  for tool_index, t in enumerate(tools_to_call, 1):
  logging.debug(f"Tool to call: {t}")
  tool_number = tool_number_offset + tool_index
+
  future = executor.submit(
  self._invoke_llm_tool_call,
  tool_to_call=t,
@@ -495,14 +479,27 @@ class ToolCallingLLM:
  if future in futures_tool_numbers
  else None
  )
- tool_call_result = self.handle_tool_call_approval(
- tool_call_result=tool_call_result, tool_number=tool_number
- )

- tool_calls.append(tool_call_result.as_tool_result_response())
+ if (
+ tool_call_result.result.status
+ == StructuredToolResultStatus.APPROVAL_REQUIRED
+ ):
+ with trace_span.start_span(type="tool") as tool_span:
+ tool_call_result = self._handle_tool_call_approval(
+ tool_call_result=tool_call_result,
+ tool_number=tool_number,
+ )
+ ToolCallingLLM._log_tool_call_result(
+ tool_span, tool_call_result
+ )
+
+ tool_result_response_dict = (
+ tool_call_result.as_tool_result_response()
+ )
+ tool_calls.append(tool_result_response_dict)
+ all_tool_calls.append(tool_result_response_dict)
  messages.append(tool_call_result.as_tool_call_message())
-
- perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+ tokens = self.llm.count_tokens(messages=messages, tools=tools)

  # Update the tool number offset for the next iteration
  tool_number_offset += len(tools_to_call)
@@ -513,91 +510,52 @@ class ToolCallingLLM:

  raise Exception(f"Too many LLM calls - exceeded max_steps: {i}/{max_steps}")

- def _directly_invoke_tool(
+ def _directly_invoke_tool_call(
  self,
  tool_name: str,
  tool_params: dict,
  user_approved: bool,
- trace_span=DummySpan(),
  tool_number: Optional[int] = None,
  ) -> StructuredToolResult:
- tool_span = trace_span.start_span(name=tool_name, type="tool")
  tool = self.tool_executor.get_tool_by_name(tool_name)
- tool_response = None
+ if not tool:
+ logging.warning(
+ f"Skipping tool execution for {tool_name}: args: {tool_params}"
+ )
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=f"Failed to find tool {tool_name}",
+ params=tool_params,
+ )
+
  try:
- if (not tool) or (tool_params is None):
- logging.warning(
- f"Skipping tool execution for {tool_name}: args: {tool_params}"
- )
- tool_response = StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error=f"Failed to find tool {tool_name}",
- params=tool_params,
- )
- else:
- tool_response = tool.invoke(
- tool_params, tool_number=tool_number, user_approved=user_approved
- )
+ invoke_context = ToolInvokeContext(
+ tool_number=tool_number,
+ user_approved=user_approved,
+ llm=self.llm,
+ max_token_count=self.llm.get_max_token_count_for_single_tool(),
+ )
+ tool_response = tool.invoke(tool_params, context=invoke_context)
  except Exception as e:
  logging.error(
  f"Tool call to {tool_name} failed with an Exception", exc_info=True
  )
  tool_response = StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Tool call failed: {e}",
  params=tool_params,
  )
-
- # Log error to trace span
- tool_span.log(
- input=tool_params, output=str(e), metadata={"status": "ERROR"}
- )
-
- tool_span.log(
- input=tool_params,
- output=tool_response.data,
- metadata={
- "status": tool_response.status.value,
- "error": tool_response.error,
- "description": tool.get_parameterized_one_liner(tool_params)
- if tool
- else "",
- "structured_tool_result": tool_response,
- },
- )
- tool_span.end()
-
  return tool_response

- def _invoke_llm_tool_call(
+ def _get_tool_call_result(
  self,
- tool_to_call: ChatCompletionMessageToolCall,
+ tool_call_id: str,
+ tool_name: str,
+ tool_arguments: str,
+ user_approved: bool,
  previous_tool_calls: list[dict],
- trace_span=DummySpan(),
- tool_number=None,
+ tool_number: Optional[int] = None,
  ) -> ToolCallResult:
- # Handle the union type - ChatCompletionMessageToolCall can be either
- # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
- # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
- # We use hasattr to check for the 'function' attribute as it's more flexible
- # and doesn't require importing the specific type.
- if hasattr(tool_to_call, "function"):
- tool_name = tool_to_call.function.name
- tool_arguments = tool_to_call.function.arguments
- else:
- # This is a custom tool call - we don't support these currently
- logging.error(f"Unsupported custom tool call: {tool_to_call}")
- return ToolCallResult(
- tool_call_id=tool_to_call.id,
- tool_name="unknown",
- description="NA",
- result=StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error="Custom tool calls are not supported",
- params=None,
- ),
- )
-
  tool_params = {}
  try:
  tool_params = json.loads(tool_arguments)
@@ -606,20 +564,19 @@ class ToolCallingLLM:
  f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
  )

- tool_call_id = tool_to_call.id
-
- tool_response = prevent_overly_repeated_tool_call(
- tool_name=tool_name,
- tool_params=tool_params,
- tool_calls=previous_tool_calls,
- )
+ tool_response = None
+ if not user_approved:
+ tool_response = prevent_overly_repeated_tool_call(
+ tool_name=tool_name,
+ tool_params=tool_params,
+ tool_calls=previous_tool_calls,
+ )

  if not tool_response:
- tool_response = self._directly_invoke_tool(
+ tool_response = self._directly_invoke_tool_call(
  tool_name=tool_name,
  tool_params=tool_params,
- user_approved=False,
- trace_span=trace_span,
+ user_approved=user_approved,
  tool_number=tool_number,
  )

@@ -629,38 +586,103 @@ class ToolCallingLLM:
  f"Tool {tool_name} return type is not StructuredToolResult. Nesting the tool result into StructuredToolResult..."
  )
  tool_response = StructuredToolResult(
- status=ToolResultStatus.SUCCESS,
+ status=StructuredToolResultStatus.SUCCESS,
  data=tool_response,
  params=tool_params,
  )

  tool = self.tool_executor.get_tool_by_name(tool_name)
+
  return ToolCallResult(
  tool_call_id=tool_call_id,
  tool_name=tool_name,
- description=tool.get_parameterized_one_liner(tool_params) if tool else "",
+ description=str(tool.get_parameterized_one_liner(tool_params))
+ if tool
+ else "",
  result=tool_response,
  )

- def handle_tool_call_approval(
- self, tool_call_result: ToolCallResult, tool_number: Optional[int]
+ @staticmethod
+ def _log_tool_call_result(tool_span, tool_call_result: ToolCallResult):
+ tool_span.set_attributes(name=tool_call_result.tool_name)
+ tool_span.log(
+ input=tool_call_result.result.params,
+ output=tool_call_result.result.data,
+ error=tool_call_result.result.error,
+ metadata={
+ "status": tool_call_result.result.status,
+ "description": tool_call_result.description,
+ },
+ )
+
+ def _invoke_llm_tool_call(
+ self,
+ tool_to_call: ChatCompletionMessageToolCall,
+ previous_tool_calls: list[dict],
+ trace_span=None,
+ tool_number=None,
+ user_approved: bool = False,
+ ) -> ToolCallResult:
+ if trace_span is None:
+ trace_span = DummySpan()
+ with trace_span.start_span(type="tool") as tool_span:
+ if not hasattr(tool_to_call, "function"):
+ # Handle the union type - ChatCompletionMessageToolCall can be either
+ # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
+ # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
+ # We use hasattr to check for the 'function' attribute as it's more flexible
+ # and doesn't require importing the specific type.
+ tool_name = "Unknown_Custom_Tool"
+ logging.error(f"Unsupported custom tool call: {tool_to_call}")
+ tool_call_result = ToolCallResult(
+ tool_call_id=tool_to_call.id,
+ tool_name=tool_name,
+ description="NA",
+ result=StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Custom tool calls are not supported",
+ params=None,
+ ),
+ )
+ else:
+ tool_name = tool_to_call.function.name
+ tool_arguments = tool_to_call.function.arguments
+ tool_id = tool_to_call.id
+ tool_call_result = self._get_tool_call_result(
+ tool_id,
+ tool_name,
+ tool_arguments,
+ previous_tool_calls=previous_tool_calls,
+ tool_number=tool_number,
+ user_approved=user_approved,
+ )
+
+ prevent_overly_big_tool_response(
+ tool_call_result=tool_call_result, llm=self.llm
+ )
+
+ ToolCallingLLM._log_tool_call_result(tool_span, tool_call_result)
+ return tool_call_result
+
+ def _handle_tool_call_approval(
+ self,
+ tool_call_result: ToolCallResult,
+ tool_number: Optional[int],
  ) -> ToolCallResult:
  """
  Handle approval for a single tool call if required.

  Args:
  tool_call_result: A single tool call result that may require approval
+ tool_number: The tool call number

  Returns:
  Updated tool call result with approved/denied status
  """

- if tool_call_result.result.status != ToolResultStatus.APPROVAL_REQUIRED:
- return tool_call_result
-
  # If no approval callback, convert to ERROR because it is assumed the client may not be able to handle approvals
  if not self.approval_callback:
- tool_call_result.result.status = ToolResultStatus.ERROR
+ tool_call_result.result.status = StructuredToolResultStatus.ERROR
  return tool_call_result

  # Get approval from user
@@ -670,19 +692,17 @@ class ToolCallingLLM:
  logging.debug(
  f"User approved command: {tool_call_result.result.invocation}"
  )
-
- new_response = self._directly_invoke_tool(
+ new_response = self._directly_invoke_tool_call(
  tool_name=tool_call_result.tool_name,
  tool_params=tool_call_result.result.params or {},
  user_approved=True,
- trace_span=DummySpan(),
  tool_number=tool_number,
  )
  tool_call_result.result = new_response
  else:
  # User denied - update to error
  feedback_text = f" User feedback: {feedback}" if feedback else ""
- tool_call_result.result.status = ToolResultStatus.ERROR
+ tool_call_result.result.status = StructuredToolResultStatus.ERROR
  tool_call_result.result.error = (
  f"User denied command execution.{feedback_text}"
  )
@@ -737,17 +757,6 @@ class ToolCallingLLM:
  logging.exception("Failed to run post processing", exc_info=True)
  return investigation, 0.0

- @sentry_sdk.trace
- def truncate_messages_to_fit_context(
- self, messages: list, max_context_size: int, maximum_output_token: int
- ) -> list:
- return truncate_messages_to_fit_context(
- messages,
- max_context_size,
- maximum_output_token,
- self.llm.count_tokens_for_message,
- )
-
  def call_stream(
  self,
  system_prompt: str = "",
@@ -755,47 +764,55 @@ class ToolCallingLLM:
  response_format: Optional[Union[dict, Type[BaseModel]]] = None,
  sections: Optional[InputSectionsDataType] = None,
  msgs: Optional[list[dict]] = None,
+ enable_tool_approval: bool = False,
+ tool_decisions: List[ToolApprovalDecision] | None = None,
  ):
  """
  This function DOES NOT call llm.completion(stream=true).
  This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
  """
- messages = []
+
+ # Process tool decisions if provided
+ if msgs and tool_decisions:
+ logging.info(f"Processing {len(tool_decisions)} tool decisions")
+ msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+ yield from events
+
+ messages: list[dict] = []
  if system_prompt:
  messages.append({"role": "system", "content": system_prompt})
  if user_prompt:
  messages.append({"role": "user", "content": user_prompt})
  if msgs:
  messages.extend(msgs)
- perf_timing = PerformanceTiming("tool_calling_llm.call")
  tool_calls: list[dict] = []
  tools = self.tool_executor.get_all_tools_openai_format(
  target_model=self.llm.model
  )
- perf_timing.measure("get_all_tools_openai_format")
  max_steps = self.max_steps
+ metadata: Dict[Any, Any] = {}
  i = 0
  tool_number_offset = 0

  while i < max_steps:
  i += 1
- perf_timing.measure(f"start iteration {i}")
  logging.debug(f"running iteration {i}")

  tools = None if i == max_steps else tools
  tool_choice = "auto" if tools else None

- total_tokens = self.llm.count_tokens_for_message(messages) # type: ignore
- max_context_size = self.llm.get_context_window_size()
- maximum_output_token = self.llm.get_maximum_output_token()
- perf_timing.measure("count tokens")
+ limit_result = limit_input_context_window(
+ llm=self.llm, messages=messages, tools=tools
+ )
+ yield from limit_result.events
+ messages = limit_result.messages
+ metadata = metadata | limit_result.metadata

- if (total_tokens + maximum_output_token) > max_context_size:
- logging.warning("Token limit exceeded. Truncating tool responses.")
- messages = self.truncate_messages_to_fit_context(
- messages, max_context_size, maximum_output_token
- )
- perf_timing.measure("truncate_messages_to_fit_context")
+ if (
+ limit_result.conversation_history_compacted
+ and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+ ):
+ tool_calls = []

  logging.debug(f"sending messages={messages}\n\ntools={tools}")
  try:
@@ -812,7 +829,6 @@ class ToolCallingLLM:
  # Log cost information for this iteration (no accumulation in streaming)
  _process_cost_info(full_response, log_prefix="LLM iteration")

- perf_timing.measure("llm.completion")
  # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
  except BadRequestError as e:
  if "Unrecognized request arguments supplied: tool_choice, tools" in str(
834
850
 
835
851
  if incorrect_tool_call:
836
852
  logging.warning(
837
- "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
853
+ "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
838
854
  )
839
855
  # disable structured output going forward and and retry
856
+ sentry_helper.capture_structured_output_incorrect_tool_call()
840
857
  response_format = None
841
858
  max_steps = max_steps + 1
842
859
  continue
@@ -847,11 +864,25 @@ class ToolCallingLLM:
  )
  )

+ tokens = self.llm.count_tokens(messages=messages, tools=tools)
+ add_token_count_to_metadata(
+ tokens=tokens,
+ full_llm_response=full_response,
+ max_context_size=limit_result.max_context_size,
+ maximum_output_token=limit_result.maximum_output_token,
+ metadata=metadata,
+ )
+ yield build_stream_event_token_count(metadata=metadata)
+
  tools_to_call = getattr(response_message, "tool_calls", None)
  if not tools_to_call:
  yield StreamMessage(
  event=StreamEvents.ANSWER_END,
- data={"content": response_message.content, "messages": messages},
+ data={
+ "content": response_message.content,
+ "messages": messages,
+ "metadata": metadata,
+ },
  )
  return

@@ -860,14 +891,22 @@ class ToolCallingLLM:
  if reasoning or message:
  yield StreamMessage(
  event=StreamEvents.AI_MESSAGE,
- data={"content": message, "reasoning": reasoning},
+ data={
+ "content": message,
+ "reasoning": reasoning,
+ "metadata": metadata,
+ },
  )

- perf_timing.measure("pre-tool-calls")
+ # Check if any tools require approval first
+ pending_approvals = []
+ approval_required_tools = []
+
  with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
  futures = []
  for tool_index, t in enumerate(tools_to_call, 1): # type: ignore
  tool_number = tool_number_offset + tool_index
+
  future = executor.submit(
  self._invoke_llm_tool_call,
  tool_to_call=t, # type: ignore
@@ -884,15 +923,72 @@ class ToolCallingLLM:
  for future in concurrent.futures.as_completed(futures):
  tool_call_result: ToolCallResult = future.result()

- tool_calls.append(tool_call_result.as_tool_result_response())
- messages.append(tool_call_result.as_tool_call_message())
+ if (
+ tool_call_result.result.status
+ == StructuredToolResultStatus.APPROVAL_REQUIRED
+ ):
+ if enable_tool_approval:
+ pending_approvals.append(
+ PendingToolApproval(
+ tool_call_id=tool_call_result.tool_call_id,
+ tool_name=tool_call_result.tool_name,
+ description=tool_call_result.description,
+ params=tool_call_result.result.params or {},
+ )
+ )
+ approval_required_tools.append(tool_call_result)
+
+ yield StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_call_result.as_streaming_tool_result_response(),
+ )
+ else:
+ tool_call_result.result.status = (
+ StructuredToolResultStatus.ERROR
+ )
+ tool_call_result.result.error = f"Tool call rejected for security reasons: {tool_call_result.result.error}"
+
+ tool_calls.append(
+ tool_call_result.as_tool_result_response()
+ )
+ messages.append(tool_call_result.as_tool_call_message())
+
+ yield StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_call_result.as_streaming_tool_result_response(),
+ )
+
+ else:
+ tool_calls.append(tool_call_result.as_tool_result_response())
+ messages.append(tool_call_result.as_tool_call_message())
+
+ yield StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_call_result.as_streaming_tool_result_response(),
+ )

- perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+ # If we have approval required tools, end the stream with pending approvals
+ if pending_approvals:
+ # Add assistant message with pending tool calls
+ for result in approval_required_tools:
+ tool_call = self.find_assistant_tool_call_request(
+ tool_call_id=result.tool_call_id, messages=messages
+ )
+ tool_call["pending_approval"] = True

+ # End stream with approvals required
  yield StreamMessage(
- event=StreamEvents.TOOL_RESULT,
- data=tool_call_result.as_streaming_tool_result_response(),
+ event=StreamEvents.APPROVAL_REQUIRED,
+ data={
+ "content": None,
+ "messages": messages,
+ "pending_approvals": [
+ approval.model_dump() for approval in pending_approvals
+ ],
+ "requires_approval": True,
+ },
  )
+ return

  # Update the tool number offset for the next iteration
  tool_number_offset += len(tools_to_call)
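A minimal sketch of how a client might drive the approval round-trip that call_stream now exposes. The event names, the pending_approvals payload, and the ToolApprovalDecision fields come from this diff; the ToolCallingLLM instance (ai), the ask_user callback, and the assumption that ToolApprovalDecision accepts exactly tool_call_id and approved are placeholders:

from holmes.core.models import ToolApprovalDecision
from holmes.utils.stream import StreamEvents


def run_with_approvals(ai, system_prompt, user_prompt, ask_user):
    msgs, decisions = None, None
    while True:
        stream = ai.call_stream(
            system_prompt=system_prompt if msgs is None else "",
            user_prompt=user_prompt if msgs is None else "",
            msgs=msgs,
            enable_tool_approval=True,
            tool_decisions=decisions,
        )
        pending, answer = [], None
        for event in stream:
            if event.event == StreamEvents.APPROVAL_REQUIRED:
                # The stream ends here; keep the returned history for the next round
                pending = event.data["pending_approvals"]
                msgs = event.data["messages"]
            elif event.event == StreamEvents.ANSWER_END:
                answer = event.data["content"]
        if answer is not None or not pending:
            return answer
        # Ask the user about each pending tool call, then loop with the decisions
        decisions = [
            ToolApprovalDecision(tool_call_id=p["tool_call_id"], approved=ask_user(p))
            for p in pending
        ]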
@@ -901,6 +997,21 @@ class ToolCallingLLM:
  f"Too many LLM calls - exceeded max_steps: {i}/{self.max_steps}"
  )

+ def find_assistant_tool_call_request(
+ self, tool_call_id: str, messages: list[dict[str, Any]]
+ ) -> dict[str, Any]:
+ for message in messages:
+ if message.get("role") == "assistant":
+ for tool_call in message.get("tool_calls", []):
+ if tool_call.get("id") == tool_call_id:
+ return tool_call
+
+ # Should not happen unless there is a bug.
+ # If we are here
+ raise Exception(
+ f"Failed to find assistant request for a tool_call in conversation history. tool_call_id={tool_call_id}"
+ )
+

  # TODO: consider getting rid of this entirely and moving templating into the cmds in holmes_cli.py
  class IssueInvestigator(ToolCallingLLM):
@@ -933,8 +1044,9 @@ class IssueInvestigator(ToolCallingLLM):
  post_processing_prompt: Optional[str] = None,
  sections: Optional[InputSectionsDataType] = None,
  trace_span=DummySpan(),
+ runbooks: Optional[RunbookCatalog] = None,
  ) -> LLMResult:
- runbooks = self.runbook_manager.get_instructions_for_issue(issue)
+ issue_runbooks = self.runbook_manager.get_instructions_for_issue(issue)

  request_structured_output_from_llm = True
  response_format = None
@@ -962,12 +1074,9 @@ class IssueInvestigator(ToolCallingLLM):
  else:
  logging.info("Structured output is disabled for this request")

- if instructions is not None and instructions.instructions:
- runbooks.extend(instructions.instructions)
-
  if console and runbooks:
  console.print(
- f"[bold]Analyzing with {len(runbooks)} runbooks: {runbooks}[/bold]"
+ f"[bold]Analyzing with {len(issue_runbooks)} runbooks: {issue_runbooks}[/bold]"
  )
  elif console:
  console.print(
@@ -982,29 +1091,20 @@ class IssueInvestigator(ToolCallingLLM):
  "structured_output": request_structured_output_from_llm,
  "toolsets": self.tool_executor.toolsets,
  "cluster_name": self.cluster_name,
+ "runbooks_enabled": True if runbooks else False,
  },
  )

- if instructions is not None and len(instructions.documents) > 0:
- docPrompts = []
- for document in instructions.documents:
- docPrompts.append(
- f"* fetch information from this URL: {document.url}\n"
- )
- runbooks.extend(docPrompts)
-
  user_prompt = ""
- if runbooks:
- for runbook_str in runbooks:
- user_prompt += f"* {runbook_str}\n"
-
- user_prompt = f'My instructions to check \n"""{user_prompt}"""'

- user_prompt = add_global_instructions_to_user_prompt(
- user_prompt, global_instructions
+ user_prompt = add_runbooks_to_user_prompt(
+ user_prompt,
+ runbook_catalog=runbooks,
+ global_instructions=global_instructions,
+ issue_instructions=issue_runbooks,
+ resource_instructions=instructions,
  )
- user_prompt = f"{user_prompt}\n This is context from the issue {issue.raw}"
-
+ user_prompt = f"{user_prompt}\n #This is context from the issue:\n{issue.raw}"
  logging.debug(
  "Rendered system prompt:\n%s", textwrap.indent(system_prompt, " ")
  )
@@ -1018,5 +1118,5 @@ class IssueInvestigator(ToolCallingLLM):
  sections=sections,
  trace_span=trace_span,
  )
- res.instructions = runbooks
+ res.instructions = issue_runbooks
  return res