holmesgpt 0.14.4a0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +12 -10
  3. holmes/common/env_vars.py +22 -0
  4. holmes/config.py +51 -4
  5. holmes/core/conversations.py +3 -2
  6. holmes/core/llm.py +226 -72
  7. holmes/core/openai_formatting.py +13 -0
  8. holmes/core/supabase_dal.py +33 -42
  9. holmes/core/tool_calling_llm.py +185 -282
  10. holmes/core/tools.py +21 -1
  11. holmes/core/tools_utils/token_counting.py +2 -1
  12. holmes/core/tools_utils/tool_context_window_limiter.py +32 -30
  13. holmes/core/truncation/compaction.py +59 -0
  14. holmes/core/truncation/input_context_window_limiter.py +218 -0
  15. holmes/interactive.py +17 -7
  16. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  17. holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
  18. holmes/plugins/toolsets/__init__.py +4 -0
  19. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +0 -1
  20. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
  21. holmes/plugins/toolsets/grafana/grafana_api.py +1 -1
  22. holmes/plugins/toolsets/investigator/core_investigation.py +34 -24
  23. holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  24. holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
  25. holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
  26. holmes/plugins/toolsets/prometheus/prometheus.py +1 -1
  27. holmes/plugins/toolsets/robusta/robusta.py +35 -8
  28. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +4 -3
  29. holmes/plugins/toolsets/service_discovery.py +1 -1
  30. holmes/plugins/toolsets/servicenow/servicenow.py +0 -1
  31. holmes/utils/stream.py +31 -1
  32. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/METADATA +6 -2
  33. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/RECORD +36 -31
  34. holmes/core/performance_timing.py +0 -72
  35. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/LICENSE.txt +0 -0
  36. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/WHEEL +0 -0
  37. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/entry_points.txt +0 -0
@@ -7,8 +7,6 @@ from typing import Dict, List, Optional, Type, Union, Callable, Any
 from holmes.core.models import (
     ToolApprovalDecision,
     ToolCallResult,
-    TruncationResult,
-    TruncationMetadata,
     PendingToolApproval,
 )
 
@@ -21,8 +19,8 @@ from pydantic import BaseModel, Field
 from rich.console import Console
 
 from holmes.common.env_vars import (
+    RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
     TEMPERATURE,
-    MAX_OUTPUT_TOKEN_RESERVATION,
     LOG_LLM_USAGE_RESPONSE,
 )
 
@@ -34,8 +32,7 @@ from holmes.core.investigation_structured_output import (
     is_response_an_incorrect_tool_call,
 )
 from holmes.core.issue import Issue
-from holmes.core.llm import LLM, get_llm_usage
-from holmes.core.performance_timing import PerformanceTiming
+from holmes.core.llm import LLM
 from holmes.core.resource_instruction import ResourceInstructions
 from holmes.core.runbooks import RunbookManager
 from holmes.core.safeguards import prevent_overly_repeated_tool_call
@@ -45,9 +42,11 @@ from holmes.core.tools import (
     ToolInvokeContext,
 )
 from holmes.core.tools_utils.tool_context_window_limiter import (
-    get_max_token_count_for_single_tool,
     prevent_overly_big_tool_response,
 )
+from holmes.core.truncation.input_context_window_limiter import (
+    limit_input_context_window,
+)
 from holmes.plugins.prompts import load_and_render_prompt
 from holmes.utils import sentry_helper
 from holmes.utils.global_instructions import (
@@ -58,15 +57,17 @@ from holmes.utils.tags import format_tags_in_string, parse_messages_tags
 from holmes.core.tools_utils.tool_executor import ToolExecutor
 from holmes.core.tracing import DummySpan
 from holmes.utils.colors import AI_COLOR
-from holmes.utils.stream import StreamEvents, StreamMessage
+from holmes.utils.stream import (
+    StreamEvents,
+    StreamMessage,
+    add_token_count_to_metadata,
+    build_stream_event_token_count,
+)
 
 # Create a named logger for cost tracking
 cost_logger = logging.getLogger("holmes.costs")
 
 
-TRUNCATION_NOTICE = "\n\n[TRUNCATED]"
-
-
 class LLMCosts(BaseModel):
     """Tracks cost and token usage for LLM calls."""
 
@@ -138,109 +139,6 @@ def _process_cost_info(
         logging.debug(f"Could not extract cost information: {e}")
 
 
-# TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
-# However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
-# We should fix this in the future
-# TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
-# token truncation and not character truncation
-def truncate_messages_to_fit_context(
-    messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
-) -> TruncationResult:
-    """
-    Helper function to truncate tool messages to fit within context limits.
-
-    Args:
-        messages: List of message dictionaries with roles and content
-        max_context_size: Maximum context window size for the model
-        maximum_output_token: Maximum tokens reserved for model output
-        count_tokens_fn: Function to count tokens for a list of messages
-
-    Returns:
-        Modified list of messages with truncated tool responses
-
-    Raises:
-        Exception: If non-tool messages exceed available context space
-    """
-    messages_except_tools = [
-        message for message in messages if message["role"] != "tool"
-    ]
-    message_size_without_tools = count_tokens_fn(messages_except_tools)
-
-    tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
-    reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
-    if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
-        logging.error(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
-        )
-        raise Exception(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
-        )
-
-    if len(tool_call_messages) == 0:
-        return TruncationResult(truncated_messages=messages, truncations=[])
-
-    available_space = (
-        max_context_size - message_size_without_tools - reserved_for_output_tokens
-    )
-    remaining_space = available_space
-    tool_call_messages.sort(
-        key=lambda x: count_tokens_fn([{"role": "tool", "content": x["content"]}])
-    )
-
-    truncations = []
-
-    # Allocate space starting with small tools and going to larger tools, while maintaining fairness
-    # Small tools can often get exactly what they need, while larger tools may need to be truncated
-    # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
-    for i, msg in enumerate(tool_call_messages):
-        remaining_tools = len(tool_call_messages) - i
-        max_allocation = remaining_space // remaining_tools
-        needed_space = count_tokens_fn([{"role": "tool", "content": msg["content"]}])
-        allocated_space = min(needed_space, max_allocation)
-
-        if needed_space > allocated_space:
-            truncation_metadata = _truncate_tool_message(
-                msg, allocated_space, needed_space
-            )
-            truncations.append(truncation_metadata)
-
-        remaining_space -= allocated_space
-    return TruncationResult(truncated_messages=messages, truncations=truncations)
-
-
-def _truncate_tool_message(
-    msg: dict, allocated_space: int, needed_space: int
-) -> TruncationMetadata:
-    msg_content = msg["content"]
-    tool_call_id = msg["tool_call_id"]
-    tool_name = msg["name"]
-
-    # Ensure the indicator fits in the allocated space
-    if allocated_space > len(TRUNCATION_NOTICE):
-        original = msg_content if isinstance(msg_content, str) else str(msg_content)
-        msg["content"] = (
-            original[: allocated_space - len(TRUNCATION_NOTICE)] + TRUNCATION_NOTICE
-        )
-        end_index = allocated_space - len(TRUNCATION_NOTICE)
-    else:
-        msg["content"] = TRUNCATION_NOTICE[:allocated_space]
-        end_index = allocated_space
-
-    msg.pop("token_count", None)  # Remove token_count if present
-    logging.info(
-        f"Truncating tool message '{tool_name}' from {needed_space} to {allocated_space} tokens"
-    )
-    truncation_metadata = TruncationMetadata(
-        tool_call_id=tool_call_id,
-        start_index=0,
-        end_index=end_index,
-        tool_name=tool_name,
-        original_token_count=needed_space,
-    )
-    return truncation_metadata
-
-
 class LLMResult(LLMCosts):
     tool_calls: Optional[List[ToolCallResult]] = None
     result: Optional[str] = None
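
Note on the removed helper above: its allocation loop hands out the remaining token budget smallest-message-first, capping each tool response at an even share of whatever is still unallocated. A minimal, self-contained sketch of that arithmetic, using made-up token counts rather than the package's tokenizer:

    # Toy restatement of the fair-allocation loop from the removed
    # truncate_messages_to_fit_context(); the numbers are illustrative only.
    def allocate(needed: list[int], budget: int) -> list[int]:
        allocations = []
        remaining = budget
        for i, need in enumerate(sorted(needed)):
            share = remaining // (len(needed) - i)  # even share of what is left
            allocated = min(need, share)  # small responses get all they need
            allocations.append(allocated)
            remaining -= allocated
        return allocations

    # Three tool responses needing 100, 500 and 2000 tokens with 900 tokens free:
    # the smallest fits entirely, the two larger ones split the leftover evenly.
    print(allocate([100, 500, 2000], 900))  # [100, 400, 400]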
@@ -257,6 +155,12 @@ class LLMResult(LLMCosts):
     )
 
 
+class ToolCallWithDecision(BaseModel):
+    message_index: int
+    tool_call: ChatCompletionMessageToolCall
+    decision: Optional[ToolApprovalDecision]
+
+
 class ToolCallingLLM:
     llm: LLM
 
@@ -273,7 +177,7 @@ class ToolCallingLLM:
 
     def process_tool_decisions(
         self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
-    ) -> List[Dict[str, Any]]:
+    ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
         """
         Process tool approval decisions and execute approved tools.
 
@@ -284,85 +188,81 @@
         Returns:
             Updated messages list with tool execution results
         """
-        # Import here to avoid circular imports
-
-        # Find the last message with pending approvals
-        pending_message_idx = None
-        pending_tool_calls = None
-
-        for i in reversed(range(len(messages))):
-            msg = messages[i]
-            if msg.get("role") == "assistant" and msg.get("pending_approval"):
-                pending_message_idx = i
-                pending_tool_calls = msg.get("tool_calls", [])
-                break
-
-        if pending_message_idx is None or not pending_tool_calls:
-            # No pending approvals found
-            if tool_decisions:
-                logging.warning(
-                    f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
-                )
-            return messages
+        events: list[StreamMessage] = []
+        if not tool_decisions:
+            return messages, events
 
         # Create decision lookup
-        decisions_by_id = {
+        decisions_by_tool_call_id = {
             decision.tool_call_id: decision for decision in tool_decisions
         }
 
-        # Validate that all decisions have corresponding pending tool calls
-        pending_tool_ids = {tool_call["id"] for tool_call in pending_tool_calls}
-        invalid_decisions = [
-            decision.tool_call_id
-            for decision in tool_decisions
-            if decision.tool_call_id not in pending_tool_ids
-        ]
-
-        if invalid_decisions:
-            logging.warning(
-                f"Received decisions for non-pending tool calls: {invalid_decisions}"
-            )
+        pending_tool_calls: list[ToolCallWithDecision] = []
 
-        # Process each tool call
-        for tool_call in pending_tool_calls:
-            tool_call_id = tool_call["id"]
-            decision = decisions_by_id.get(tool_call_id)
+        for i in reversed(range(len(messages))):
+            msg = messages[i]
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                message_tool_calls = msg.get("tool_calls", [])
+                for tool_call in message_tool_calls:
+                    decision = decisions_by_tool_call_id.get(tool_call.get("id"), None)
+                    if tool_call.get("pending_approval"):
+                        del tool_call[
+                            "pending_approval"
+                        ]  # Cleanup so that a pending approval is not tagged on message in a future response
+                    pending_tool_calls.append(
+                        ToolCallWithDecision(
+                            tool_call=ChatCompletionMessageToolCall(**tool_call),
+                            decision=decision,
+                            message_index=i,
+                        )
+                    )
 
+        if not pending_tool_calls:
+            error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
+            logging.error(error_message)
+            raise Exception(error_message)
+        for tool_call_with_decision in pending_tool_calls:
+            tool_call_message: dict
+            tool_call = tool_call_with_decision.tool_call
+            decision = tool_call_with_decision.decision
+            tool_result: Optional[ToolCallResult] = None
             if decision and decision.approved:
-                try:
-                    tool_call_obj = ChatCompletionMessageToolCall(**tool_call)
-                    llm_tool_result = self._invoke_llm_tool_call(
-                        tool_to_call=tool_call_obj,
-                        previous_tool_calls=[],
-                        trace_span=DummySpan(),
-                        tool_number=None,
-                    )
-                    messages.append(llm_tool_result.as_tool_call_message())
-
-                except Exception as e:
-                    logging.error(
-                        f"Failed to execute approved tool {tool_call_id}: {e}"
-                    )
-                    messages.append(
-                        {
-                            "tool_call_id": tool_call_id,
-                            "role": "tool",
-                            "name": tool_call["function"]["name"],
-                            "content": f"Tool execution failed: {str(e)}",
-                        }
-                    )
+                tool_result = self._invoke_llm_tool_call(
+                    tool_to_call=tool_call,
+                    previous_tool_calls=[],
+                    trace_span=DummySpan(),  # TODO: replace with proper span
+                    tool_number=None,
+                    user_approved=True,
+                )
             else:
                 # Tool was rejected or no decision found, add rejection message
-                messages.append(
-                    {
-                        "tool_call_id": tool_call_id,
-                        "role": "tool",
-                        "name": tool_call["function"]["name"],
-                        "content": "Tool execution was denied by the user.",
-                    }
+                tool_result = ToolCallResult(
+                    tool_call_id=tool_call.id,
+                    tool_name=tool_call.function.name,
+                    description=tool_call.function.name,
+                    result=StructuredToolResult(
+                        status=StructuredToolResultStatus.ERROR,
+                        error="Tool execution was denied by the user.",
+                    ),
                 )
 
-        return messages
+            events.append(
+                StreamMessage(
+                    event=StreamEvents.TOOL_RESULT,
+                    data=tool_result.as_streaming_tool_result_response(),
+                )
+            )
+
+            tool_call_message = tool_result.as_tool_call_message()
+
+            # It is expected that the tool call result directly follows the tool call request from the LLM
+            # The API call may contain a user ask which is appended to the messages so we can't just append
+            # tool call results; they need to be inserted right after the llm's message requesting tool calls
+            messages.insert(
                tool_call_with_decision.message_index + 1, tool_call_message
+            )
+
+        return messages, events
 
     def prompt_call(
         self,
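
The insertion step in the rewritten process_tool_decisions exists because the OpenAI-style chat format expects each tool result to directly follow the assistant message that requested it, and by the time a decision arrives the client may already have appended a newer user message. A minimal sketch of the ordering being preserved (tool name and contents are placeholders, not taken from this package):

    # An approval-pending tool call at index 0, with a user follow-up already appended.
    messages = [
        {
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": "call_1",
                    "type": "function",
                    "function": {"name": "example_tool", "arguments": "{}"},
                }
            ],
        },
        {"role": "user", "content": "any update?"},
    ]

    tool_result = {
        "role": "tool",
        "tool_call_id": "call_1",
        "name": "example_tool",
        "content": "tool output",
    }

    # Appending would place the result after the user message, which is invalid;
    # it must be inserted directly after the assistant message that requested it.
    messages.insert(0 + 1, tool_result)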
@@ -408,40 +308,35 @@
         trace_span=DummySpan(),
         tool_number_offset: int = 0,
     ) -> LLMResult:
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
-        tool_calls = []  # type: ignore
+        tool_calls: list[
+            dict
+        ] = []  # Used for preventing repeated tool calls. potentially reset after compaction
+        all_tool_calls = []  # type: ignore
         costs = LLMCosts()
-
         tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         i = 0
         metadata: Dict[Any, Any] = {}
         while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")
             # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None
 
-            total_tokens = self.llm.count_tokens_for_message(messages)
-            max_context_size = self.llm.get_context_window_size()
-            maximum_output_token = self.llm.get_maximum_output_token()
-            perf_timing.measure("count tokens")
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
 
-            if (total_tokens + maximum_output_token) > max_context_size:
-                logging.warning("Token limit exceeded. Truncating tool responses.")
-                truncated_res = self.truncate_messages_to_fit_context(
-                    messages, max_context_size, maximum_output_token
-                )
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
             logging.debug(f"sending messages={messages}\n\ntools={tools}")
 
@@ -459,7 +354,6 @@
                 # Extract and accumulate cost information
                 _process_cost_info(full_response, costs, "LLM call")
 
-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -483,7 +377,7 @@
 
             if incorrect_tool_call:
                 logging.warning(
-                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
+                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                 )
                 # disable structured output going forward and and retry
                 sentry_helper.capture_structured_output_incorrect_tool_call()
@@ -503,8 +397,8 @@
                 hasattr(response_message, "reasoning_content")
                 and response_message.reasoning_content
             ):
-                logging.debug(
-                    f"[bold {AI_COLOR}]AI (reasoning) 🤔:[/bold {AI_COLOR}] {response_message.reasoning_content}\n"
+                logging.info(
+                    f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
                 )
 
             if not tools_to_call:
@@ -522,25 +416,29 @@
                     )
                     costs.total_cost += post_processing_cost
 
-                    self.llm.count_tokens_for_message(messages)
-                    perf_timing.end(f"- completed in {i} iterations -")
-                    metadata["usage"] = get_llm_usage(full_response)
-                    metadata["max_tokens"] = max_context_size
-                    metadata["max_output_tokens"] = maximum_output_token
+                    tokens = self.llm.count_tokens(messages=messages, tools=tools)
+
+                    add_token_count_to_metadata(
+                        tokens=tokens,
+                        full_llm_response=full_response,
+                        max_context_size=limit_result.max_context_size,
+                        maximum_output_token=limit_result.maximum_output_token,
+                        metadata=metadata,
+                    )
+
                     return LLMResult(
                         result=post_processed_response,
                         unprocessed_result=raw_response,
-                        tool_calls=tool_calls,
+                        tool_calls=all_tool_calls,
                         prompt=json.dumps(messages, indent=2),
                        messages=messages,
                         **costs.model_dump(),  # Include all cost fields
                         metadata=metadata,
                     )
 
-                perf_timing.end(f"- completed in {i} iterations -")
                 return LLMResult(
                     result=text_response,
-                    tool_calls=tool_calls,
+                    tool_calls=all_tool_calls,
                     prompt=json.dumps(messages, indent=2),
                     messages=messages,
                     **costs.model_dump(),  # Include all cost fields
@@ -552,7 +450,6 @@
             logging.info(
                 f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
            )
-            perf_timing.measure("pre-tool-calls")
             with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
                 futures = []
                 futures_tool_numbers: dict[
@@ -562,6 +459,7 @@
                 for tool_index, t in enumerate(tools_to_call, 1):
                     logging.debug(f"Tool to call: {t}")
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,
@@ -594,10 +492,13 @@
                         tool_span, tool_call_result
                     )
 
-                    tool_calls.append(tool_call_result.as_tool_result_response())
+                    tool_result_response_dict = (
+                        tool_call_result.as_tool_result_response()
+                    )
+                    tool_calls.append(tool_result_response_dict)
+                    all_tool_calls.append(tool_result_response_dict)
                     messages.append(tool_call_result.as_tool_call_message())
-
-                    perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+                    tokens = self.llm.count_tokens(messages=messages, tools=tools)
 
             # Update the tool number offset for the next iteration
             tool_number_offset += len(tools_to_call)
@@ -631,7 +532,7 @@
                 tool_number=tool_number,
                 user_approved=user_approved,
                 llm=self.llm,
-                max_token_count=get_max_token_count_for_single_tool(self.llm),
+                max_token_count=self.llm.get_max_token_count_for_single_tool(),
             )
             tool_response = tool.invoke(tool_params, context=invoke_context)
         except Exception as e:
@@ -650,6 +551,7 @@
         tool_call_id: str,
         tool_name: str,
         tool_arguments: str,
+        user_approved: bool,
         previous_tool_calls: list[dict],
         tool_number: Optional[int] = None,
     ) -> ToolCallResult:
@@ -661,17 +563,19 @@
                 f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
            )
 
-        tool_response = prevent_overly_repeated_tool_call(
-            tool_name=tool_name,
-            tool_params=tool_params,
-            tool_calls=previous_tool_calls,
-        )
+        tool_response = None
+        if not user_approved:
+            tool_response = prevent_overly_repeated_tool_call(
+                tool_name=tool_name,
+                tool_params=tool_params,
+                tool_calls=previous_tool_calls,
+            )
 
         if not tool_response:
             tool_response = self._directly_invoke_tool_call(
                 tool_name=tool_name,
                 tool_params=tool_params,
-                user_approved=False,
+                user_approved=user_approved,
                 tool_number=tool_number,
             )
 
@@ -716,6 +620,7 @@
         previous_tool_calls: list[dict],
         trace_span=None,
         tool_number=None,
+        user_approved: bool = False,
     ) -> ToolCallResult:
         if trace_span is None:
             trace_span = DummySpan()
@@ -748,6 +653,7 @@
             tool_arguments,
             previous_tool_calls=previous_tool_calls,
             tool_number=tool_number,
+            user_approved=user_approved,
         )
 
         prevent_overly_big_tool_response(
@@ -850,20 +756,6 @@
             logging.exception("Failed to run post processing", exc_info=True)
             return investigation, 0.0
 
-    @sentry_sdk.trace
-    def truncate_messages_to_fit_context(
-        self, messages: list, max_context_size: int, maximum_output_token: int
-    ) -> TruncationResult:
-        truncated_res = truncate_messages_to_fit_context(
-            messages,
-            max_context_size,
-            maximum_output_token,
-            self.llm.count_tokens_for_message,
-        )
-        if truncated_res.truncations:
-            sentry_helper.capture_tool_truncations(truncated_res.truncations)
-        return truncated_res
-
     def call_stream(
         self,
         system_prompt: str = "",
@@ -872,11 +764,19 @@
         sections: Optional[InputSectionsDataType] = None,
         msgs: Optional[list[dict]] = None,
         enable_tool_approval: bool = False,
+        tool_decisions: List[ToolApprovalDecision] | None = None,
     ):
         """
        This function DOES NOT call llm.completion(stream=true).
        This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
        """
+
+        # Process tool decisions if provided
+        if msgs and tool_decisions:
+            logging.info(f"Processing {len(tool_decisions)} tool decisions")
+            msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+            yield from events
+
         messages: list[dict] = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
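
As the docstring above notes, call_stream does not stream tokens from the model; it is a generator that yields one event per step, and when tool_decisions are supplied their results are yielded before the iteration loop begins. A runnable toy showing that generator shape (stand-in event dicts, not the real StreamMessage objects):

    # Toy generator mirroring the "yield decision results first, then iterate" shape.
    def stream(decision_events):
        yield from decision_events  # e.g. results of tools the user just approved
        for step in range(2):  # stand-in for the LLM iteration loop
            yield {"event": "ai_message", "data": f"iteration {step}"}

    for event in stream([{"event": "tool_result", "data": "approved tool output"}]):
        print(event)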
@@ -884,12 +784,10 @@
         messages.append({"role": "user", "content": user_prompt})
         if msgs:
             messages.extend(msgs)
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
         tool_calls: list[dict] = []
         tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         metadata: Dict[Any, Any] = {}
         i = 0
@@ -897,29 +795,23 @@
 
         while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")
 
             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None
 
-            total_tokens = self.llm.count_tokens_for_message(messages)  # type: ignore
-            max_context_size = self.llm.get_context_window_size()
-            maximum_output_token = self.llm.get_maximum_output_token()
-            perf_timing.measure("count tokens")
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            yield from limit_result.events
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
 
-            if (total_tokens + maximum_output_token) > max_context_size:
-                logging.warning("Token limit exceeded. Truncating tool responses.")
-                truncated_res = self.truncate_messages_to_fit_context(
-                    messages, max_context_size, maximum_output_token
-                )
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
-            else:
-                metadata["truncations"] = []
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
             logging.debug(f"sending messages={messages}\n\ntools={tools}")
             try:
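
Both loops now delegate context management to limit_input_context_window and only react to its result: the possibly rewritten messages, merged metadata, and a compaction flag that resets the repeated-tool-call check. A self-contained toy mirroring that call-site contract; it is not the implementation in holmes/core/truncation/input_context_window_limiter.py:

    from dataclasses import dataclass, field

    # Toy stand-in for the limiter's result, showing only the fields used above.
    @dataclass
    class LimitResult:
        messages: list
        metadata: dict = field(default_factory=dict)
        conversation_history_compacted: bool = False

    def limit_toy(messages, max_messages=4):
        if len(messages) <= max_messages:
            return LimitResult(messages=messages)
        compacted = [messages[0], {"role": "user", "content": "[compacted history]"}] + messages[-2:]
        return LimitResult(
            messages=compacted,
            metadata={"compacted_from": len(messages)},
            conversation_history_compacted=True,
        )

    msgs = [{"role": "system", "content": "sys"}] + [
        {"role": "user", "content": str(i)} for i in range(6)
    ]
    result = limit_toy(msgs)
    if result.conversation_history_compacted:
        tool_calls = []  # mirrors the RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION branch
    print(len(result.messages), result.metadata)  # 4 {'compacted_from': 7}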
@@ -936,7 +828,6 @@
                 # Log cost information for this iteration (no accumulation in streaming)
                 _process_cost_info(full_response, log_prefix="LLM iteration")
 
-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -958,7 +849,7 @@
 
             if incorrect_tool_call:
                 logging.warning(
-                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
+                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                 )
                 # disable structured output going forward and and retry
                 sentry_helper.capture_structured_output_incorrect_tool_call()
@@ -972,12 +863,18 @@
                 )
             )
 
+            tokens = self.llm.count_tokens(messages=messages, tools=tools)
+            add_token_count_to_metadata(
+                tokens=tokens,
+                full_llm_response=full_response,
+                max_context_size=limit_result.max_context_size,
+                maximum_output_token=limit_result.maximum_output_token,
+                metadata=metadata,
+            )
+            yield build_stream_event_token_count(metadata=metadata)
+
             tools_to_call = getattr(response_message, "tool_calls", None)
             if not tools_to_call:
-                self.llm.count_tokens_for_message(messages)
-                metadata["usage"] = get_llm_usage(full_response)
-                metadata["max_tokens"] = max_context_size
-                metadata["max_output_tokens"] = maximum_output_token
                 yield StreamMessage(
                     event=StreamEvents.ANSWER_END,
                     data={
@@ -993,11 +890,13 @@
             if reasoning or message:
                 yield StreamMessage(
                     event=StreamEvents.AI_MESSAGE,
-                    data={"content": message, "reasoning": reasoning},
+                    data={
+                        "content": message,
+                        "reasoning": reasoning,
+                        "metadata": metadata,
+                    },
                 )
 
-            perf_timing.measure("pre-tool-calls")
-
             # Check if any tools require approval first
             pending_approvals = []
             approval_required_tools = []
@@ -1006,6 +905,7 @@
                 futures = []
                 for tool_index, t in enumerate(tools_to_call, 1):  # type: ignore
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,  # type: ignore
@@ -1069,23 +969,11 @@
             # If we have approval required tools, end the stream with pending approvals
             if pending_approvals:
                 # Add assistant message with pending tool calls
-                assistant_msg = {
-                    "role": "assistant",
-                    "content": response_message.content,
-                    "tool_calls": [
-                        {
-                            "id": result.tool_call_id,
-                            "type": "function",
-                            "function": {
-                                "name": result.tool_name,
-                                "arguments": json.dumps(result.result.params or {}),
-                            },
-                        }
-                        for result in approval_required_tools
-                    ],
-                    "pending_approval": True,
-                }
-                messages.append(assistant_msg)
+                for result in approval_required_tools:
+                    tool_call = self.find_assistant_tool_call_request(
+                        tool_call_id=result.tool_call_id, messages=messages
+                    )
+                    tool_call["pending_approval"] = True
 
                 # End stream with approvals required
                 yield StreamMessage(
@@ -1108,6 +996,21 @@
                 f"Too many LLM calls - exceeded max_steps: {i}/{self.max_steps}"
             )
 
+    def find_assistant_tool_call_request(
+        self, tool_call_id: str, messages: list[dict[str, Any]]
+    ) -> dict[str, Any]:
+        for message in messages:
+            if message.get("role") == "assistant":
+                for tool_call in message.get("tool_calls", []):
+                    if tool_call.get("id") == tool_call_id:
+                        return tool_call
+
+        # Should not happen unless there is a bug.
+        # If we are here
+        raise Exception(
+            f"Failed to find assistant request for a tool_call in conversation history. tool_call_id={tool_call_id}"
+        )
+
 
 # TODO: consider getting rid of this entirely and moving templating into the cmds in holmes_cli.py
 class IssueInvestigator(ToolCallingLLM):
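
Taken together, the approval changes move the pending state onto the tool call itself: when a tool needs approval, the stream ends after find_assistant_tool_call_request marks the matching tool_call with pending_approval, and the next request resolves it by tool_call_id through process_tool_decisions. A sketch of the data shape implied by these hunks (field names from the diff, values are placeholders):

    # Assistant message stored in the conversation history after the stream ended.
    stored_history = [
        {
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": "call_42",
                    "type": "function",
                    "function": {"name": "example_tool", "arguments": "{}"},
                    "pending_approval": True,  # set while awaiting the user's decision
                }
            ],
        }
    ]

    # The follow-up request carries one decision per pending tool_call_id
    # (real decisions are ToolApprovalDecision objects, shown here as a plain dict).
    tool_decisions = [{"tool_call_id": "call_42", "approved": False}]

    # process_tool_decisions matches by id, drops the flag, and inserts either the
    # tool's output or a "denied by the user" error right after the assistant message.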