holmesgpt 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt might be problematic.

@@ -7,8 +7,6 @@ from typing import Dict, List, Optional, Type, Union, Callable, Any
 from holmes.core.models import (
     ToolApprovalDecision,
     ToolCallResult,
-    TruncationResult,
-    TruncationMetadata,
     PendingToolApproval,
 )
 
@@ -21,8 +19,8 @@ from pydantic import BaseModel, Field
 from rich.console import Console
 
 from holmes.common.env_vars import (
+    RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
     TEMPERATURE,
-    MAX_OUTPUT_TOKEN_RESERVATION,
     LOG_LLM_USAGE_RESPONSE,
 )
 
@@ -35,7 +33,6 @@ from holmes.core.investigation_structured_output import (
 )
 from holmes.core.issue import Issue
 from holmes.core.llm import LLM
-from holmes.core.performance_timing import PerformanceTiming
 from holmes.core.resource_instruction import ResourceInstructions
 from holmes.core.runbooks import RunbookManager
 from holmes.core.safeguards import prevent_overly_repeated_tool_call
@@ -45,9 +42,11 @@ from holmes.core.tools import (
     ToolInvokeContext,
 )
 from holmes.core.tools_utils.tool_context_window_limiter import (
-    get_max_token_count_for_single_tool,
     prevent_overly_big_tool_response,
 )
+from holmes.core.truncation.input_context_window_limiter import (
+    limit_input_context_window,
+)
 from holmes.plugins.prompts import load_and_render_prompt
 from holmes.utils import sentry_helper
 from holmes.utils.global_instructions import (
@@ -69,9 +68,6 @@ from holmes.utils.stream import (
 cost_logger = logging.getLogger("holmes.costs")
 
 
-TRUNCATION_NOTICE = "\n\n[TRUNCATED]"
-
-
 class LLMCosts(BaseModel):
     """Tracks cost and token usage for LLM calls."""
 
@@ -143,114 +139,6 @@ def _process_cost_info(
         logging.debug(f"Could not extract cost information: {e}")
 
 
-# TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
-# However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
-# We should fix this in the future
-# TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
-# token truncation and not character truncation
-def truncate_messages_to_fit_context(
-    messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
-) -> TruncationResult:
-    """
-    Helper function to truncate tool messages to fit within context limits.
-
-    Args:
-        messages: List of message dictionaries with roles and content
-        max_context_size: Maximum context window size for the model
-        maximum_output_token: Maximum tokens reserved for model output
-        count_tokens_fn: Function to count tokens for a list of messages
-
-    Returns:
-        Modified list of messages with truncated tool responses
-
-    Raises:
-        Exception: If non-tool messages exceed available context space
-    """
-    messages_except_tools = [
-        message for message in messages if message["role"] != "tool"
-    ]
-    tokens = count_tokens_fn(messages_except_tools)
-    message_size_without_tools = tokens.total_tokens
-
-    tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
-    reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
-    if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
-        logging.error(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
-        )
-        raise Exception(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
-        )
-
-    if len(tool_call_messages) == 0:
-        return TruncationResult(truncated_messages=messages, truncations=[])
-
-    available_space = (
-        max_context_size - message_size_without_tools - reserved_for_output_tokens
-    )
-    remaining_space = available_space
-    tool_call_messages.sort(
-        key=lambda x: count_tokens_fn(
-            [{"role": "tool", "content": x["content"]}]
-        ).total_tokens
-    )
-
-    truncations = []
-
-    # Allocate space starting with small tools and going to larger tools, while maintaining fairness
-    # Small tools can often get exactly what they need, while larger tools may need to be truncated
-    # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
-    for i, msg in enumerate(tool_call_messages):
-        remaining_tools = len(tool_call_messages) - i
-        max_allocation = remaining_space // remaining_tools
-        needed_space = count_tokens_fn(
-            [{"role": "tool", "content": msg["content"]}]
-        ).total_tokens
-        allocated_space = min(needed_space, max_allocation)
-
-        if needed_space > allocated_space:
-            truncation_metadata = _truncate_tool_message(
-                msg, allocated_space, needed_space
-            )
-            truncations.append(truncation_metadata)
-
-        remaining_space -= allocated_space
-    return TruncationResult(truncated_messages=messages, truncations=truncations)
-
-
-def _truncate_tool_message(
-    msg: dict, allocated_space: int, needed_space: int
-) -> TruncationMetadata:
-    msg_content = msg["content"]
-    tool_call_id = msg["tool_call_id"]
-    tool_name = msg["name"]
-
-    # Ensure the indicator fits in the allocated space
-    if allocated_space > len(TRUNCATION_NOTICE):
-        original = msg_content if isinstance(msg_content, str) else str(msg_content)
-        msg["content"] = (
-            original[: allocated_space - len(TRUNCATION_NOTICE)] + TRUNCATION_NOTICE
-        )
-        end_index = allocated_space - len(TRUNCATION_NOTICE)
-    else:
-        msg["content"] = TRUNCATION_NOTICE[:allocated_space]
-        end_index = allocated_space
-
-    msg.pop("token_count", None)  # Remove token_count if present
-    logging.info(
-        f"Truncating tool message '{tool_name}' from {needed_space} to {allocated_space} tokens"
-    )
-    truncation_metadata = TruncationMetadata(
-        tool_call_id=tool_call_id,
-        start_index=0,
-        end_index=end_index,
-        tool_name=tool_name,
-        original_token_count=needed_space,
-    )
-    return truncation_metadata
-
-
 class LLMResult(LLMCosts):
     tool_calls: Optional[List[ToolCallResult]] = None
     result: Optional[str] = None
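The removed allocator sorted tool messages from smallest to largest and capped each one at an even share of whatever space was still unallocated, so small responses were kept whole and only the largest ones were cut. A rough standalone illustration of that scheme (plain Python with made-up token counts, not code from the package):

# Illustration only: three tool responses needing 100, 500 and 2000 tokens,
# with 1200 tokens of context left for all tool output combined.
needs = [100, 500, 2000]  # already sorted smallest to largest
remaining = 1200
for i, needed in enumerate(needs):
    cap = remaining // (len(needs) - i)  # even share of what is still free
    allocated = min(needed, cap)
    print(f"tool {i}: needs {needed}, allocated {allocated}")
    remaining -= allocated
# tool 0: needs 100, allocated 100
# tool 1: needs 500, allocated 500
# tool 2: needs 2000, allocated 600   (only this one is truncated)

Headroom left over by small tools rolls forward to the larger ones, which is the fairness-plus-utilization property the removed comments describe. In 0.16.0 this whole path is replaced by the limit_input_context_window helper below.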
@@ -289,7 +177,7 @@ class ToolCallingLLM:
 
     def process_tool_decisions(
         self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
-    ) -> List[Dict[str, Any]]:
+    ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
         """
         Process tool approval decisions and execute approved tools.
 
@@ -300,8 +188,9 @@ class ToolCallingLLM:
         Returns:
             Updated messages list with tool execution results
         """
+        events: list[StreamMessage] = []
         if not tool_decisions:
-            return messages
+            return messages, events
 
         # Create decision lookup
         decisions_by_tool_call_id = {
@@ -332,40 +221,39 @@ class ToolCallingLLM:
            error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
            logging.error(error_message)
            raise Exception(error_message)
-
        for tool_call_with_decision in pending_tool_calls:
            tool_call_message: dict
            tool_call = tool_call_with_decision.tool_call
            decision = tool_call_with_decision.decision
+            tool_result: Optional[ToolCallResult] = None
            if decision and decision.approved:
-                try:
-                    llm_tool_result = self._invoke_llm_tool_call(
-                        tool_to_call=tool_call,
-                        previous_tool_calls=[],
-                        trace_span=DummySpan(),  # TODO: replace with proper span
-                        tool_number=None,
-                        user_approved=True,
-                    )
-                    tool_call_message = llm_tool_result.as_tool_call_message()
-
-                except Exception as e:
-                    logging.error(
-                        f"Failed to execute approved tool {tool_call.id}: {e}"
-                    )
-                    tool_call_message = {
-                        "tool_call_id": tool_call.id,
-                        "role": "tool",
-                        "name": tool_call.function.name,
-                        "content": f"Tool execution failed: {str(e)}",
-                    }
+                tool_result = self._invoke_llm_tool_call(
+                    tool_to_call=tool_call,
+                    previous_tool_calls=[],
+                    trace_span=DummySpan(),  # TODO: replace with proper span
+                    tool_number=None,
+                    user_approved=True,
+                )
            else:
                # Tool was rejected or no decision found, add rejection message
-                tool_call_message = {
-                    "tool_call_id": tool_call.id,
-                    "role": "tool",
-                    "name": tool_call.function.name,
-                    "content": "Tool execution was denied by the user.",
-                }
+                tool_result = ToolCallResult(
+                    tool_call_id=tool_call.id,
+                    tool_name=tool_call.function.name,
+                    description=tool_call.function.name,
+                    result=StructuredToolResult(
+                        status=StructuredToolResultStatus.ERROR,
+                        error="Tool execution was denied by the user.",
+                    ),
+                )
+
+            events.append(
+                StreamMessage(
+                    event=StreamEvents.TOOL_RESULT,
+                    data=tool_result.as_streaming_tool_result_response(),
+                )
+            )
+
+            tool_call_message = tool_result.as_tool_call_message()
 
            # It is expected that the tool call result directly follows the tool call request from the LLM
            # The API call may contain a user ask which is appended to the messages so we can't just append
@@ -374,7 +262,7 @@ class ToolCallingLLM:
                tool_call_with_decision.message_index + 1, tool_call_message
            )
 
-        return messages
+        return messages, events
 
    def prompt_call(
        self,
@@ -420,40 +308,35 @@ class ToolCallingLLM:
        trace_span=DummySpan(),
        tool_number_offset: int = 0,
    ) -> LLMResult:
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
-        tool_calls = []  # type: ignore
+        tool_calls: list[
+            dict
+        ] = []  # Used for preventing repeated tool calls. potentially reset after compaction
+        all_tool_calls = []  # type: ignore
        costs = LLMCosts()
-
        tools = self.tool_executor.get_all_tools_openai_format(
            target_model=self.llm.model
        )
-        perf_timing.measure("get_all_tools_openai_format")
        max_steps = self.max_steps
        i = 0
        metadata: Dict[Any, Any] = {}
        while i < max_steps:
            i += 1
-            perf_timing.measure(f"start iteration {i}")
            logging.debug(f"running iteration {i}")
            # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
            tools = None if i == max_steps else tools
            tool_choice = "auto" if tools else None
 
-            tokens = self.llm.count_tokens(messages=messages, tools=tools)
-            max_context_size = self.llm.get_context_window_size()
-            maximum_output_token = self.llm.get_maximum_output_token()
-            perf_timing.measure("count tokens")
-
-            if (tokens.total_tokens + maximum_output_token) > max_context_size:
-                logging.warning("Token limit exceeded. Truncating tool responses.")
-                truncated_res = self.truncate_messages_to_fit_context(
-                    messages, max_context_size, maximum_output_token
-                )
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
+
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
            logging.debug(f"sending messages={messages}\n\ntools={tools}")
 
@@ -471,7 +354,6 @@ class ToolCallingLLM:
                # Extract and accumulate cost information
                _process_cost_info(full_response, costs, "LLM call")
 
-                perf_timing.measure("llm.completion")
            # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
            except BadRequestError as e:
                if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -515,8 +397,8 @@ class ToolCallingLLM:
                hasattr(response_message, "reasoning_content")
                and response_message.reasoning_content
            ):
-                logging.debug(
-                    f"[bold {AI_COLOR}]AI (reasoning) 🤔:[/bold {AI_COLOR}] {response_message.reasoning_content}\n"
+                logging.info(
+                    f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
                )
 
            if not tools_to_call:
@@ -539,26 +421,24 @@ class ToolCallingLLM:
                    add_token_count_to_metadata(
                        tokens=tokens,
                        full_llm_response=full_response,
-                        max_context_size=max_context_size,
-                        maximum_output_token=maximum_output_token,
+                        max_context_size=limit_result.max_context_size,
+                        maximum_output_token=limit_result.maximum_output_token,
                        metadata=metadata,
                    )
-                    perf_timing.end(f"- completed in {i} iterations -")
 
                    return LLMResult(
                        result=post_processed_response,
                        unprocessed_result=raw_response,
-                        tool_calls=tool_calls,
+                        tool_calls=all_tool_calls,
                        prompt=json.dumps(messages, indent=2),
                        messages=messages,
                        **costs.model_dump(),  # Include all cost fields
                        metadata=metadata,
                    )
 
-                perf_timing.end(f"- completed in {i} iterations -")
                return LLMResult(
                    result=text_response,
-                    tool_calls=tool_calls,
+                    tool_calls=all_tool_calls,
                    prompt=json.dumps(messages, indent=2),
                    messages=messages,
                    **costs.model_dump(),  # Include all cost fields
@@ -570,7 +450,6 @@ class ToolCallingLLM:
            logging.info(
                f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
            )
-            perf_timing.measure("pre-tool-calls")
            with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
                futures = []
                futures_tool_numbers: dict[
@@ -580,6 +459,7 @@ class ToolCallingLLM:
                for tool_index, t in enumerate(tools_to_call, 1):
                    logging.debug(f"Tool to call: {t}")
                    tool_number = tool_number_offset + tool_index
+
                    future = executor.submit(
                        self._invoke_llm_tool_call,
                        tool_to_call=t,
@@ -612,10 +492,13 @@ class ToolCallingLLM:
                        tool_span, tool_call_result
                    )
 
-                    tool_calls.append(tool_call_result.as_tool_result_response())
+                    tool_result_response_dict = (
+                        tool_call_result.as_tool_result_response()
+                    )
+                    tool_calls.append(tool_result_response_dict)
+                    all_tool_calls.append(tool_result_response_dict)
                    messages.append(tool_call_result.as_tool_call_message())
-
-                    perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+                    tokens = self.llm.count_tokens(messages=messages, tools=tools)
 
            # Update the tool number offset for the next iteration
            tool_number_offset += len(tools_to_call)
@@ -649,7 +532,7 @@ class ToolCallingLLM:
                tool_number=tool_number,
                user_approved=user_approved,
                llm=self.llm,
-                max_token_count=get_max_token_count_for_single_tool(self.llm),
+                max_token_count=self.llm.get_max_token_count_for_single_tool(),
            )
            tool_response = tool.invoke(tool_params, context=invoke_context)
        except Exception as e:
@@ -680,11 +563,13 @@ class ToolCallingLLM:
                f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
            )
 
-        tool_response = prevent_overly_repeated_tool_call(
-            tool_name=tool_name,
-            tool_params=tool_params,
-            tool_calls=previous_tool_calls,
-        )
+        tool_response = None
+        if not user_approved:
+            tool_response = prevent_overly_repeated_tool_call(
+                tool_name=tool_name,
+                tool_params=tool_params,
+                tool_calls=previous_tool_calls,
+            )
 
        if not tool_response:
            tool_response = self._directly_invoke_tool_call(
@@ -871,20 +756,6 @@ class ToolCallingLLM:
            logging.exception("Failed to run post processing", exc_info=True)
            return investigation, 0.0
 
-    @sentry_sdk.trace
-    def truncate_messages_to_fit_context(
-        self, messages: list, max_context_size: int, maximum_output_token: int
-    ) -> TruncationResult:
-        truncated_res = truncate_messages_to_fit_context(
-            messages,
-            max_context_size,
-            maximum_output_token,
-            self.llm.count_tokens,
-        )
-        if truncated_res.truncations:
-            sentry_helper.capture_tool_truncations(truncated_res.truncations)
-        return truncated_res
-
    def call_stream(
        self,
        system_prompt: str = "",
@@ -893,11 +764,19 @@ class ToolCallingLLM:
        sections: Optional[InputSectionsDataType] = None,
        msgs: Optional[list[dict]] = None,
        enable_tool_approval: bool = False,
+        tool_decisions: List[ToolApprovalDecision] | None = None,
    ):
        """
        This function DOES NOT call llm.completion(stream=true).
        This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
        """
+
+        # Process tool decisions if provided
+        if msgs and tool_decisions:
+            logging.info(f"Processing {len(tool_decisions)} tool decisions")
+            msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+            yield from events
+
        messages: list[dict] = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
@@ -905,12 +784,10 @@ class ToolCallingLLM:
            messages.append({"role": "user", "content": user_prompt})
        if msgs:
            messages.extend(msgs)
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
        tool_calls: list[dict] = []
        tools = self.tool_executor.get_all_tools_openai_format(
            target_model=self.llm.model
        )
-        perf_timing.measure("get_all_tools_openai_format")
        max_steps = self.max_steps
        metadata: Dict[Any, Any] = {}
        i = 0
@@ -918,29 +795,23 @@ class ToolCallingLLM:
 
        while i < max_steps:
            i += 1
-            perf_timing.measure(f"start iteration {i}")
            logging.debug(f"running iteration {i}")
 
            tools = None if i == max_steps else tools
            tool_choice = "auto" if tools else None
 
-            tokens = self.llm.count_tokens(messages=messages, tools=tools)  # type: ignore
-            max_context_size = self.llm.get_context_window_size()
-            maximum_output_token = self.llm.get_maximum_output_token()
-            perf_timing.measure("count tokens")
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            yield from limit_result.events
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
 
-            if (tokens.total_tokens + maximum_output_token) > max_context_size:
-                logging.warning("Token limit exceeded. Truncating tool responses.")
-                truncated_res = self.truncate_messages_to_fit_context(
-                    messages, max_context_size, maximum_output_token
-                )
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
-            else:
-                metadata["truncations"] = []
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
            logging.debug(f"sending messages={messages}\n\ntools={tools}")
            try:
@@ -957,7 +828,6 @@ class ToolCallingLLM:
                # Log cost information for this iteration (no accumulation in streaming)
                _process_cost_info(full_response, log_prefix="LLM iteration")
 
-                perf_timing.measure("llm.completion")
            # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
            except BadRequestError as e:
                if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -997,8 +867,8 @@ class ToolCallingLLM:
                add_token_count_to_metadata(
                    tokens=tokens,
                    full_llm_response=full_response,
-                    max_context_size=max_context_size,
-                    maximum_output_token=maximum_output_token,
+                    max_context_size=limit_result.max_context_size,
+                    maximum_output_token=limit_result.maximum_output_token,
                    metadata=metadata,
                )
                yield build_stream_event_token_count(metadata=metadata)
@@ -1027,8 +897,6 @@ class ToolCallingLLM:
                },
            )
 
-            perf_timing.measure("pre-tool-calls")
-
            # Check if any tools require approval first
            pending_approvals = []
            approval_required_tools = []
@@ -1037,6 +905,7 @@ class ToolCallingLLM:
                futures = []
                for tool_index, t in enumerate(tools_to_call, 1):  # type: ignore
                    tool_number = tool_number_offset + tool_index
+
                    future = executor.submit(
                        self._invoke_llm_tool_call,
                        tool_to_call=t,  # type: ignore
@@ -1,14 +1,16 @@
 from typing import Optional
-from holmes.common.env_vars import (
-    TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT,
-    TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_TOKENS,
-)
+from pydantic import BaseModel
 from holmes.core.llm import LLM
 from holmes.core.tools import StructuredToolResultStatus
 from holmes.core.models import ToolCallResult
 from holmes.utils import sentry_helper
 
 
+class ToolCallSizeMetadata(BaseModel):
+    messages_token: int
+    max_tokens_allowed: int
+
+
 def get_pct_token_count(percent_of_total_context_window: float, llm: LLM) -> int:
     context_window_size = llm.get_context_window_size()
 
@@ -18,47 +20,38 @@ def get_pct_token_count(percent_of_total_context_window: float, llm: LLM) -> int
     return context_window_size
 
 
-def get_max_token_count_for_single_tool(llm: LLM) -> int:
-    return min(
-        TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_TOKENS,
-        get_pct_token_count(
-            percent_of_total_context_window=TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT,
-            llm=llm,
-        ),
-    )
-
-
-def prevent_overly_big_tool_response(tool_call_result: ToolCallResult, llm: LLM):
-    max_tokens_allowed = get_max_token_count_for_single_tool(llm)
-
-    message = tool_call_result.as_tool_call_message()
-
-    tokens = llm.count_tokens(messages=[message])
-    messages_token = tokens.total_tokens
-
-    if messages_token > max_tokens_allowed:
-        relative_pct = ((messages_token - max_tokens_allowed) / messages_token) * 100
-
-        error_message: Optional[str] = (
-            f"The tool call result is too large to return: {messages_token} tokens.\nThe maximum allowed tokens is {max_tokens_allowed} which is {format(relative_pct, '.1f')}% smaller.\nInstructions for the LLM: try to repeat the query but proactively narrow down the result so that the tool answer fits within the allowed number of tokens."
+def is_tool_call_too_big(
+    tool_call_result: ToolCallResult, llm: LLM
+) -> tuple[bool, Optional[ToolCallSizeMetadata]]:
+    if tool_call_result.result.status == StructuredToolResultStatus.SUCCESS:
+        message = tool_call_result.as_tool_call_message()
+
+        tokens = llm.count_tokens(messages=[message])
+        max_tokens_allowed = llm.get_max_token_count_for_single_tool()
+        return (
+            tokens.total_tokens > max_tokens_allowed,
+            ToolCallSizeMetadata(
+                messages_token=tokens.total_tokens,
+                max_tokens_allowed=max_tokens_allowed,
+            ),
        )
+    return False, None
 
-        if tool_call_result.result.status == StructuredToolResultStatus.NO_DATA:
-            error_message = None
-            # tool_call_result.result.data is set to None below which is expected to fix the issue
-        elif tool_call_result.result.status == StructuredToolResultStatus.ERROR:
-            original_error = (
-                tool_call_result.result.error
-                or tool_call_result.result.data
-                or "Unknown error"
-            )
-            truncated_error = str(original_error)[:100]
-            error_message = f"The tool call returned an error it is too large to return\nThe following original error is truncated:\n{truncated_error}"
 
+def prevent_overly_big_tool_response(tool_call_result: ToolCallResult, llm: LLM):
+    tool_call_result_is_too_big, metadata = is_tool_call_too_big(
+        tool_call_result=tool_call_result, llm=llm
+    )
+    if tool_call_result_is_too_big and metadata:
+        relative_pct = (
+            (metadata.messages_token - metadata.max_tokens_allowed)
+            / metadata.messages_token
+        ) * 100
+        error_message = f"The tool call result is too large to return: {metadata.messages_token} tokens.\nThe maximum allowed tokens is {metadata.max_tokens_allowed} which is {format(relative_pct, '.1f')}% smaller.\nInstructions for the LLM: try to repeat the query but proactively narrow down the result so that the tool answer fits within the allowed number of tokens."
        tool_call_result.result.status = StructuredToolResultStatus.ERROR
        tool_call_result.result.data = None
        tool_call_result.result.error = error_message
 
        sentry_helper.capture_toolcall_contains_too_many_tokens(
-            tool_call_result, messages_token, max_tokens_allowed
+            tool_call_result, metadata.messages_token, metadata.max_tokens_allowed
        )
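The refactor in this second file splits the size check (is_tool_call_too_big) from the mutation (prevent_overly_big_tool_response), and the per-tool budget now comes from llm.get_max_token_count_for_single_tool() instead of the removed module-level helper. A hedged usage sketch, assuming a ToolCallResult and LLM instance are already in hand (variable names are placeholders):

# Sketch of how the new helpers compose; tool_call_result and llm are placeholders.
too_big, size_info = is_tool_call_too_big(tool_call_result=tool_call_result, llm=llm)
if too_big and size_info:
    print(f"{size_info.messages_token} tokens exceeds {size_info.max_tokens_allowed}")

# Or let the guard rewrite the oversized result in place: it flips the status to
# ERROR, drops the data, and leaves an instruction asking the model to narrow the query.
prevent_overly_big_tool_response(tool_call_result, llm)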