holmesgpt 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt might be problematic.

@@ -7,8 +7,6 @@ from typing import Dict, List, Optional, Type, Union, Callable, Any
 from holmes.core.models import (
     ToolApprovalDecision,
     ToolCallResult,
-    TruncationResult,
-    TruncationMetadata,
     PendingToolApproval,
 )
 
@@ -21,8 +19,8 @@ from pydantic import BaseModel, Field
 from rich.console import Console
 
 from holmes.common.env_vars import (
+    RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
     TEMPERATURE,
-    MAX_OUTPUT_TOKEN_RESERVATION,
     LOG_LLM_USAGE_RESPONSE,
 )
 
@@ -35,7 +33,6 @@ from holmes.core.investigation_structured_output import (
 )
 from holmes.core.issue import Issue
 from holmes.core.llm import LLM
-from holmes.core.performance_timing import PerformanceTiming
 from holmes.core.resource_instruction import ResourceInstructions
 from holmes.core.runbooks import RunbookManager
 from holmes.core.safeguards import prevent_overly_repeated_tool_call
@@ -45,9 +42,11 @@ from holmes.core.tools import (
     ToolInvokeContext,
 )
 from holmes.core.tools_utils.tool_context_window_limiter import (
-    get_max_token_count_for_single_tool,
     prevent_overly_big_tool_response,
 )
+from holmes.core.truncation.input_context_window_limiter import (
+    limit_input_context_window,
+)
 from holmes.plugins.prompts import load_and_render_prompt
 from holmes.utils import sentry_helper
 from holmes.utils.global_instructions import (
@@ -69,9 +68,6 @@ from holmes.utils.stream import (
 cost_logger = logging.getLogger("holmes.costs")
 
 
-TRUNCATION_NOTICE = "\n\n[TRUNCATED]"
-
-
 class LLMCosts(BaseModel):
     """Tracks cost and token usage for LLM calls."""
 
@@ -143,114 +139,6 @@ def _process_cost_info(
         logging.debug(f"Could not extract cost information: {e}")
 
 
-# TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
-# However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
-# We should fix this in the future
-# TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
-# token truncation and not character truncation
-def truncate_messages_to_fit_context(
-    messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
-) -> TruncationResult:
-    """
-    Helper function to truncate tool messages to fit within context limits.
-
-    Args:
-        messages: List of message dictionaries with roles and content
-        max_context_size: Maximum context window size for the model
-        maximum_output_token: Maximum tokens reserved for model output
-        count_tokens_fn: Function to count tokens for a list of messages
-
-    Returns:
-        Modified list of messages with truncated tool responses
-
-    Raises:
-        Exception: If non-tool messages exceed available context space
-    """
-    messages_except_tools = [
-        message for message in messages if message["role"] != "tool"
-    ]
-    tokens = count_tokens_fn(messages_except_tools)
-    message_size_without_tools = tokens.total_tokens
-
-    tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
-    reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
-    if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
-        logging.error(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
-        )
-        raise Exception(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
-        )
-
-    if len(tool_call_messages) == 0:
-        return TruncationResult(truncated_messages=messages, truncations=[])
-
-    available_space = (
-        max_context_size - message_size_without_tools - reserved_for_output_tokens
-    )
-    remaining_space = available_space
-    tool_call_messages.sort(
-        key=lambda x: count_tokens_fn(
-            [{"role": "tool", "content": x["content"]}]
-        ).total_tokens
-    )
-
-    truncations = []
-
-    # Allocate space starting with small tools and going to larger tools, while maintaining fairness
-    # Small tools can often get exactly what they need, while larger tools may need to be truncated
-    # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
-    for i, msg in enumerate(tool_call_messages):
-        remaining_tools = len(tool_call_messages) - i
-        max_allocation = remaining_space // remaining_tools
-        needed_space = count_tokens_fn(
-            [{"role": "tool", "content": msg["content"]}]
-        ).total_tokens
-        allocated_space = min(needed_space, max_allocation)
-
-        if needed_space > allocated_space:
-            truncation_metadata = _truncate_tool_message(
-                msg, allocated_space, needed_space
-            )
-            truncations.append(truncation_metadata)
-
-        remaining_space -= allocated_space
-    return TruncationResult(truncated_messages=messages, truncations=truncations)
-
-
-def _truncate_tool_message(
-    msg: dict, allocated_space: int, needed_space: int
-) -> TruncationMetadata:
-    msg_content = msg["content"]
-    tool_call_id = msg["tool_call_id"]
-    tool_name = msg["name"]
-
-    # Ensure the indicator fits in the allocated space
-    if allocated_space > len(TRUNCATION_NOTICE):
-        original = msg_content if isinstance(msg_content, str) else str(msg_content)
-        msg["content"] = (
-            original[: allocated_space - len(TRUNCATION_NOTICE)] + TRUNCATION_NOTICE
-        )
-        end_index = allocated_space - len(TRUNCATION_NOTICE)
-    else:
-        msg["content"] = TRUNCATION_NOTICE[:allocated_space]
-        end_index = allocated_space
-
-    msg.pop("token_count", None)  # Remove token_count if present
-    logging.info(
-        f"Truncating tool message '{tool_name}' from {needed_space} to {allocated_space} tokens"
-    )
-    truncation_metadata = TruncationMetadata(
-        tool_call_id=tool_call_id,
-        start_index=0,
-        end_index=end_index,
-        tool_name=tool_name,
-        original_token_count=needed_space,
-    )
-    return truncation_metadata
-
-
 class LLMResult(LLMCosts):
     tool_calls: Optional[List[ToolCallResult]] = None
     result: Optional[str] = None
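The removed allocator sorted tool messages from smallest to largest and capped each one at an even share of whatever space was still unallocated, so small responses were kept whole and only the largest ones were cut. A rough standalone illustration of that scheme (plain Python with made-up token counts, not code from the package):

# Illustration only: three tool responses needing 100, 500 and 2000 tokens,
# with 1200 tokens of context left for all tool output combined.
needs = [100, 500, 2000]  # already sorted smallest to largest
remaining = 1200
for i, needed in enumerate(needs):
    cap = remaining // (len(needs) - i)  # even share of what is still free
    allocated = min(needed, cap)
    print(f"tool {i}: needs {needed}, allocated {allocated}")
    remaining -= allocated
# tool 0: needs 100, allocated 100
# tool 1: needs 500, allocated 500
# tool 2: needs 2000, allocated 600   (only this one is truncated)

Headroom left over by small tools rolls forward to the larger ones, which is the fairness-plus-utilization property the removed comments describe. In 0.16.0 this whole path is replaced by the limit_input_context_window helper below.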
@@ -289,7 +177,7 @@ class ToolCallingLLM:
 
     def process_tool_decisions(
         self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
-    ) -> List[Dict[str, Any]]:
+    ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
         """
         Process tool approval decisions and execute approved tools.
 
@@ -300,8 +188,9 @@ class ToolCallingLLM:
         Returns:
             Updated messages list with tool execution results
         """
+        events: list[StreamMessage] = []
         if not tool_decisions:
-            return messages
+            return messages, events
 
         # Create decision lookup
         decisions_by_tool_call_id = {
@@ -332,40 +221,39 @@ class ToolCallingLLM:
            error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
            logging.error(error_message)
            raise Exception(error_message)
-
        for tool_call_with_decision in pending_tool_calls:
            tool_call_message: dict
            tool_call = tool_call_with_decision.tool_call
            decision = tool_call_with_decision.decision
+            tool_result: Optional[ToolCallResult] = None
            if decision and decision.approved:
-                try:
-                    llm_tool_result = self._invoke_llm_tool_call(
-                        tool_to_call=tool_call,
-                        previous_tool_calls=[],
-                        trace_span=DummySpan(),  # TODO: replace with proper span
-                        tool_number=None,
-                        user_approved=True,
-                    )
-                    tool_call_message = llm_tool_result.as_tool_call_message()
-
-                except Exception as e:
-                    logging.error(
-                        f"Failed to execute approved tool {tool_call.id}: {e}"
-                    )
-                    tool_call_message = {
-                        "tool_call_id": tool_call.id,
-                        "role": "tool",
-                        "name": tool_call.function.name,
-                        "content": f"Tool execution failed: {str(e)}",
-                    }
+                tool_result = self._invoke_llm_tool_call(
+                    tool_to_call=tool_call,
+                    previous_tool_calls=[],
+                    trace_span=DummySpan(),  # TODO: replace with proper span
+                    tool_number=None,
+                    user_approved=True,
+                )
            else:
                # Tool was rejected or no decision found, add rejection message
-                tool_call_message = {
-                    "tool_call_id": tool_call.id,
-                    "role": "tool",
-                    "name": tool_call.function.name,
-                    "content": "Tool execution was denied by the user.",
-                }
+                tool_result = ToolCallResult(
+                    tool_call_id=tool_call.id,
+                    tool_name=tool_call.function.name,
+                    description=tool_call.function.name,
+                    result=StructuredToolResult(
+                        status=StructuredToolResultStatus.ERROR,
+                        error="Tool execution was denied by the user.",
+                    ),
+                )
+
+            events.append(
+                StreamMessage(
+                    event=StreamEvents.TOOL_RESULT,
+                    data=tool_result.as_streaming_tool_result_response(),
+                )
+            )
+
+            tool_call_message = tool_result.as_tool_call_message()
 
            # It is expected that the tool call result directly follows the tool call request from the LLM
            # The API call may contain a user ask which is appended to the messages so we can't just append
@@ -374,7 +262,7 @@ class ToolCallingLLM:
                tool_call_with_decision.message_index + 1, tool_call_message
            )
 
-        return messages
+        return messages, events
 
    def prompt_call(
        self,
@@ -420,40 +308,35 @@ class ToolCallingLLM:
        trace_span=DummySpan(),
        tool_number_offset: int = 0,
    ) -> LLMResult:
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
-        tool_calls = []  # type: ignore
+        tool_calls: list[
+            dict
+        ] = []  # Used for preventing repeated tool calls. potentially reset after compaction
+        all_tool_calls = []  # type: ignore
        costs = LLMCosts()
-
        tools = self.tool_executor.get_all_tools_openai_format(
            target_model=self.llm.model
        )
-        perf_timing.measure("get_all_tools_openai_format")
        max_steps = self.max_steps
        i = 0
        metadata: Dict[Any, Any] = {}
        while i < max_steps:
            i += 1
-            perf_timing.measure(f"start iteration {i}")
            logging.debug(f"running iteration {i}")
            # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
            tools = None if i == max_steps else tools
            tool_choice = "auto" if tools else None
 
-            tokens = self.llm.count_tokens(messages=messages, tools=tools)
-            max_context_size = self.llm.get_context_window_size()
-            maximum_output_token = self.llm.get_maximum_output_token()
-            perf_timing.measure("count tokens")
-
-            if (tokens.total_tokens + maximum_output_token) > max_context_size:
-                logging.warning("Token limit exceeded. Truncating tool responses.")
-                truncated_res = self.truncate_messages_to_fit_context(
-                    messages, max_context_size, maximum_output_token
-                )
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
+
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
            logging.debug(f"sending messages={messages}\n\ntools={tools}")
 
@@ -471,7 +354,6 @@ class ToolCallingLLM:
                # Extract and accumulate cost information
                _process_cost_info(full_response, costs, "LLM call")
 
-                perf_timing.measure("llm.completion")
            # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
            except BadRequestError as e:
                if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -515,8 +397,8 @@ class ToolCallingLLM:
                hasattr(response_message, "reasoning_content")
                and response_message.reasoning_content
            ):
-                logging.debug(
-                    f"[bold {AI_COLOR}]AI (reasoning) 🤔:[/bold {AI_COLOR}] {response_message.reasoning_content}\n"
+                logging.info(
+                    f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
                )
 
            if not tools_to_call:
@@ -539,26 +421,24 @@ class ToolCallingLLM:
                    add_token_count_to_metadata(
                        tokens=tokens,
                        full_llm_response=full_response,
-                        max_context_size=max_context_size,
-                        maximum_output_token=maximum_output_token,
+                        max_context_size=limit_result.max_context_size,
+                        maximum_output_token=limit_result.maximum_output_token,
                        metadata=metadata,
                    )
-                    perf_timing.end(f"- completed in {i} iterations -")
 
                    return LLMResult(
                        result=post_processed_response,
                        unprocessed_result=raw_response,
-                        tool_calls=tool_calls,
+                        tool_calls=all_tool_calls,
                        prompt=json.dumps(messages, indent=2),
                        messages=messages,
                        **costs.model_dump(),  # Include all cost fields
                        metadata=metadata,
                    )
 
-                perf_timing.end(f"- completed in {i} iterations -")
                return LLMResult(
                    result=text_response,
-                    tool_calls=tool_calls,
+                    tool_calls=all_tool_calls,
                    prompt=json.dumps(messages, indent=2),
                    messages=messages,
                    **costs.model_dump(),  # Include all cost fields
@@ -570,7 +450,6 @@ class ToolCallingLLM:
            logging.info(
                f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
            )
-            perf_timing.measure("pre-tool-calls")
            with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
                futures = []
                futures_tool_numbers: dict[
@@ -580,6 +459,7 @@ class ToolCallingLLM:
                for tool_index, t in enumerate(tools_to_call, 1):
                    logging.debug(f"Tool to call: {t}")
                    tool_number = tool_number_offset + tool_index
+
                    future = executor.submit(
                        self._invoke_llm_tool_call,
                        tool_to_call=t,
@@ -612,10 +492,13 @@ class ToolCallingLLM:
                        tool_span, tool_call_result
                    )
 
-                    tool_calls.append(tool_call_result.as_tool_result_response())
+                    tool_result_response_dict = (
+                        tool_call_result.as_tool_result_response()
+                    )
+                    tool_calls.append(tool_result_response_dict)
+                    all_tool_calls.append(tool_result_response_dict)
                    messages.append(tool_call_result.as_tool_call_message())
-
-                    perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+                    tokens = self.llm.count_tokens(messages=messages, tools=tools)
 
            # Update the tool number offset for the next iteration
            tool_number_offset += len(tools_to_call)
@@ -649,7 +532,7 @@ class ToolCallingLLM:
                tool_number=tool_number,
                user_approved=user_approved,
                llm=self.llm,
-                max_token_count=get_max_token_count_for_single_tool(self.llm),
+                max_token_count=self.llm.get_max_token_count_for_single_tool(),
            )
            tool_response = tool.invoke(tool_params, context=invoke_context)
        except Exception as e:
@@ -680,11 +563,13 @@ class ToolCallingLLM:
                f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
            )
 
-        tool_response = prevent_overly_repeated_tool_call(
-            tool_name=tool_name,
-            tool_params=tool_params,
-            tool_calls=previous_tool_calls,
-        )
+        tool_response = None
+        if not user_approved:
+            tool_response = prevent_overly_repeated_tool_call(
+                tool_name=tool_name,
+                tool_params=tool_params,
+                tool_calls=previous_tool_calls,
+            )
 
        if not tool_response:
            tool_response = self._directly_invoke_tool_call(
@@ -871,20 +756,6 @@ class ToolCallingLLM:
            logging.exception("Failed to run post processing", exc_info=True)
            return investigation, 0.0
 
-    @sentry_sdk.trace
-    def truncate_messages_to_fit_context(
-        self, messages: list, max_context_size: int, maximum_output_token: int
-    ) -> TruncationResult:
-        truncated_res = truncate_messages_to_fit_context(
-            messages,
-            max_context_size,
-            maximum_output_token,
-            self.llm.count_tokens,
-        )
-        if truncated_res.truncations:
-            sentry_helper.capture_tool_truncations(truncated_res.truncations)
-        return truncated_res
-
    def call_stream(
        self,
        system_prompt: str = "",
@@ -893,11 +764,19 @@ class ToolCallingLLM:
        sections: Optional[InputSectionsDataType] = None,
        msgs: Optional[list[dict]] = None,
        enable_tool_approval: bool = False,
+        tool_decisions: List[ToolApprovalDecision] | None = None,
    ):
        """
        This function DOES NOT call llm.completion(stream=true).
        This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
        """
+
+        # Process tool decisions if provided
+        if msgs and tool_decisions:
+            logging.info(f"Processing {len(tool_decisions)} tool decisions")
+            msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+            yield from events
+
        messages: list[dict] = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
@@ -905,12 +784,10 @@ class ToolCallingLLM:
            messages.append({"role": "user", "content": user_prompt})
        if msgs:
            messages.extend(msgs)
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
        tool_calls: list[dict] = []
        tools = self.tool_executor.get_all_tools_openai_format(
            target_model=self.llm.model
        )
-        perf_timing.measure("get_all_tools_openai_format")
        max_steps = self.max_steps
        metadata: Dict[Any, Any] = {}
        i = 0
@@ -918,29 +795,23 @@ class ToolCallingLLM:
 
        while i < max_steps:
            i += 1
-            perf_timing.measure(f"start iteration {i}")
            logging.debug(f"running iteration {i}")
 
            tools = None if i == max_steps else tools
            tool_choice = "auto" if tools else None
 
-            tokens = self.llm.count_tokens(messages=messages, tools=tools)  # type: ignore
-            max_context_size = self.llm.get_context_window_size()
-            maximum_output_token = self.llm.get_maximum_output_token()
-            perf_timing.measure("count tokens")
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            yield from limit_result.events
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
 
-            if (tokens.total_tokens + maximum_output_token) > max_context_size:
-                logging.warning("Token limit exceeded. Truncating tool responses.")
-                truncated_res = self.truncate_messages_to_fit_context(
-                    messages, max_context_size, maximum_output_token
-                )
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
-            else:
-                metadata["truncations"] = []
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
            logging.debug(f"sending messages={messages}\n\ntools={tools}")
            try:
@@ -957,7 +828,6 @@ class ToolCallingLLM:
                # Log cost information for this iteration (no accumulation in streaming)
                _process_cost_info(full_response, log_prefix="LLM iteration")
 
-                perf_timing.measure("llm.completion")
            # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
            except BadRequestError as e:
                if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -997,8 +867,8 @@ class ToolCallingLLM:
                add_token_count_to_metadata(
                    tokens=tokens,
                    full_llm_response=full_response,
-                    max_context_size=max_context_size,
-                    maximum_output_token=maximum_output_token,
+                    max_context_size=limit_result.max_context_size,
+                    maximum_output_token=limit_result.maximum_output_token,
                    metadata=metadata,
                )
                yield build_stream_event_token_count(metadata=metadata)
@@ -1027,8 +897,6 @@ class ToolCallingLLM:
                },
            )
 
-            perf_timing.measure("pre-tool-calls")
-
            # Check if any tools require approval first
            pending_approvals = []
            approval_required_tools = []
@@ -1037,6 +905,7 @@ class ToolCallingLLM:
                futures = []
                for tool_index, t in enumerate(tools_to_call, 1):  # type: ignore
                    tool_number = tool_number_offset + tool_index
+
                    future = executor.submit(
                        self._invoke_llm_tool_call,
                        tool_to_call=t,  # type: ignore
@@ -1,14 +1,16 @@
 from typing import Optional
-from holmes.common.env_vars import (
-    TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT,
-    TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_TOKENS,
-)
+from pydantic import BaseModel
 from holmes.core.llm import LLM
 from holmes.core.tools import StructuredToolResultStatus
 from holmes.core.models import ToolCallResult
 from holmes.utils import sentry_helper
 
 
+class ToolCallSizeMetadata(BaseModel):
+    messages_token: int
+    max_tokens_allowed: int
+
+
 def get_pct_token_count(percent_of_total_context_window: float, llm: LLM) -> int:
     context_window_size = llm.get_context_window_size()
 
@@ -18,47 +20,38 @@ def get_pct_token_count(percent_of_total_context_window: float, llm: LLM) -> int
     return context_window_size
 
 
-def get_max_token_count_for_single_tool(llm: LLM) -> int:
-    return min(
-        TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_TOKENS,
-        get_pct_token_count(
-            percent_of_total_context_window=TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT,
-            llm=llm,
-        ),
-    )
-
-
-def prevent_overly_big_tool_response(tool_call_result: ToolCallResult, llm: LLM):
-    max_tokens_allowed = get_max_token_count_for_single_tool(llm)
-
-    message = tool_call_result.as_tool_call_message()
-
-    tokens = llm.count_tokens(messages=[message])
-    messages_token = tokens.total_tokens
-
-    if messages_token > max_tokens_allowed:
-        relative_pct = ((messages_token - max_tokens_allowed) / messages_token) * 100
-
-        error_message: Optional[str] = (
-            f"The tool call result is too large to return: {messages_token} tokens.\nThe maximum allowed tokens is {max_tokens_allowed} which is {format(relative_pct, '.1f')}% smaller.\nInstructions for the LLM: try to repeat the query but proactively narrow down the result so that the tool answer fits within the allowed number of tokens."
+def is_tool_call_too_big(
+    tool_call_result: ToolCallResult, llm: LLM
+) -> tuple[bool, Optional[ToolCallSizeMetadata]]:
+    if tool_call_result.result.status == StructuredToolResultStatus.SUCCESS:
+        message = tool_call_result.as_tool_call_message()
+
+        tokens = llm.count_tokens(messages=[message])
+        max_tokens_allowed = llm.get_max_token_count_for_single_tool()
+        return (
+            tokens.total_tokens > max_tokens_allowed,
+            ToolCallSizeMetadata(
+                messages_token=tokens.total_tokens,
+                max_tokens_allowed=max_tokens_allowed,
+            ),
        )
+    return False, None
 
-        if tool_call_result.result.status == StructuredToolResultStatus.NO_DATA:
-            error_message = None
-            # tool_call_result.result.data is set to None below which is expected to fix the issue
-        elif tool_call_result.result.status == StructuredToolResultStatus.ERROR:
-            original_error = (
-                tool_call_result.result.error
-                or tool_call_result.result.data
-                or "Unknown error"
-            )
-            truncated_error = str(original_error)[:100]
-            error_message = f"The tool call returned an error it is too large to return\nThe following original error is truncated:\n{truncated_error}"
 
+def prevent_overly_big_tool_response(tool_call_result: ToolCallResult, llm: LLM):
+    tool_call_result_is_too_big, metadata = is_tool_call_too_big(
+        tool_call_result=tool_call_result, llm=llm
+    )
+    if tool_call_result_is_too_big and metadata:
+        relative_pct = (
+            (metadata.messages_token - metadata.max_tokens_allowed)
+            / metadata.messages_token
+        ) * 100
+        error_message = f"The tool call result is too large to return: {metadata.messages_token} tokens.\nThe maximum allowed tokens is {metadata.max_tokens_allowed} which is {format(relative_pct, '.1f')}% smaller.\nInstructions for the LLM: try to repeat the query but proactively narrow down the result so that the tool answer fits within the allowed number of tokens."
        tool_call_result.result.status = StructuredToolResultStatus.ERROR
        tool_call_result.result.data = None
        tool_call_result.result.error = error_message
 
        sentry_helper.capture_toolcall_contains_too_many_tokens(
-            tool_call_result, messages_token, max_tokens_allowed
+            tool_call_result, metadata.messages_token, metadata.max_tokens_allowed
        )
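The refactor in this second file splits the size check (is_tool_call_too_big) from the mutation (prevent_overly_big_tool_response), and the per-tool budget now comes from llm.get_max_token_count_for_single_tool() instead of the removed module-level helper. A hedged usage sketch, assuming a ToolCallResult and LLM instance are already in hand (variable names are placeholders):

# Sketch of how the new helpers compose; tool_call_result and llm are placeholders.
too_big, size_info = is_tool_call_too_big(tool_call_result=tool_call_result, llm=llm)
if too_big and size_info:
    print(f"{size_info.messages_token} tokens exceeds {size_info.max_tokens_allowed}")

# Or let the guard rewrite the oversized result in place: it flips the status to
# ERROR, drops the data, and leaves an instruction asking the model to narrow the query.
prevent_overly_big_tool_response(tool_call_result, llm)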