holmesgpt 0.14.4a0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +12 -10
  3. holmes/common/env_vars.py +22 -0
  4. holmes/config.py +51 -4
  5. holmes/core/conversations.py +3 -2
  6. holmes/core/llm.py +226 -72
  7. holmes/core/openai_formatting.py +13 -0
  8. holmes/core/supabase_dal.py +33 -42
  9. holmes/core/tool_calling_llm.py +185 -282
  10. holmes/core/tools.py +21 -1
  11. holmes/core/tools_utils/token_counting.py +2 -1
  12. holmes/core/tools_utils/tool_context_window_limiter.py +32 -30
  13. holmes/core/truncation/compaction.py +59 -0
  14. holmes/core/truncation/input_context_window_limiter.py +218 -0
  15. holmes/interactive.py +17 -7
  16. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  17. holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
  18. holmes/plugins/toolsets/__init__.py +4 -0
  19. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +0 -1
  20. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
  21. holmes/plugins/toolsets/grafana/grafana_api.py +1 -1
  22. holmes/plugins/toolsets/investigator/core_investigation.py +34 -24
  23. holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  24. holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
  25. holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
  26. holmes/plugins/toolsets/prometheus/prometheus.py +1 -1
  27. holmes/plugins/toolsets/robusta/robusta.py +35 -8
  28. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +4 -3
  29. holmes/plugins/toolsets/service_discovery.py +1 -1
  30. holmes/plugins/toolsets/servicenow/servicenow.py +0 -1
  31. holmes/utils/stream.py +31 -1
  32. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/METADATA +6 -2
  33. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/RECORD +36 -31
  34. holmes/core/performance_timing.py +0 -72
  35. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/LICENSE.txt +0 -0
  36. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/WHEEL +0 -0
  37. {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/entry_points.txt +0 -0
@@ -7,8 +7,6 @@ from typing import Dict, List, Optional, Type, Union, Callable, Any
 from holmes.core.models import (
     ToolApprovalDecision,
     ToolCallResult,
-    TruncationResult,
-    TruncationMetadata,
     PendingToolApproval,
 )
 
@@ -21,8 +19,8 @@ from pydantic import BaseModel, Field
 from rich.console import Console
 
 from holmes.common.env_vars import (
+    RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
     TEMPERATURE,
-    MAX_OUTPUT_TOKEN_RESERVATION,
     LOG_LLM_USAGE_RESPONSE,
 )
 
@@ -34,8 +32,7 @@ from holmes.core.investigation_structured_output import (
     is_response_an_incorrect_tool_call,
 )
 from holmes.core.issue import Issue
-from holmes.core.llm import LLM, get_llm_usage
-from holmes.core.performance_timing import PerformanceTiming
+from holmes.core.llm import LLM
 from holmes.core.resource_instruction import ResourceInstructions
 from holmes.core.runbooks import RunbookManager
 from holmes.core.safeguards import prevent_overly_repeated_tool_call
@@ -45,9 +42,11 @@ from holmes.core.tools import (
     ToolInvokeContext,
 )
 from holmes.core.tools_utils.tool_context_window_limiter import (
-    get_max_token_count_for_single_tool,
     prevent_overly_big_tool_response,
 )
+from holmes.core.truncation.input_context_window_limiter import (
+    limit_input_context_window,
+)
 from holmes.plugins.prompts import load_and_render_prompt
 from holmes.utils import sentry_helper
 from holmes.utils.global_instructions import (
@@ -58,15 +57,17 @@ from holmes.utils.tags import format_tags_in_string, parse_messages_tags
 from holmes.core.tools_utils.tool_executor import ToolExecutor
 from holmes.core.tracing import DummySpan
 from holmes.utils.colors import AI_COLOR
-from holmes.utils.stream import StreamEvents, StreamMessage
+from holmes.utils.stream import (
+    StreamEvents,
+    StreamMessage,
+    add_token_count_to_metadata,
+    build_stream_event_token_count,
+)
 
 # Create a named logger for cost tracking
 cost_logger = logging.getLogger("holmes.costs")
 
 
-TRUNCATION_NOTICE = "\n\n[TRUNCATED]"
-
-
 class LLMCosts(BaseModel):
     """Tracks cost and token usage for LLM calls."""
 
@@ -138,109 +139,6 @@ def _process_cost_info(
         logging.debug(f"Could not extract cost information: {e}")
 
 
-# TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
-# However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
-# We should fix this in the future
-# TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
-# token truncation and not character truncation
-def truncate_messages_to_fit_context(
-    messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
-) -> TruncationResult:
-    """
-    Helper function to truncate tool messages to fit within context limits.
-
-    Args:
-        messages: List of message dictionaries with roles and content
-        max_context_size: Maximum context window size for the model
-        maximum_output_token: Maximum tokens reserved for model output
-        count_tokens_fn: Function to count tokens for a list of messages
-
-    Returns:
-        Modified list of messages with truncated tool responses
-
-    Raises:
-        Exception: If non-tool messages exceed available context space
-    """
-    messages_except_tools = [
-        message for message in messages if message["role"] != "tool"
-    ]
-    message_size_without_tools = count_tokens_fn(messages_except_tools)
-
-    tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
-    reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
-    if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
-        logging.error(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
-        )
-        raise Exception(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
-        )
-
-    if len(tool_call_messages) == 0:
-        return TruncationResult(truncated_messages=messages, truncations=[])
-
-    available_space = (
-        max_context_size - message_size_without_tools - reserved_for_output_tokens
-    )
-    remaining_space = available_space
-    tool_call_messages.sort(
-        key=lambda x: count_tokens_fn([{"role": "tool", "content": x["content"]}])
-    )
-
-    truncations = []
-
-    # Allocate space starting with small tools and going to larger tools, while maintaining fairness
-    # Small tools can often get exactly what they need, while larger tools may need to be truncated
-    # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
-    for i, msg in enumerate(tool_call_messages):
-        remaining_tools = len(tool_call_messages) - i
-        max_allocation = remaining_space // remaining_tools
-        needed_space = count_tokens_fn([{"role": "tool", "content": msg["content"]}])
-        allocated_space = min(needed_space, max_allocation)
-
-        if needed_space > allocated_space:
-            truncation_metadata = _truncate_tool_message(
-                msg, allocated_space, needed_space
-            )
-            truncations.append(truncation_metadata)
-
-        remaining_space -= allocated_space
-    return TruncationResult(truncated_messages=messages, truncations=truncations)
-
-
-def _truncate_tool_message(
-    msg: dict, allocated_space: int, needed_space: int
-) -> TruncationMetadata:
-    msg_content = msg["content"]
-    tool_call_id = msg["tool_call_id"]
-    tool_name = msg["name"]
-
-    # Ensure the indicator fits in the allocated space
-    if allocated_space > len(TRUNCATION_NOTICE):
-        original = msg_content if isinstance(msg_content, str) else str(msg_content)
-        msg["content"] = (
-            original[: allocated_space - len(TRUNCATION_NOTICE)] + TRUNCATION_NOTICE
-        )
-        end_index = allocated_space - len(TRUNCATION_NOTICE)
-    else:
-        msg["content"] = TRUNCATION_NOTICE[:allocated_space]
-        end_index = allocated_space
-
-    msg.pop("token_count", None)  # Remove token_count if present
-    logging.info(
-        f"Truncating tool message '{tool_name}' from {needed_space} to {allocated_space} tokens"
-    )
-    truncation_metadata = TruncationMetadata(
-        tool_call_id=tool_call_id,
-        start_index=0,
-        end_index=end_index,
-        tool_name=tool_name,
-        original_token_count=needed_space,
-    )
-    return truncation_metadata
-
-
 class LLMResult(LLMCosts):
     tool_calls: Optional[List[ToolCallResult]] = None
     result: Optional[str] = None
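
Note on the removed helper above: its allocation loop hands out the remaining token budget smallest-message-first, capping each tool response at an even share of whatever is still unallocated. A minimal, self-contained sketch of that arithmetic, using made-up token counts rather than the package's tokenizer:

    # Toy restatement of the fair-allocation loop from the removed
    # truncate_messages_to_fit_context(); the numbers are illustrative only.
    def allocate(needed: list[int], budget: int) -> list[int]:
        allocations = []
        remaining = budget
        for i, need in enumerate(sorted(needed)):
            share = remaining // (len(needed) - i)  # even share of what is left
            allocated = min(need, share)  # small responses get all they need
            allocations.append(allocated)
            remaining -= allocated
        return allocations

    # Three tool responses needing 100, 500 and 2000 tokens with 900 tokens free:
    # the smallest fits entirely, the two larger ones split the leftover evenly.
    print(allocate([100, 500, 2000], 900))  # [100, 400, 400]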
@@ -257,6 +155,12 @@ class LLMResult(LLMCosts):
     )
 
 
+class ToolCallWithDecision(BaseModel):
+    message_index: int
+    tool_call: ChatCompletionMessageToolCall
+    decision: Optional[ToolApprovalDecision]
+
+
 class ToolCallingLLM:
     llm: LLM
 
@@ -273,7 +177,7 @@ class ToolCallingLLM:
 
     def process_tool_decisions(
         self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
-    ) -> List[Dict[str, Any]]:
+    ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
         """
         Process tool approval decisions and execute approved tools.
 
@@ -284,85 +188,81 @@
         Returns:
             Updated messages list with tool execution results
         """
-        # Import here to avoid circular imports
-
-        # Find the last message with pending approvals
-        pending_message_idx = None
-        pending_tool_calls = None
-
-        for i in reversed(range(len(messages))):
-            msg = messages[i]
-            if msg.get("role") == "assistant" and msg.get("pending_approval"):
-                pending_message_idx = i
-                pending_tool_calls = msg.get("tool_calls", [])
-                break
-
-        if pending_message_idx is None or not pending_tool_calls:
-            # No pending approvals found
-            if tool_decisions:
-                logging.warning(
-                    f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
-                )
-            return messages
+        events: list[StreamMessage] = []
+        if not tool_decisions:
+            return messages, events
 
         # Create decision lookup
-        decisions_by_id = {
+        decisions_by_tool_call_id = {
             decision.tool_call_id: decision for decision in tool_decisions
         }
 
-        # Validate that all decisions have corresponding pending tool calls
-        pending_tool_ids = {tool_call["id"] for tool_call in pending_tool_calls}
-        invalid_decisions = [
-            decision.tool_call_id
-            for decision in tool_decisions
-            if decision.tool_call_id not in pending_tool_ids
-        ]
-
-        if invalid_decisions:
-            logging.warning(
-                f"Received decisions for non-pending tool calls: {invalid_decisions}"
-            )
+        pending_tool_calls: list[ToolCallWithDecision] = []
 
-        # Process each tool call
-        for tool_call in pending_tool_calls:
-            tool_call_id = tool_call["id"]
-            decision = decisions_by_id.get(tool_call_id)
+        for i in reversed(range(len(messages))):
+            msg = messages[i]
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                message_tool_calls = msg.get("tool_calls", [])
+                for tool_call in message_tool_calls:
+                    decision = decisions_by_tool_call_id.get(tool_call.get("id"), None)
+                    if tool_call.get("pending_approval"):
+                        del tool_call[
+                            "pending_approval"
+                        ]  # Cleanup so that a pending approval is not tagged on message in a future response
+                    pending_tool_calls.append(
+                        ToolCallWithDecision(
+                            tool_call=ChatCompletionMessageToolCall(**tool_call),
+                            decision=decision,
+                            message_index=i,
+                        )
+                    )
 
+        if not pending_tool_calls:
+            error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
+            logging.error(error_message)
+            raise Exception(error_message)
+        for tool_call_with_decision in pending_tool_calls:
+            tool_call_message: dict
+            tool_call = tool_call_with_decision.tool_call
+            decision = tool_call_with_decision.decision
+            tool_result: Optional[ToolCallResult] = None
             if decision and decision.approved:
-                try:
-                    tool_call_obj = ChatCompletionMessageToolCall(**tool_call)
-                    llm_tool_result = self._invoke_llm_tool_call(
-                        tool_to_call=tool_call_obj,
-                        previous_tool_calls=[],
-                        trace_span=DummySpan(),
-                        tool_number=None,
-                    )
-                    messages.append(llm_tool_result.as_tool_call_message())
-
-                except Exception as e:
-                    logging.error(
-                        f"Failed to execute approved tool {tool_call_id}: {e}"
-                    )
-                    messages.append(
-                        {
-                            "tool_call_id": tool_call_id,
-                            "role": "tool",
-                            "name": tool_call["function"]["name"],
-                            "content": f"Tool execution failed: {str(e)}",
-                        }
-                    )
+                tool_result = self._invoke_llm_tool_call(
+                    tool_to_call=tool_call,
+                    previous_tool_calls=[],
+                    trace_span=DummySpan(),  # TODO: replace with proper span
+                    tool_number=None,
+                    user_approved=True,
+                )
             else:
                 # Tool was rejected or no decision found, add rejection message
-                messages.append(
-                    {
-                        "tool_call_id": tool_call_id,
-                        "role": "tool",
-                        "name": tool_call["function"]["name"],
-                        "content": "Tool execution was denied by the user.",
-                    }
+                tool_result = ToolCallResult(
+                    tool_call_id=tool_call.id,
+                    tool_name=tool_call.function.name,
+                    description=tool_call.function.name,
+                    result=StructuredToolResult(
+                        status=StructuredToolResultStatus.ERROR,
+                        error="Tool execution was denied by the user.",
+                    ),
                 )
 
-        return messages
+            events.append(
+                StreamMessage(
+                    event=StreamEvents.TOOL_RESULT,
+                    data=tool_result.as_streaming_tool_result_response(),
+                )
+            )
+
+            tool_call_message = tool_result.as_tool_call_message()
+
+            # It is expected that the tool call result directly follows the tool call request from the LLM
+            # The API call may contain a user ask which is appended to the messages so we can't just append
+            # tool call results; they need to be inserted right after the llm's message requesting tool calls
+            messages.insert(
                tool_call_with_decision.message_index + 1, tool_call_message
+            )
+
+        return messages, events
 
     def prompt_call(
         self,
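
The insertion step in the rewritten process_tool_decisions exists because the OpenAI-style chat format expects each tool result to directly follow the assistant message that requested it, and by the time a decision arrives the client may already have appended a newer user message. A minimal sketch of the ordering being preserved (tool name and contents are placeholders, not taken from this package):

    # An approval-pending tool call at index 0, with a user follow-up already appended.
    messages = [
        {
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": "call_1",
                    "type": "function",
                    "function": {"name": "example_tool", "arguments": "{}"},
                }
            ],
        },
        {"role": "user", "content": "any update?"},
    ]

    tool_result = {
        "role": "tool",
        "tool_call_id": "call_1",
        "name": "example_tool",
        "content": "tool output",
    }

    # Appending would place the result after the user message, which is invalid;
    # it must be inserted directly after the assistant message that requested it.
    messages.insert(0 + 1, tool_result)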
@@ -408,40 +308,35 @@
         trace_span=DummySpan(),
         tool_number_offset: int = 0,
     ) -> LLMResult:
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
-        tool_calls = []  # type: ignore
+        tool_calls: list[
+            dict
+        ] = []  # Used for preventing repeated tool calls. potentially reset after compaction
+        all_tool_calls = []  # type: ignore
         costs = LLMCosts()
-
         tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         i = 0
         metadata: Dict[Any, Any] = {}
         while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")
             # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None
 
-            total_tokens = self.llm.count_tokens_for_message(messages)
-            max_context_size = self.llm.get_context_window_size()
-            maximum_output_token = self.llm.get_maximum_output_token()
-            perf_timing.measure("count tokens")
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
 
-            if (total_tokens + maximum_output_token) > max_context_size:
-                logging.warning("Token limit exceeded. Truncating tool responses.")
-                truncated_res = self.truncate_messages_to_fit_context(
-                    messages, max_context_size, maximum_output_token
-                )
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
             logging.debug(f"sending messages={messages}\n\ntools={tools}")
 
@@ -459,7 +354,6 @@
                 # Extract and accumulate cost information
                 _process_cost_info(full_response, costs, "LLM call")
 
-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -483,7 +377,7 @@
 
             if incorrect_tool_call:
                 logging.warning(
-                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
+                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                 )
                 # disable structured output going forward and and retry
                 sentry_helper.capture_structured_output_incorrect_tool_call()
@@ -503,8 +397,8 @@
                 hasattr(response_message, "reasoning_content")
                 and response_message.reasoning_content
             ):
-                logging.debug(
-                    f"[bold {AI_COLOR}]AI (reasoning) 🤔:[/bold {AI_COLOR}] {response_message.reasoning_content}\n"
+                logging.info(
+                    f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
                 )
 
             if not tools_to_call:
@@ -522,25 +416,29 @@
                     )
                     costs.total_cost += post_processing_cost
 
-                    self.llm.count_tokens_for_message(messages)
-                    perf_timing.end(f"- completed in {i} iterations -")
-                    metadata["usage"] = get_llm_usage(full_response)
-                    metadata["max_tokens"] = max_context_size
-                    metadata["max_output_tokens"] = maximum_output_token
+                    tokens = self.llm.count_tokens(messages=messages, tools=tools)
+
+                    add_token_count_to_metadata(
+                        tokens=tokens,
+                        full_llm_response=full_response,
+                        max_context_size=limit_result.max_context_size,
+                        maximum_output_token=limit_result.maximum_output_token,
+                        metadata=metadata,
+                    )
+
                     return LLMResult(
                         result=post_processed_response,
                         unprocessed_result=raw_response,
-                        tool_calls=tool_calls,
+                        tool_calls=all_tool_calls,
                         prompt=json.dumps(messages, indent=2),
                        messages=messages,
                         **costs.model_dump(),  # Include all cost fields
                         metadata=metadata,
                     )
 
-                perf_timing.end(f"- completed in {i} iterations -")
                 return LLMResult(
                     result=text_response,
-                    tool_calls=tool_calls,
+                    tool_calls=all_tool_calls,
                     prompt=json.dumps(messages, indent=2),
                     messages=messages,
                     **costs.model_dump(),  # Include all cost fields
@@ -552,7 +450,6 @@
             logging.info(
                 f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
            )
-            perf_timing.measure("pre-tool-calls")
             with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
                 futures = []
                 futures_tool_numbers: dict[
@@ -562,6 +459,7 @@
                 for tool_index, t in enumerate(tools_to_call, 1):
                     logging.debug(f"Tool to call: {t}")
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,
@@ -594,10 +492,13 @@
                         tool_span, tool_call_result
                     )
 
-                    tool_calls.append(tool_call_result.as_tool_result_response())
+                    tool_result_response_dict = (
+                        tool_call_result.as_tool_result_response()
+                    )
+                    tool_calls.append(tool_result_response_dict)
+                    all_tool_calls.append(tool_result_response_dict)
                     messages.append(tool_call_result.as_tool_call_message())
-
-                    perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+                    tokens = self.llm.count_tokens(messages=messages, tools=tools)
 
             # Update the tool number offset for the next iteration
             tool_number_offset += len(tools_to_call)
@@ -631,7 +532,7 @@
                 tool_number=tool_number,
                 user_approved=user_approved,
                 llm=self.llm,
-                max_token_count=get_max_token_count_for_single_tool(self.llm),
+                max_token_count=self.llm.get_max_token_count_for_single_tool(),
             )
             tool_response = tool.invoke(tool_params, context=invoke_context)
         except Exception as e:
@@ -650,6 +551,7 @@
         tool_call_id: str,
         tool_name: str,
         tool_arguments: str,
+        user_approved: bool,
         previous_tool_calls: list[dict],
         tool_number: Optional[int] = None,
     ) -> ToolCallResult:
@@ -661,17 +563,19 @@
                 f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
            )
 
-        tool_response = prevent_overly_repeated_tool_call(
-            tool_name=tool_name,
-            tool_params=tool_params,
-            tool_calls=previous_tool_calls,
-        )
+        tool_response = None
+        if not user_approved:
+            tool_response = prevent_overly_repeated_tool_call(
+                tool_name=tool_name,
+                tool_params=tool_params,
+                tool_calls=previous_tool_calls,
+            )
 
         if not tool_response:
             tool_response = self._directly_invoke_tool_call(
                 tool_name=tool_name,
                 tool_params=tool_params,
-                user_approved=False,
+                user_approved=user_approved,
                 tool_number=tool_number,
             )
 
@@ -716,6 +620,7 @@
         previous_tool_calls: list[dict],
         trace_span=None,
         tool_number=None,
+        user_approved: bool = False,
     ) -> ToolCallResult:
         if trace_span is None:
             trace_span = DummySpan()
@@ -748,6 +653,7 @@
             tool_arguments,
             previous_tool_calls=previous_tool_calls,
             tool_number=tool_number,
+            user_approved=user_approved,
         )
 
         prevent_overly_big_tool_response(
@@ -850,20 +756,6 @@
             logging.exception("Failed to run post processing", exc_info=True)
             return investigation, 0.0
 
-    @sentry_sdk.trace
-    def truncate_messages_to_fit_context(
-        self, messages: list, max_context_size: int, maximum_output_token: int
-    ) -> TruncationResult:
-        truncated_res = truncate_messages_to_fit_context(
-            messages,
-            max_context_size,
-            maximum_output_token,
-            self.llm.count_tokens_for_message,
-        )
-        if truncated_res.truncations:
-            sentry_helper.capture_tool_truncations(truncated_res.truncations)
-        return truncated_res
-
     def call_stream(
         self,
         system_prompt: str = "",
@@ -872,11 +764,19 @@
         sections: Optional[InputSectionsDataType] = None,
         msgs: Optional[list[dict]] = None,
         enable_tool_approval: bool = False,
+        tool_decisions: List[ToolApprovalDecision] | None = None,
     ):
         """
        This function DOES NOT call llm.completion(stream=true).
        This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
        """
+
+        # Process tool decisions if provided
+        if msgs and tool_decisions:
+            logging.info(f"Processing {len(tool_decisions)} tool decisions")
+            msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+            yield from events
+
         messages: list[dict] = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
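
As the docstring above notes, call_stream does not stream tokens from the model; it is a generator that yields one event per step, and when tool_decisions are supplied their results are yielded before the iteration loop begins. A runnable toy showing that generator shape (stand-in event dicts, not the real StreamMessage objects):

    # Toy generator mirroring the "yield decision results first, then iterate" shape.
    def stream(decision_events):
        yield from decision_events  # e.g. results of tools the user just approved
        for step in range(2):  # stand-in for the LLM iteration loop
            yield {"event": "ai_message", "data": f"iteration {step}"}

    for event in stream([{"event": "tool_result", "data": "approved tool output"}]):
        print(event)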
@@ -884,12 +784,10 @@
         messages.append({"role": "user", "content": user_prompt})
         if msgs:
             messages.extend(msgs)
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
         tool_calls: list[dict] = []
         tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         metadata: Dict[Any, Any] = {}
         i = 0
@@ -897,29 +795,23 @@
 
         while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")
 
             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None
 
-            total_tokens = self.llm.count_tokens_for_message(messages)  # type: ignore
-            max_context_size = self.llm.get_context_window_size()
-            maximum_output_token = self.llm.get_maximum_output_token()
-            perf_timing.measure("count tokens")
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            yield from limit_result.events
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
 
-            if (total_tokens + maximum_output_token) > max_context_size:
-                logging.warning("Token limit exceeded. Truncating tool responses.")
-                truncated_res = self.truncate_messages_to_fit_context(
-                    messages, max_context_size, maximum_output_token
-                )
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
-            else:
-                metadata["truncations"] = []
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
             logging.debug(f"sending messages={messages}\n\ntools={tools}")
             try:
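
Both loops now delegate context management to limit_input_context_window and only react to its result: the possibly rewritten messages, merged metadata, and a compaction flag that resets the repeated-tool-call check. A self-contained toy mirroring that call-site contract; it is not the implementation in holmes/core/truncation/input_context_window_limiter.py:

    from dataclasses import dataclass, field

    # Toy stand-in for the limiter's result, showing only the fields used above.
    @dataclass
    class LimitResult:
        messages: list
        metadata: dict = field(default_factory=dict)
        conversation_history_compacted: bool = False

    def limit_toy(messages, max_messages=4):
        if len(messages) <= max_messages:
            return LimitResult(messages=messages)
        compacted = [messages[0], {"role": "user", "content": "[compacted history]"}] + messages[-2:]
        return LimitResult(
            messages=compacted,
            metadata={"compacted_from": len(messages)},
            conversation_history_compacted=True,
        )

    msgs = [{"role": "system", "content": "sys"}] + [
        {"role": "user", "content": str(i)} for i in range(6)
    ]
    result = limit_toy(msgs)
    if result.conversation_history_compacted:
        tool_calls = []  # mirrors the RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION branch
    print(len(result.messages), result.metadata)  # 4 {'compacted_from': 7}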
@@ -936,7 +828,6 @@
                 # Log cost information for this iteration (no accumulation in streaming)
                 _process_cost_info(full_response, log_prefix="LLM iteration")
 
-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -958,7 +849,7 @@
 
             if incorrect_tool_call:
                 logging.warning(
-                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
+                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                 )
                 # disable structured output going forward and and retry
                 sentry_helper.capture_structured_output_incorrect_tool_call()
@@ -972,12 +863,18 @@
                 )
             )
 
+            tokens = self.llm.count_tokens(messages=messages, tools=tools)
+            add_token_count_to_metadata(
+                tokens=tokens,
+                full_llm_response=full_response,
+                max_context_size=limit_result.max_context_size,
+                maximum_output_token=limit_result.maximum_output_token,
+                metadata=metadata,
+            )
+            yield build_stream_event_token_count(metadata=metadata)
+
             tools_to_call = getattr(response_message, "tool_calls", None)
             if not tools_to_call:
-                self.llm.count_tokens_for_message(messages)
-                metadata["usage"] = get_llm_usage(full_response)
-                metadata["max_tokens"] = max_context_size
-                metadata["max_output_tokens"] = maximum_output_token
                 yield StreamMessage(
                     event=StreamEvents.ANSWER_END,
                     data={
@@ -993,11 +890,13 @@
             if reasoning or message:
                 yield StreamMessage(
                     event=StreamEvents.AI_MESSAGE,
-                    data={"content": message, "reasoning": reasoning},
+                    data={
+                        "content": message,
+                        "reasoning": reasoning,
+                        "metadata": metadata,
+                    },
                 )
 
-            perf_timing.measure("pre-tool-calls")
-
             # Check if any tools require approval first
             pending_approvals = []
             approval_required_tools = []
@@ -1006,6 +905,7 @@
                 futures = []
                 for tool_index, t in enumerate(tools_to_call, 1):  # type: ignore
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,  # type: ignore
@@ -1069,23 +969,11 @@
             # If we have approval required tools, end the stream with pending approvals
             if pending_approvals:
                 # Add assistant message with pending tool calls
-                assistant_msg = {
-                    "role": "assistant",
-                    "content": response_message.content,
-                    "tool_calls": [
-                        {
-                            "id": result.tool_call_id,
-                            "type": "function",
-                            "function": {
-                                "name": result.tool_name,
-                                "arguments": json.dumps(result.result.params or {}),
-                            },
-                        }
-                        for result in approval_required_tools
-                    ],
-                    "pending_approval": True,
-                }
-                messages.append(assistant_msg)
+                for result in approval_required_tools:
+                    tool_call = self.find_assistant_tool_call_request(
+                        tool_call_id=result.tool_call_id, messages=messages
+                    )
+                    tool_call["pending_approval"] = True
 
                 # End stream with approvals required
                 yield StreamMessage(
@@ -1108,6 +996,21 @@
                 f"Too many LLM calls - exceeded max_steps: {i}/{self.max_steps}"
             )
 
+    def find_assistant_tool_call_request(
+        self, tool_call_id: str, messages: list[dict[str, Any]]
+    ) -> dict[str, Any]:
+        for message in messages:
+            if message.get("role") == "assistant":
+                for tool_call in message.get("tool_calls", []):
+                    if tool_call.get("id") == tool_call_id:
+                        return tool_call
+
+        # Should not happen unless there is a bug.
+        # If we are here
+        raise Exception(
+            f"Failed to find assistant request for a tool_call in conversation history. tool_call_id={tool_call_id}"
+        )
+
 
 # TODO: consider getting rid of this entirely and moving templating into the cmds in holmes_cli.py
 class IssueInvestigator(ToolCallingLLM):
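
Taken together, the approval changes move the pending state onto the tool call itself: when a tool needs approval, the stream ends after find_assistant_tool_call_request marks the matching tool_call with pending_approval, and the next request resolves it by tool_call_id through process_tool_decisions. A sketch of the data shape implied by these hunks (field names from the diff, values are placeholders):

    # Assistant message stored in the conversation history after the stream ended.
    stored_history = [
        {
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": "call_42",
                    "type": "function",
                    "function": {"name": "example_tool", "arguments": "{}"},
                    "pending_approval": True,  # set while awaiting the user's decision
                }
            ],
        }
    ]

    # The follow-up request carries one decision per pending tool_call_id
    # (real decisions are ToolApprovalDecision objects, shown here as a plain dict).
    tool_decisions = [{"tool_call_id": "call_42", "approved": False}]

    # process_tool_decisions matches by id, drops the flag, and inserts either the
    # tool's output or a "denied by the user" error right after the assistant message.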