holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/core/tool_calling_llm.py

@@ -2,8 +2,7 @@ import concurrent.futures
  import json
  import logging
  import textwrap
- from typing import Dict, List, Optional, Type, Union, Callable
-
+ from typing import Any, Callable, Dict, List, Optional, Type, Union

  import sentry_sdk
  from openai import BadRequestError
@@ -14,11 +13,10 @@ from pydantic import BaseModel, Field
  from rich.console import Console

  from holmes.common.env_vars import (
- TEMPERATURE,
- MAX_OUTPUT_TOKEN_RESERVATION,
  LOG_LLM_USAGE_RESPONSE,
+ RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
+ TEMPERATURE,
  )
-
  from holmes.core.investigation_structured_output import (
  DEFAULT_SECTIONS,
  REQUEST_STRUCTURED_OUTPUT_FROM_LLM,
@@ -28,21 +26,42 @@ from holmes.core.investigation_structured_output import (
  )
  from holmes.core.issue import Issue
  from holmes.core.llm import LLM
- from holmes.core.performance_timing import PerformanceTiming
- from holmes.core.resource_instruction import ResourceInstructions
+ from holmes.core.models import (
+ PendingToolApproval,
+ ToolApprovalDecision,
+ ToolCallResult,
+ )
+ from holmes.core.prompt import generate_user_prompt
  from holmes.core.runbooks import RunbookManager
  from holmes.core.safeguards import prevent_overly_repeated_tool_call
- from holmes.core.tools import StructuredToolResult, ToolResultStatus
- from holmes.plugins.prompts import load_and_render_prompt
- from holmes.utils.global_instructions import (
- Instructions,
- add_global_instructions_to_user_prompt,
+ from holmes.core.tools import (
+ StructuredToolResult,
+ StructuredToolResultStatus,
+ ToolInvokeContext,
+ )
+ from holmes.core.tools_utils.tool_context_window_limiter import (
+ prevent_overly_big_tool_response,
  )
- from holmes.utils.tags import format_tags_in_string, parse_messages_tags
  from holmes.core.tools_utils.tool_executor import ToolExecutor
  from holmes.core.tracing import DummySpan
+ from holmes.core.truncation.input_context_window_limiter import (
+ limit_input_context_window,
+ )
+ from holmes.plugins.prompts import load_and_render_prompt
+ from holmes.plugins.runbooks import RunbookCatalog
+ from holmes.utils import sentry_helper
  from holmes.utils.colors import AI_COLOR
- from holmes.utils.stream import StreamEvents, StreamMessage
+ from holmes.utils.global_instructions import (
+ Instructions,
+ generate_runbooks_args,
+ )
+ from holmes.utils.stream import (
+ StreamEvents,
+ StreamMessage,
+ add_token_count_to_metadata,
+ build_stream_event_token_count,
+ )
+ from holmes.utils.tags import parse_messages_tags

  # Create a named logger for cost tracking
  cost_logger = logging.getLogger("holmes.costs")
@@ -119,156 +138,16 @@ def _process_cost_info(
  logging.debug(f"Could not extract cost information: {e}")


- def format_tool_result_data(tool_result: StructuredToolResult) -> str:
- tool_response = tool_result.data
- if isinstance(tool_result.data, str):
- tool_response = tool_result.data
- else:
- try:
- if isinstance(tool_result.data, BaseModel):
- tool_response = tool_result.data.model_dump_json(indent=2)
- else:
- tool_response = json.dumps(tool_result.data, indent=2)
- except Exception:
- tool_response = str(tool_result.data)
- if tool_result.status == ToolResultStatus.ERROR:
- tool_response = f"{tool_result.error or 'Tool execution failed'}:\n\n{tool_result.data or ''}".strip()
- return tool_response
-
-
- # TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
- # However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
- # We should fix this in the future
- # TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
- # token truncation and not character truncation
- def truncate_messages_to_fit_context(
- messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
- ) -> list:
- """
- Helper function to truncate tool messages to fit within context limits.
-
- Args:
- messages: List of message dictionaries with roles and content
- max_context_size: Maximum context window size for the model
- maximum_output_token: Maximum tokens reserved for model output
- count_tokens_fn: Function to count tokens for a list of messages
-
- Returns:
- Modified list of messages with truncated tool responses
-
- Raises:
- Exception: If non-tool messages exceed available context space
- """
- messages_except_tools = [
- message for message in messages if message["role"] != "tool"
- ]
- message_size_without_tools = count_tokens_fn(messages_except_tools)
-
- tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
- reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
- if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
- logging.error(
- f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
- )
- raise Exception(
- f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
- )
-
- if len(tool_call_messages) == 0:
- return messages
-
- available_space = (
- max_context_size - message_size_without_tools - maximum_output_token
- )
- remaining_space = available_space
- tool_call_messages.sort(key=lambda x: len(x["content"]))
-
- # Allocate space starting with small tools and going to larger tools, while maintaining fairness
- # Small tools can often get exactly what they need, while larger tools may need to be truncated
- # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
- for i, msg in enumerate(tool_call_messages):
- remaining_tools = len(tool_call_messages) - i
- max_allocation = remaining_space // remaining_tools
- needed_space = len(msg["content"])
- allocated_space = min(needed_space, max_allocation)
-
- if needed_space > allocated_space:
- truncation_notice = "\n\n[TRUNCATED]"
- # Ensure the indicator fits in the allocated space
- if allocated_space > len(truncation_notice):
- msg["content"] = (
- msg["content"][: allocated_space - len(truncation_notice)]
- + truncation_notice
- )
- logging.info(
- f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space-len(truncation_notice)} tokens"
- )
- else:
- msg["content"] = truncation_notice[:allocated_space]
- logging.info(
- f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space} tokens"
- )
- msg.pop("token_count", None) # Remove token_count if present
-
- remaining_space -= allocated_space
- return messages
-
-
- class ToolCallResult(BaseModel):
- tool_call_id: str
- tool_name: str
- description: str
- result: StructuredToolResult
- size: Optional[int] = None
-
- def as_tool_call_message(self):
- content = format_tool_result_data(self.result)
- if self.result.params:
- content = (
- f"Params used for the tool call: {json.dumps(self.result.params)}. The tool call output follows on the next line.\n"
- + content
- )
- return {
- "tool_call_id": self.tool_call_id,
- "role": "tool",
- "name": self.tool_name,
- "content": content,
- }
-
- def as_tool_result_response(self):
- result_dump = self.result.model_dump()
- result_dump["data"] = self.result.get_stringified_data()
-
- return {
- "tool_call_id": self.tool_call_id,
- "tool_name": self.tool_name,
- "description": self.description,
- "role": "tool",
- "result": result_dump,
- }
-
- def as_streaming_tool_result_response(self):
- result_dump = self.result.model_dump()
- result_dump["data"] = self.result.get_stringified_data()
-
- return {
- "tool_call_id": self.tool_call_id,
- "role": "tool",
- "description": self.description,
- "name": self.tool_name,
- "result": result_dump,
- }
-
-
  class LLMResult(LLMCosts):
  tool_calls: Optional[List[ToolCallResult]] = None
+ num_llm_calls: Optional[int] = None # Number of LLM API calls (turns)
  result: Optional[str] = None
  unprocessed_result: Optional[str] = None
  instructions: List[str] = Field(default_factory=list)
  # TODO: clean up these two
  prompt: Optional[str] = None
  messages: Optional[List[dict]] = None
+ metadata: Optional[Dict[Any, Any]] = None

  def get_tool_usage_summary(self):
  return "AI used info from issue and " + ",".join(
@@ -276,6 +155,12 @@ class LLMResult(LLMCosts):
  )


+ class ToolCallWithDecision(BaseModel):
+ message_index: int
+ tool_call: ChatCompletionMessageToolCall
+ decision: Optional[ToolApprovalDecision]
+
+
  class ToolCallingLLM:
  llm: LLM

@@ -290,11 +175,99 @@ class ToolCallingLLM:
  Callable[[StructuredToolResult], tuple[bool, Optional[str]]]
  ] = None

+ def process_tool_decisions(
+ self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
+ ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
+ """
+ Process tool approval decisions and execute approved tools.
+
+ Args:
+ messages: Current conversation messages
+ tool_decisions: List of ToolApprovalDecision objects
+
+ Returns:
+ Updated messages list with tool execution results
+ """
+ events: list[StreamMessage] = []
+ if not tool_decisions:
+ return messages, events
+
+ # Create decision lookup
+ decisions_by_tool_call_id = {
+ decision.tool_call_id: decision for decision in tool_decisions
+ }
+
+ pending_tool_calls: list[ToolCallWithDecision] = []
+
+ for i in reversed(range(len(messages))):
+ msg = messages[i]
+ if msg.get("role") == "assistant" and msg.get("tool_calls"):
+ message_tool_calls = msg.get("tool_calls", [])
+ for tool_call in message_tool_calls:
+ decision = decisions_by_tool_call_id.get(tool_call.get("id"), None)
+ if tool_call.get("pending_approval"):
+ del tool_call[
+ "pending_approval"
+ ] # Cleanup so that a pending approval is not tagged on message in a future response
+ pending_tool_calls.append(
+ ToolCallWithDecision(
+ tool_call=ChatCompletionMessageToolCall(**tool_call),
+ decision=decision,
+ message_index=i,
+ )
+ )
+
+ if not pending_tool_calls:
+ error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
+ logging.error(error_message)
+ raise Exception(error_message)
+ for tool_call_with_decision in pending_tool_calls:
+ tool_call_message: dict
+ tool_call = tool_call_with_decision.tool_call
+ decision = tool_call_with_decision.decision
+ tool_result: Optional[ToolCallResult] = None
+ if decision and decision.approved:
+ tool_result = self._invoke_llm_tool_call(
+ tool_to_call=tool_call,
+ previous_tool_calls=[],
+ trace_span=DummySpan(), # TODO: replace with proper span
+ tool_number=None,
+ user_approved=True,
+ )
+ else:
+ # Tool was rejected or no decision found, add rejection message
+ tool_result = ToolCallResult(
+ tool_call_id=tool_call.id,
+ tool_name=tool_call.function.name,
+ description=tool_call.function.name,
+ result=StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Tool execution was denied by the user.",
+ ),
+ )
+
+ events.append(
+ StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_result.as_streaming_tool_result_response(),
+ )
+ )
+
+ tool_call_message = tool_result.as_tool_call_message()
+
+ # It is expected that the tool call result directly follows the tool call request from the LLM
+ # The API call may contain a user ask which is appended to the messages so we can't just append
+ # tool call results; they need to be inserted right after the llm's message requesting tool calls
+ messages.insert(
+ tool_call_with_decision.message_index + 1, tool_call_message
+ )
+
+ return messages, events
+
  def prompt_call(
  self,
  system_prompt: str,
  user_prompt: str,
- post_process_prompt: Optional[str] = None,
  response_format: Optional[Union[dict, Type[BaseModel]]] = None,
  sections: Optional[InputSectionsDataType] = None,
  trace_span=DummySpan(),
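A minimal sketch of how a client might resume a conversation after the user decides on pending tool calls, based on the process_tool_decisions flow added above. The ToolApprovalDecision fields (tool_call_id, approved) are inferred from how the method uses them; `llm` and `saved_messages` are assumed stand-ins for a configured ToolCallingLLM and the messages returned alongside an APPROVAL_REQUIRED event.

    # Hedged sketch, not part of the package: resume after approvals.
    from holmes.core.models import ToolApprovalDecision

    decisions = [
        ToolApprovalDecision(tool_call_id="call_123", approved=True),   # run this tool
        ToolApprovalDecision(tool_call_id="call_456", approved=False),  # reject this one
    ]

    for event in llm.call_stream(
        system_prompt=None,            # assumed optional here
        user_prompt=None,
        msgs=saved_messages,           # conversation returned with APPROVAL_REQUIRED
        enable_tool_approval=True,
        tool_decisions=decisions,
    ):
        print(event.event, event.data)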
@@ -305,8 +278,7 @@ class ToolCallingLLM:
  ]
  return self.call(
  messages,
- post_process_prompt,
- response_format,
+ response_format=response_format,
  user_prompt=user_prompt,
  sections=sections,
  trace_span=trace_span,
@@ -315,55 +287,52 @@
  def messages_call(
  self,
  messages: List[Dict[str, str]],
- post_process_prompt: Optional[str] = None,
  response_format: Optional[Union[dict, Type[BaseModel]]] = None,
  trace_span=DummySpan(),
  ) -> LLMResult:
  return self.call(
- messages, post_process_prompt, response_format, trace_span=trace_span
+ messages, response_format=response_format, trace_span=trace_span
  )

  @sentry_sdk.trace
  def call( # type: ignore
  self,
  messages: List[Dict[str, str]],
- post_process_prompt: Optional[str] = None,
  response_format: Optional[Union[dict, Type[BaseModel]]] = None,
  user_prompt: Optional[str] = None,
  sections: Optional[InputSectionsDataType] = None,
  trace_span=DummySpan(),
  tool_number_offset: int = 0,
  ) -> LLMResult:
- perf_timing = PerformanceTiming("tool_calling_llm.call")
- tool_calls = [] # type: ignore
+ tool_calls: list[
+ dict
+ ] = [] # Used for preventing repeated tool calls. potentially reset after compaction
+ all_tool_calls = [] # type: ignore
  costs = LLMCosts()
-
  tools = self.tool_executor.get_all_tools_openai_format(
  target_model=self.llm.model
  )
- perf_timing.measure("get_all_tools_openai_format")
  max_steps = self.max_steps
  i = 0
-
+ metadata: Dict[Any, Any] = {}
  while i < max_steps:
  i += 1
- perf_timing.measure(f"start iteration {i}")
  logging.debug(f"running iteration {i}")
  # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
  tools = None if i == max_steps else tools
  tool_choice = "auto" if tools else None

- total_tokens = self.llm.count_tokens_for_message(messages)
- max_context_size = self.llm.get_context_window_size()
- maximum_output_token = self.llm.get_maximum_output_token()
- perf_timing.measure("count tokens")
+ limit_result = limit_input_context_window(
+ llm=self.llm, messages=messages, tools=tools
+ )
+ messages = limit_result.messages
+ metadata = metadata | limit_result.metadata

- if (total_tokens + maximum_output_token) > max_context_size:
- logging.warning("Token limit exceeded. Truncating tool responses.")
- messages = self.truncate_messages_to_fit_context(
- messages, max_context_size, maximum_output_token
- )
- perf_timing.measure("truncate_messages_to_fit_context")
+ if (
+ limit_result.conversation_history_compacted
+ and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+ ):
+ tool_calls = []

  logging.debug(f"sending messages={messages}\n\ntools={tools}")

@@ -381,7 +350,6 @@
  # Extract and accumulate cost information
  _process_cost_info(full_response, costs, "LLM call")

- perf_timing.measure("llm.completion")
  # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
  except BadRequestError as e:
  if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -405,9 +373,10 @@

  if incorrect_tool_call:
  logging.warning(
- "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
+ "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
  )
  # disable structured output going forward and and retry
+ sentry_helper.capture_structured_output_incorrect_tool_call()
  response_format = None
  max_steps = max_steps + 1
  continue
@@ -424,42 +393,29 @@
  hasattr(response_message, "reasoning_content")
  and response_message.reasoning_content
  ):
- logging.debug(
- f"[bold {AI_COLOR}]AI (reasoning) 🤔:[/bold {AI_COLOR}] {response_message.reasoning_content}\n"
+ logging.info(
+ f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
  )

  if not tools_to_call:
- # For chatty models post process and summarize the result
- # this only works for calls where user prompt is explicitly passed through
- if post_process_prompt and user_prompt:
- logging.info("Running post processing on investigation.")
- raw_response = text_response
- post_processed_response, post_processing_cost = (
- self._post_processing_call(
- prompt=user_prompt,
- investigation=raw_response,
- user_prompt=post_process_prompt,
- )
- )
- costs.total_cost += post_processing_cost
-
- perf_timing.end(f"- completed in {i} iterations -")
- return LLMResult(
- result=post_processed_response,
- unprocessed_result=raw_response,
- tool_calls=tool_calls,
- prompt=json.dumps(messages, indent=2),
- messages=messages,
- **costs.model_dump(), # Include all cost fields
- )
+ tokens = self.llm.count_tokens(messages=messages, tools=tools)
+
+ add_token_count_to_metadata(
+ tokens=tokens,
+ full_llm_response=full_response,
+ max_context_size=limit_result.max_context_size,
+ maximum_output_token=limit_result.maximum_output_token,
+ metadata=metadata,
+ )

- perf_timing.end(f"- completed in {i} iterations -")
  return LLMResult(
  result=text_response,
- tool_calls=tool_calls,
+ tool_calls=all_tool_calls,
+ num_llm_calls=i,
  prompt=json.dumps(messages, indent=2),
  messages=messages,
  **costs.model_dump(), # Include all cost fields
+ metadata=metadata,
  )

  if text_response and text_response.strip():
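The loop above replaces the old manual truncation with limit_input_context_window. A hedged sketch of the result object it appears to return, with field names inferred only from the attribute accesses in this diff (the real definition lives in the new holmes/core/truncation/input_context_window_limiter.py, which is not shown here and may differ):

    # Inferred shape only; class name and defaults are assumptions.
    from dataclasses import dataclass, field
    from typing import Any, Dict, List

    @dataclass
    class InputContextWindowLimitResult:  # hypothetical name
        messages: List[dict]                                    # possibly truncated/compacted history
        metadata: Dict[Any, Any] = field(default_factory=dict)  # merged into LLMResult.metadata
        events: List[Any] = field(default_factory=list)         # StreamMessages yielded by call_stream()
        conversation_history_compacted: bool = False            # triggers the repeated-tool-call reset
        max_context_size: int = 0                                # fed to add_token_count_to_metadata
        maximum_output_token: int = 0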
@@ -467,7 +423,6 @@
  logging.info(
  f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
  )
- perf_timing.measure("pre-tool-calls")
  with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
  futures = []
  futures_tool_numbers: dict[
@@ -477,6 +432,7 @@
  for tool_index, t in enumerate(tools_to_call, 1):
  logging.debug(f"Tool to call: {t}")
  tool_number = tool_number_offset + tool_index
+
  future = executor.submit(
  self._invoke_llm_tool_call,
  tool_to_call=t,
@@ -495,14 +451,24 @@
  if future in futures_tool_numbers
  else None
  )
- tool_call_result = self.handle_tool_call_approval(
- tool_call_result=tool_call_result, tool_number=tool_number
- )

- tool_calls.append(tool_call_result.as_tool_result_response())
- messages.append(tool_call_result.as_tool_call_message())
+ if (
+ tool_call_result.result.status
+ == StructuredToolResultStatus.APPROVAL_REQUIRED
+ ):
+ tool_call_result = self._handle_tool_call_approval(
+ tool_call_result=tool_call_result,
+ tool_number=tool_number,
+ trace_span=trace_span,
+ )

- perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+ tool_result_response_dict = (
+ tool_call_result.as_tool_result_response()
+ )
+ tool_calls.append(tool_result_response_dict)
+ all_tool_calls.append(tool_result_response_dict)
+ messages.append(tool_call_result.as_tool_call_message())
+ tokens = self.llm.count_tokens(messages=messages, tools=tools)

  # Update the tool number offset for the next iteration
  tool_number_offset += len(tools_to_call)
@@ -513,91 +479,55 @@

  raise Exception(f"Too many LLM calls - exceeded max_steps: {i}/{max_steps}")

- def _directly_invoke_tool(
+ def _directly_invoke_tool_call(
  self,
  tool_name: str,
  tool_params: dict,
  user_approved: bool,
- trace_span=DummySpan(),
+ tool_call_id: str,
  tool_number: Optional[int] = None,
  ) -> StructuredToolResult:
- tool_span = trace_span.start_span(name=tool_name, type="tool")
  tool = self.tool_executor.get_tool_by_name(tool_name)
- tool_response = None
+ if not tool:
+ logging.warning(
+ f"Skipping tool execution for {tool_name}: args: {tool_params}"
+ )
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=f"Failed to find tool {tool_name}",
+ params=tool_params,
+ )
+
  try:
- if (not tool) or (tool_params is None):
- logging.warning(
- f"Skipping tool execution for {tool_name}: args: {tool_params}"
- )
- tool_response = StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error=f"Failed to find tool {tool_name}",
- params=tool_params,
- )
- else:
- tool_response = tool.invoke(
- tool_params, tool_number=tool_number, user_approved=user_approved
- )
+ invoke_context = ToolInvokeContext(
+ tool_number=tool_number,
+ user_approved=user_approved,
+ llm=self.llm,
+ max_token_count=self.llm.get_max_token_count_for_single_tool(),
+ tool_name=tool_name,
+ tool_call_id=tool_call_id,
+ )
+ tool_response = tool.invoke(tool_params, context=invoke_context)
  except Exception as e:
  logging.error(
  f"Tool call to {tool_name} failed with an Exception", exc_info=True
  )
  tool_response = StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Tool call failed: {e}",
  params=tool_params,
  )
-
- # Log error to trace span
- tool_span.log(
- input=tool_params, output=str(e), metadata={"status": "ERROR"}
- )
-
- tool_span.log(
- input=tool_params,
- output=tool_response.data,
- metadata={
- "status": tool_response.status.value,
- "error": tool_response.error,
- "description": tool.get_parameterized_one_liner(tool_params)
- if tool
- else "",
- "structured_tool_result": tool_response,
- },
- )
- tool_span.end()
-
  return tool_response

- def _invoke_llm_tool_call(
+ def _get_tool_call_result(
  self,
- tool_to_call: ChatCompletionMessageToolCall,
+ tool_call_id: str,
+ tool_name: str,
+ tool_arguments: str,
+ user_approved: bool,
  previous_tool_calls: list[dict],
- trace_span=DummySpan(),
- tool_number=None,
+ tool_number: Optional[int] = None,
  ) -> ToolCallResult:
- # Handle the union type - ChatCompletionMessageToolCall can be either
- # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
- # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
- # We use hasattr to check for the 'function' attribute as it's more flexible
- # and doesn't require importing the specific type.
- if hasattr(tool_to_call, "function"):
- tool_name = tool_to_call.function.name
- tool_arguments = tool_to_call.function.arguments
- else:
- # This is a custom tool call - we don't support these currently
- logging.error(f"Unsupported custom tool call: {tool_to_call}")
- return ToolCallResult(
- tool_call_id=tool_to_call.id,
- tool_name="unknown",
- description="NA",
- result=StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error="Custom tool calls are not supported",
- params=None,
- ),
- )
-
  tool_params = {}
  try:
  tool_params = json.loads(tool_arguments)
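The hunk above shows tools now being invoked with a single ToolInvokeContext rather than loose keyword arguments. A hedged sketch of a tool consuming that context; the EchoTool class is hypothetical and the Tool interface details are assumptions based only on the constructor call above:

    # Illustrative only; not part of the package.
    from holmes.core.tools import (
        StructuredToolResult,
        StructuredToolResultStatus,
        ToolInvokeContext,
    )

    class EchoTool:  # hypothetical tool, for illustration
        def invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
            # context is assumed to expose llm, tool_name, tool_call_id, tool_number,
            # user_approved and max_token_count, per the ToolInvokeContext(...) call above
            return StructuredToolResult(
                status=StructuredToolResultStatus.SUCCESS,
                data={"echo": params, "tool_call_id": context.tool_call_id},
                params=params,
            )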
@@ -606,21 +536,21 @@
  f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
  )

- tool_call_id = tool_to_call.id
-
- tool_response = prevent_overly_repeated_tool_call(
- tool_name=tool_name,
- tool_params=tool_params,
- tool_calls=previous_tool_calls,
- )
+ tool_response = None
+ if not user_approved:
+ tool_response = prevent_overly_repeated_tool_call(
+ tool_name=tool_name,
+ tool_params=tool_params,
+ tool_calls=previous_tool_calls,
+ )

  if not tool_response:
- tool_response = self._directly_invoke_tool(
+ tool_response = self._directly_invoke_tool_call(
  tool_name=tool_name,
  tool_params=tool_params,
- user_approved=False,
- trace_span=trace_span,
+ user_approved=user_approved,
  tool_number=tool_number,
+ tool_call_id=tool_call_id,
  )

  if not isinstance(tool_response, StructuredToolResult):
@@ -629,124 +559,165 @@
  f"Tool {tool_name} return type is not StructuredToolResult. Nesting the tool result into StructuredToolResult..."
  )
  tool_response = StructuredToolResult(
- status=ToolResultStatus.SUCCESS,
+ status=StructuredToolResultStatus.SUCCESS,
  data=tool_response,
  params=tool_params,
  )

  tool = self.tool_executor.get_tool_by_name(tool_name)
+
  return ToolCallResult(
  tool_call_id=tool_call_id,
  tool_name=tool_name,
- description=tool.get_parameterized_one_liner(tool_params) if tool else "",
+ description=str(tool.get_parameterized_one_liner(tool_params))
+ if tool
+ else "",
  result=tool_response,
  )

- def handle_tool_call_approval(
- self, tool_call_result: ToolCallResult, tool_number: Optional[int]
+ @staticmethod
+ def _log_tool_call_result(
+ tool_span,
+ tool_call_result: ToolCallResult,
+ approval_possible=True,
+ original_token_count=None,
+ ):
+ tool_span.set_attributes(name=tool_call_result.tool_name)
+ status = tool_call_result.result.status
+
+ if (
+ status == StructuredToolResultStatus.APPROVAL_REQUIRED
+ and not approval_possible
+ ):
+ status = StructuredToolResultStatus.ERROR
+
+ if status == StructuredToolResultStatus.ERROR:
+ error = (
+ tool_call_result.result.error
+ if tool_call_result.result.error
+ else "Unspecified error"
+ )
+ else:
+ error = None
+ tool_span.log(
+ input=tool_call_result.result.params,
+ output=tool_call_result.result.data,
+ error=error,
+ metadata={
+ "status": status,
+ "description": tool_call_result.description,
+ "return_code": tool_call_result.result.return_code,
+ "error": tool_call_result.result.error,
+ "original_token_count": original_token_count,
+ },
+ )
+
+ def _invoke_llm_tool_call(
+ self,
+ tool_to_call: ChatCompletionMessageToolCall,
+ previous_tool_calls: list[dict],
+ trace_span=None,
+ tool_number=None,
+ user_approved: bool = False,
+ ) -> ToolCallResult:
+ if trace_span is None:
+ trace_span = DummySpan()
+ with trace_span.start_span(type="tool") as tool_span:
+ if not hasattr(tool_to_call, "function"):
+ # Handle the union type - ChatCompletionMessageToolCall can be either
+ # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
+ # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
+ # We use hasattr to check for the 'function' attribute as it's more flexible
+ # and doesn't require importing the specific type.
+ tool_name = "Unknown_Custom_Tool"
+ logging.error(f"Unsupported custom tool call: {tool_to_call}")
+ tool_call_result = ToolCallResult(
+ tool_call_id=tool_to_call.id,
+ tool_name=tool_name,
+ description="NA",
+ result=StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Custom tool calls are not supported",
+ params=None,
+ ),
+ )
+ else:
+ tool_name = tool_to_call.function.name
+ tool_arguments = tool_to_call.function.arguments
+ tool_id = tool_to_call.id
+ tool_call_result = self._get_tool_call_result(
+ tool_id,
+ tool_name,
+ tool_arguments,
+ previous_tool_calls=previous_tool_calls,
+ tool_number=tool_number,
+ user_approved=user_approved,
+ )
+
+ original_token_count = prevent_overly_big_tool_response(
+ tool_call_result=tool_call_result, llm=self.llm
+ )
+
+ ToolCallingLLM._log_tool_call_result(
+ tool_span,
+ tool_call_result,
+ self.approval_callback is not None,
+ original_token_count,
+ )
+ return tool_call_result
+
+ def _handle_tool_call_approval(
+ self,
+ tool_call_result: ToolCallResult,
+ tool_number: Optional[int],
+ trace_span: Any,
  ) -> ToolCallResult:
  """
  Handle approval for a single tool call if required.

  Args:
  tool_call_result: A single tool call result that may require approval
+ tool_number: The tool call number

  Returns:
  Updated tool call result with approved/denied status
  """

- if tool_call_result.result.status != ToolResultStatus.APPROVAL_REQUIRED:
- return tool_call_result
-
  # If no approval callback, convert to ERROR because it is assumed the client may not be able to handle approvals
  if not self.approval_callback:
- tool_call_result.result.status = ToolResultStatus.ERROR
+ tool_call_result.result.status = StructuredToolResultStatus.ERROR
  return tool_call_result

  # Get approval from user
- approved, feedback = self.approval_callback(tool_call_result.result)
-
- if approved:
- logging.debug(
- f"User approved command: {tool_call_result.result.invocation}"
- )
-
- new_response = self._directly_invoke_tool(
- tool_name=tool_call_result.tool_name,
- tool_params=tool_call_result.result.params or {},
- user_approved=True,
- trace_span=DummySpan(),
- tool_number=tool_number,
- )
- tool_call_result.result = new_response
- else:
- # User denied - update to error
- feedback_text = f" User feedback: {feedback}" if feedback else ""
- tool_call_result.result.status = ToolResultStatus.ERROR
- tool_call_result.result.error = (
- f"User denied command execution.{feedback_text}"
- )
-
- return tool_call_result
-
- @staticmethod
- def __load_post_processing_user_prompt(
- input_prompt, investigation, user_prompt: Optional[str] = None
- ) -> str:
- if not user_prompt:
- user_prompt = "builtin://generic_post_processing.jinja2"
- return load_and_render_prompt(
- user_prompt, {"investigation": investigation, "prompt": input_prompt}
- )
-
- def _post_processing_call(
- self,
- prompt,
- investigation,
- user_prompt: Optional[str] = None,
- system_prompt: str = "You are an AI assistant summarizing Kubernetes issues.",
- ) -> tuple[Optional[str], float]:
- try:
- user_prompt = ToolCallingLLM.__load_post_processing_user_prompt(
- prompt, investigation, user_prompt
- )
-
- logging.debug(f'Post processing prompt:\n"""\n{user_prompt}\n"""')
- messages = [
- {
- "role": "system",
- "content": system_prompt,
- },
- {
- "role": "user",
- "content": format_tags_in_string(user_prompt),
- },
- ]
- full_response = self.llm.completion(messages=messages, temperature=0)
- logging.debug(f"Post processing response {full_response}")
-
- # Extract and log cost information for post-processing
- post_processing_cost = _extract_cost_from_response(full_response)
- if post_processing_cost > 0:
- cost_logger.debug(
- f"Post-processing LLM cost: ${post_processing_cost:.6f}"
+ with trace_span.start_span(
+ type="task", name=f"Ask approval for {tool_call_result.tool_name}"
+ ):
+ approved, feedback = self.approval_callback(tool_call_result.result)
+
+ # Note - Tool calls are currently logged twice, once when returning APPROVAL_REQUIRED and once here
+ with trace_span.start_span(type="tool") as tool_span:
+ if approved:
+ logging.debug(
+ f"User approved command: {tool_call_result.result.invocation}"
  )
+ new_response = self._directly_invoke_tool_call(
+ tool_name=tool_call_result.tool_name,
+ tool_params=tool_call_result.result.params or {},
+ user_approved=True,
+ tool_number=tool_number,
+ tool_call_id=tool_call_result.tool_call_id,
+ )
+ tool_call_result.result = new_response
+ else:
+ # User denied - update to error
+ feedback_text = f" User feedback: {feedback}" if feedback else ""
+ tool_call_result.result.status = StructuredToolResultStatus.ERROR
+ tool_call_result.result.error = (
+ f"User denied command execution.{feedback_text}"
+ )
+ ToolCallingLLM._log_tool_call_result(tool_span, tool_call_result)

- return full_response.choices[0].message.content, post_processing_cost # type: ignore
- except Exception:
- logging.exception("Failed to run post processing", exc_info=True)
- return investigation, 0.0
-
- @sentry_sdk.trace
- def truncate_messages_to_fit_context(
- self, messages: list, max_context_size: int, maximum_output_token: int
- ) -> list:
- return truncate_messages_to_fit_context(
- messages,
- max_context_size,
- maximum_output_token,
- self.llm.count_tokens_for_message,
- )
+ return tool_call_result

  def call_stream(
  self,
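_handle_tool_call_approval above relies on self.approval_callback, whose signature Callable[[StructuredToolResult], tuple[bool, Optional[str]]] appears earlier in this file. A hedged sketch of an interactive callback a host application might register; the CLI prompt and the attribute assignment are illustrative assumptions:

    # Minimal sketch, not part of the package.
    from typing import Optional, Tuple
    from holmes.core.tools import StructuredToolResult

    def cli_approval_callback(result: StructuredToolResult) -> Tuple[bool, Optional[str]]:
        # result.invocation is the command awaiting approval (see _handle_tool_call_approval)
        answer = input(f"Run `{result.invocation}`? [y/N] ").strip().lower()
        if answer == "y":
            return True, None
        return False, "rejected from the CLI"

    # llm.approval_callback = cli_approval_callback  # assumed wiring done by the host app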
@@ -755,47 +726,55 @@
  response_format: Optional[Union[dict, Type[BaseModel]]] = None,
  sections: Optional[InputSectionsDataType] = None,
  msgs: Optional[list[dict]] = None,
+ enable_tool_approval: bool = False,
+ tool_decisions: List[ToolApprovalDecision] | None = None,
  ):
  """
  This function DOES NOT call llm.completion(stream=true).
  This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
  """
- messages = []
+
+ # Process tool decisions if provided
+ if msgs and tool_decisions:
+ logging.info(f"Processing {len(tool_decisions)} tool decisions")
+ msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+ yield from events
+
+ messages: list[dict] = []
  if system_prompt:
  messages.append({"role": "system", "content": system_prompt})
  if user_prompt:
  messages.append({"role": "user", "content": user_prompt})
  if msgs:
  messages.extend(msgs)
- perf_timing = PerformanceTiming("tool_calling_llm.call")
  tool_calls: list[dict] = []
  tools = self.tool_executor.get_all_tools_openai_format(
  target_model=self.llm.model
  )
- perf_timing.measure("get_all_tools_openai_format")
  max_steps = self.max_steps
+ metadata: Dict[Any, Any] = {}
  i = 0
  tool_number_offset = 0

  while i < max_steps:
  i += 1
- perf_timing.measure(f"start iteration {i}")
  logging.debug(f"running iteration {i}")

  tools = None if i == max_steps else tools
  tool_choice = "auto" if tools else None

- total_tokens = self.llm.count_tokens_for_message(messages) # type: ignore
- max_context_size = self.llm.get_context_window_size()
- maximum_output_token = self.llm.get_maximum_output_token()
- perf_timing.measure("count tokens")
+ limit_result = limit_input_context_window(
+ llm=self.llm, messages=messages, tools=tools
+ )
+ yield from limit_result.events
+ messages = limit_result.messages
+ metadata = metadata | limit_result.metadata

- if (total_tokens + maximum_output_token) > max_context_size:
- logging.warning("Token limit exceeded. Truncating tool responses.")
- messages = self.truncate_messages_to_fit_context(
- messages, max_context_size, maximum_output_token
- )
- perf_timing.measure("truncate_messages_to_fit_context")
+ if (
+ limit_result.conversation_history_compacted
+ and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+ ):
+ tool_calls = []

  logging.debug(f"sending messages={messages}\n\ntools={tools}")
  try:
@@ -812,7 +791,6 @@
  # Log cost information for this iteration (no accumulation in streaming)
  _process_cost_info(full_response, log_prefix="LLM iteration")

- perf_timing.measure("llm.completion")
  # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
  except BadRequestError as e:
  if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -834,9 +812,10 @@

  if incorrect_tool_call:
  logging.warning(
- "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
+ "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
  )
  # disable structured output going forward and and retry
+ sentry_helper.capture_structured_output_incorrect_tool_call()
  response_format = None
  max_steps = max_steps + 1
  continue
@@ -847,11 +826,25 @@
  )
  )

+ tokens = self.llm.count_tokens(messages=messages, tools=tools)
+ add_token_count_to_metadata(
+ tokens=tokens,
+ full_llm_response=full_response,
+ max_context_size=limit_result.max_context_size,
+ maximum_output_token=limit_result.maximum_output_token,
+ metadata=metadata,
+ )
+ yield build_stream_event_token_count(metadata=metadata)
+
  tools_to_call = getattr(response_message, "tool_calls", None)
  if not tools_to_call:
  yield StreamMessage(
  event=StreamEvents.ANSWER_END,
- data={"content": response_message.content, "messages": messages},
+ data={
+ "content": response_message.content,
+ "messages": messages,
+ "metadata": metadata,
+ },
  )
  return
@@ -860,14 +853,22 @@
  if reasoning or message:
  yield StreamMessage(
  event=StreamEvents.AI_MESSAGE,
- data={"content": message, "reasoning": reasoning},
+ data={
+ "content": message,
+ "reasoning": reasoning,
+ "metadata": metadata,
+ },
  )

- perf_timing.measure("pre-tool-calls")
+ # Check if any tools require approval first
+ pending_approvals = []
+ approval_required_tools = []
+
  with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
  futures = []
  for tool_index, t in enumerate(tools_to_call, 1): # type: ignore
  tool_number = tool_number_offset + tool_index
+
  future = executor.submit(
  self._invoke_llm_tool_call,
  tool_to_call=t, # type: ignore
@@ -884,15 +885,72 @@
  for future in concurrent.futures.as_completed(futures):
  tool_call_result: ToolCallResult = future.result()

- tool_calls.append(tool_call_result.as_tool_result_response())
- messages.append(tool_call_result.as_tool_call_message())
+ if (
+ tool_call_result.result.status
+ == StructuredToolResultStatus.APPROVAL_REQUIRED
+ ):
+ if enable_tool_approval:
+ pending_approvals.append(
+ PendingToolApproval(
+ tool_call_id=tool_call_result.tool_call_id,
+ tool_name=tool_call_result.tool_name,
+ description=tool_call_result.description,
+ params=tool_call_result.result.params or {},
+ )
+ )
+ approval_required_tools.append(tool_call_result)
+
+ yield StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_call_result.as_streaming_tool_result_response(),
+ )
+ else:
+ tool_call_result.result.status = (
+ StructuredToolResultStatus.ERROR
+ )
+ tool_call_result.result.error = f"Tool call rejected for security reasons: {tool_call_result.result.error}"
+
+ tool_calls.append(
+ tool_call_result.as_tool_result_response()
+ )
+ messages.append(tool_call_result.as_tool_call_message())
+
+ yield StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_call_result.as_streaming_tool_result_response(),
+ )
+
+ else:
+ tool_calls.append(tool_call_result.as_tool_result_response())
+ messages.append(tool_call_result.as_tool_call_message())
+
+ yield StreamMessage(
+ event=StreamEvents.TOOL_RESULT,
+ data=tool_call_result.as_streaming_tool_result_response(),
+ )

- perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+ # If we have approval required tools, end the stream with pending approvals
+ if pending_approvals:
+ # Add assistant message with pending tool calls
+ for result in approval_required_tools:
+ tool_call = self.find_assistant_tool_call_request(
+ tool_call_id=result.tool_call_id, messages=messages
+ )
+ tool_call["pending_approval"] = True

+ # End stream with approvals required
  yield StreamMessage(
- event=StreamEvents.TOOL_RESULT,
- data=tool_call_result.as_streaming_tool_result_response(),
+ event=StreamEvents.APPROVAL_REQUIRED,
+ data={
+ "content": None,
+ "messages": messages,
+ "pending_approvals": [
+ approval.model_dump() for approval in pending_approvals
+ ],
+ "requires_approval": True,
+ },
  )
+ return

  # Update the tool number offset for the next iteration
  tool_number_offset += len(tools_to_call)
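A hedged sketch of a client consuming call_stream with the approval flow added above. The StreamEvents members used (AI_MESSAGE, TOOL_RESULT, APPROVAL_REQUIRED, ANSWER_END) and the data keys all appear in the code above; `llm`, `system_prompt`, and the exact client-side handling are assumptions:

    # Illustrative consumer; not part of the package.
    from holmes.utils.stream import StreamEvents

    pending = None
    for msg in llm.call_stream(
        system_prompt=system_prompt,
        user_prompt="why is my pod crashing?",
        enable_tool_approval=True,
    ):
        if msg.event == StreamEvents.TOOL_RESULT:
            print("tool:", msg.data["name"], msg.data["result"]["status"])
        elif msg.event == StreamEvents.APPROVAL_REQUIRED:
            pending = msg.data  # carries "messages" and "pending_approvals" for the resume call
            break
        elif msg.event == StreamEvents.ANSWER_END:
            print(msg.data["content"])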
@@ -901,6 +959,21 @@
  f"Too many LLM calls - exceeded max_steps: {i}/{self.max_steps}"
  )

+ def find_assistant_tool_call_request(
+ self, tool_call_id: str, messages: list[dict[str, Any]]
+ ) -> dict[str, Any]:
+ for message in messages:
+ if message.get("role") == "assistant":
+ for tool_call in message.get("tool_calls", []):
+ if tool_call.get("id") == tool_call_id:
+ return tool_call
+
+ # Should not happen unless there is a bug.
+ # If we are here
+ raise Exception(
+ f"Failed to find assistant request for a tool_call in conversation history. tool_call_id={tool_call_id}"
+ )
+

  # TODO: consider getting rid of this entirely and moving templating into the cmds in holmes_cli.py
  class IssueInvestigator(ToolCallingLLM):
@@ -927,14 +1000,13 @@
  self,
  issue: Issue,
  prompt: str,
- instructions: Optional[ResourceInstructions],
  console: Optional[Console] = None,
  global_instructions: Optional[Instructions] = None,
- post_processing_prompt: Optional[str] = None,
  sections: Optional[InputSectionsDataType] = None,
  trace_span=DummySpan(),
+ runbooks: Optional[RunbookCatalog] = None,
  ) -> LLMResult:
- runbooks = self.runbook_manager.get_instructions_for_issue(issue)
+ issue_runbooks = self.runbook_manager.get_instructions_for_issue(issue)

  request_structured_output_from_llm = True
  response_format = None
@@ -962,12 +1034,9 @@
  else:
  logging.info("Structured output is disabled for this request")

- if instructions is not None and instructions.instructions:
- runbooks.extend(instructions.instructions)
-
  if console and runbooks:
  console.print(
- f"[bold]Analyzing with {len(runbooks)} runbooks: {runbooks}[/bold]"
+ f"[bold]Analyzing with {len(issue_runbooks)} runbooks: {issue_runbooks}[/bold]"
  )
  elif console:
  console.print(
@@ -982,29 +1051,22 @@
  "structured_output": request_structured_output_from_llm,
  "toolsets": self.tool_executor.toolsets,
  "cluster_name": self.cluster_name,
+ "runbooks_enabled": True if runbooks else False,
  },
  )

- if instructions is not None and len(instructions.documents) > 0:
- docPrompts = []
- for document in instructions.documents:
- docPrompts.append(
- f"* fetch information from this URL: {document.url}\n"
- )
- runbooks.extend(docPrompts)
-
- user_prompt = ""
- if runbooks:
- for runbook_str in runbooks:
- user_prompt += f"* {runbook_str}\n"
+ base_user = ""
+ base_user = f"{base_user}\n #This is context from the issue:\n{issue.raw}"

- user_prompt = f'My instructions to check \n"""{user_prompt}"""'
-
- user_prompt = add_global_instructions_to_user_prompt(
- user_prompt, global_instructions
+ runbooks_ctx = generate_runbooks_args(
+ runbook_catalog=runbooks,
+ global_instructions=global_instructions,
+ issue_instructions=issue_runbooks,
+ )
+ user_prompt = generate_user_prompt(
+ base_user,
+ runbooks_ctx,
  )
- user_prompt = f"{user_prompt}\n This is context from the issue {issue.raw}"
-
  logging.debug(
  "Rendered system prompt:\n%s", textwrap.indent(system_prompt, " ")
  )
@@ -1013,10 +1075,9 @@
  res = self.prompt_call(
  system_prompt,
  user_prompt,
- post_processing_prompt,
  response_format=response_format,
  sections=sections,
  trace_span=trace_span,
  )
- res.instructions = runbooks
+ res.instructions = issue_runbooks
  return res
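A hedged migration sketch for callers of IssueInvestigator.investigate(), reflecting only the signature changes visible in the hunks above: the `instructions` and `post_processing_prompt` parameters are gone and an optional RunbookCatalog is passed instead. The `investigator`, `issue`, `console`, `global_instructions`, and `runbook_catalog` variables, and the prompt path value, are illustrative assumptions:

    # Assumed caller-side usage in 0.18.x; not taken from the package itself.
    from holmes.plugins.runbooks import RunbookCatalog

    result = investigator.investigate(
        issue=issue,
        prompt="builtin://generic_investigation.jinja2",  # assumed prompt reference
        console=console,
        global_instructions=global_instructions,
        sections=None,
        runbooks=runbook_catalog,  # Optional[RunbookCatalog], new in this version
    )
    print(result.result, result.num_llm_calls)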