holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +17 -4
- holmes/common/env_vars.py +40 -1
- holmes/config.py +114 -144
- holmes/core/conversations.py +53 -14
- holmes/core/feedback.py +191 -0
- holmes/core/investigation.py +18 -22
- holmes/core/llm.py +489 -88
- holmes/core/models.py +103 -1
- holmes/core/openai_formatting.py +13 -0
- holmes/core/prompt.py +1 -1
- holmes/core/safeguards.py +4 -4
- holmes/core/supabase_dal.py +293 -100
- holmes/core/tool_calling_llm.py +423 -323
- holmes/core/tools.py +311 -33
- holmes/core/tools_utils/token_counting.py +14 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
- holmes/core/tools_utils/tool_executor.py +13 -8
- holmes/core/toolset_manager.py +155 -4
- holmes/core/tracing.py +6 -1
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +62 -0
- holmes/core/transformers/llm_summarize.py +174 -0
- holmes/core/transformers/registry.py +122 -0
- holmes/core/transformers/transformer.py +31 -0
- holmes/core/truncation/compaction.py +59 -0
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/core/truncation/input_context_window_limiter.py +218 -0
- holmes/interactive.py +177 -24
- holmes/main.py +7 -4
- holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
- holmes/plugins/prompts/generic_ask.jinja2 +2 -4
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
- holmes/plugins/runbooks/__init__.py +117 -18
- holmes/plugins/runbooks/catalog.json +2 -0
- holmes/plugins/toolsets/__init__.py +21 -8
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
- holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
- holmes/plugins/toolsets/bash/common/bash.py +7 -7
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
- holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
- holmes/plugins/toolsets/git.py +51 -46
- holmes/plugins/toolsets/grafana/common.py +15 -3
- holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
- holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
- holmes/plugins/toolsets/internet/internet.py +6 -7
- holmes/plugins/toolsets/internet/notion.py +5 -6
- holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
- holmes/plugins/toolsets/kafka.py +25 -36
- holmes/plugins/toolsets/kubernetes.yaml +58 -84
- holmes/plugins/toolsets/kubernetes_logs.py +6 -6
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
- holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
- holmes/plugins/toolsets/newrelic/__init__.py +0 -0
- holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
- holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
- holmes/plugins/toolsets/openshift.yaml +283 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
- holmes/plugins/toolsets/robusta/robusta.py +236 -65
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
- holmes/utils/env.py +7 -0
- holmes/utils/global_instructions.py +75 -10
- holmes/utils/holmes_status.py +2 -1
- holmes/utils/holmes_sync_toolsets.py +0 -2
- holmes/utils/krr_utils.py +188 -0
- holmes/utils/sentry_helper.py +41 -0
- holmes/utils/stream.py +61 -7
- holmes/version.py +34 -14
- holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
- holmes/core/performance_timing.py +0 -72
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
- holmes/plugins/toolsets/newrelic.py +0 -231
- holmes/plugins/toolsets/servicenow/install.md +0 -37
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
- holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
holmes/core/tool_calling_llm.py
CHANGED
```diff
@@ -2,8 +2,13 @@ import concurrent.futures
 import json
 import logging
 import textwrap
-from typing import Dict, List, Optional, Type, Union, Callable
+from typing import Dict, List, Optional, Type, Union, Callable, Any
 
+from holmes.core.models import (
+    ToolApprovalDecision,
+    ToolCallResult,
+    PendingToolApproval,
+)
 
 import sentry_sdk
 from openai import BadRequestError
@@ -14,8 +19,8 @@ from pydantic import BaseModel, Field
 from rich.console import Console
 
 from holmes.common.env_vars import (
+    RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
     TEMPERATURE,
-    MAX_OUTPUT_TOKEN_RESERVATION,
     LOG_LLM_USAGE_RESPONSE,
 )
 
@@ -28,21 +33,37 @@ from holmes.core.investigation_structured_output import (
 )
 from holmes.core.issue import Issue
 from holmes.core.llm import LLM
-from holmes.core.performance_timing import PerformanceTiming
 from holmes.core.resource_instruction import ResourceInstructions
 from holmes.core.runbooks import RunbookManager
 from holmes.core.safeguards import prevent_overly_repeated_tool_call
-from holmes.core.tools import
+from holmes.core.tools import (
+    StructuredToolResult,
+    StructuredToolResultStatus,
+    ToolInvokeContext,
+)
+from holmes.core.tools_utils.tool_context_window_limiter import (
+    prevent_overly_big_tool_response,
+)
+from holmes.core.truncation.input_context_window_limiter import (
+    limit_input_context_window,
+)
 from holmes.plugins.prompts import load_and_render_prompt
+from holmes.plugins.runbooks import RunbookCatalog
+from holmes.utils import sentry_helper
 from holmes.utils.global_instructions import (
     Instructions,
-
+    add_runbooks_to_user_prompt,
 )
 from holmes.utils.tags import format_tags_in_string, parse_messages_tags
 from holmes.core.tools_utils.tool_executor import ToolExecutor
 from holmes.core.tracing import DummySpan
 from holmes.utils.colors import AI_COLOR
-from holmes.utils.stream import
+from holmes.utils.stream import (
+    StreamEvents,
+    StreamMessage,
+    add_token_count_to_metadata,
+    build_stream_event_token_count,
+)
 
 # Create a named logger for cost tracking
 cost_logger = logging.getLogger("holmes.costs")
@@ -119,148 +140,6 @@ def _process_cost_info(
         logging.debug(f"Could not extract cost information: {e}")
 
 
-def format_tool_result_data(tool_result: StructuredToolResult) -> str:
-    tool_response = tool_result.data
-    if isinstance(tool_result.data, str):
-        tool_response = tool_result.data
-    else:
-        try:
-            if isinstance(tool_result.data, BaseModel):
-                tool_response = tool_result.data.model_dump_json(indent=2)
-            else:
-                tool_response = json.dumps(tool_result.data, indent=2)
-        except Exception:
-            tool_response = str(tool_result.data)
-    if tool_result.status == ToolResultStatus.ERROR:
-        tool_response = f"{tool_result.error or 'Tool execution failed'}:\n\n{tool_result.data or ''}".strip()
-    return tool_response
-
-
-# TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
-# However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
-# We should fix this in the future
-# TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
-# token truncation and not character truncation
-def truncate_messages_to_fit_context(
-    messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
-) -> list:
-    """
-    Helper function to truncate tool messages to fit within context limits.
-
-    Args:
-        messages: List of message dictionaries with roles and content
-        max_context_size: Maximum context window size for the model
-        maximum_output_token: Maximum tokens reserved for model output
-        count_tokens_fn: Function to count tokens for a list of messages
-
-    Returns:
-        Modified list of messages with truncated tool responses
-
-    Raises:
-        Exception: If non-tool messages exceed available context space
-    """
-    messages_except_tools = [
-        message for message in messages if message["role"] != "tool"
-    ]
-    message_size_without_tools = count_tokens_fn(messages_except_tools)
-
-    tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
-    reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
-    if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
-        logging.error(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
-        )
-        raise Exception(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
-        )
-
-    if len(tool_call_messages) == 0:
-        return messages
-
-    available_space = (
-        max_context_size - message_size_without_tools - maximum_output_token
-    )
-    remaining_space = available_space
-    tool_call_messages.sort(key=lambda x: len(x["content"]))
-
-    # Allocate space starting with small tools and going to larger tools, while maintaining fairness
-    # Small tools can often get exactly what they need, while larger tools may need to be truncated
-    # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
-    for i, msg in enumerate(tool_call_messages):
-        remaining_tools = len(tool_call_messages) - i
-        max_allocation = remaining_space // remaining_tools
-        needed_space = len(msg["content"])
-        allocated_space = min(needed_space, max_allocation)
-
-        if needed_space > allocated_space:
-            truncation_notice = "\n\n[TRUNCATED]"
-            # Ensure the indicator fits in the allocated space
-            if allocated_space > len(truncation_notice):
-                msg["content"] = (
-                    msg["content"][: allocated_space - len(truncation_notice)]
-                    + truncation_notice
-                )
-                logging.info(
-                    f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space-len(truncation_notice)} tokens"
-                )
-            else:
-                msg["content"] = truncation_notice[:allocated_space]
-                logging.info(
-                    f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space} tokens"
-                )
-            msg.pop("token_count", None)  # Remove token_count if present
-
-        remaining_space -= allocated_space
-    return messages
-
-
-class ToolCallResult(BaseModel):
-    tool_call_id: str
-    tool_name: str
-    description: str
-    result: StructuredToolResult
-    size: Optional[int] = None
-
-    def as_tool_call_message(self):
-        content = format_tool_result_data(self.result)
-        if self.result.params:
-            content = (
-                f"Params used for the tool call: {json.dumps(self.result.params)}. The tool call output follows on the next line.\n"
-                + content
-            )
-        return {
-            "tool_call_id": self.tool_call_id,
-            "role": "tool",
-            "name": self.tool_name,
-            "content": content,
-        }
-
-    def as_tool_result_response(self):
-        result_dump = self.result.model_dump()
-        result_dump["data"] = self.result.get_stringified_data()
-
-        return {
-            "tool_call_id": self.tool_call_id,
-            "tool_name": self.tool_name,
-            "description": self.description,
-            "role": "tool",
-            "result": result_dump,
-        }
-
-    def as_streaming_tool_result_response(self):
-        result_dump = self.result.model_dump()
-        result_dump["data"] = self.result.get_stringified_data()
-
-        return {
-            "tool_call_id": self.tool_call_id,
-            "role": "tool",
-            "description": self.description,
-            "name": self.tool_name,
-            "result": result_dump,
-        }
-
-
 class LLMResult(LLMCosts):
     tool_calls: Optional[List[ToolCallResult]] = None
     result: Optional[str] = None
@@ -269,6 +148,7 @@ class LLMResult(LLMCosts):
     # TODO: clean up these two
     prompt: Optional[str] = None
     messages: Optional[List[dict]] = None
+    metadata: Optional[Dict[Any, Any]] = None
 
     def get_tool_usage_summary(self):
         return "AI used info from issue and " + ",".join(
@@ -276,6 +156,12 @@ class LLMResult(LLMCosts):
         )
 
 
+class ToolCallWithDecision(BaseModel):
+    message_index: int
+    tool_call: ChatCompletionMessageToolCall
+    decision: Optional[ToolApprovalDecision]
+
+
 class ToolCallingLLM:
     llm: LLM
 
@@ -290,6 +176,95 @@ class ToolCallingLLM:
         Callable[[StructuredToolResult], tuple[bool, Optional[str]]]
     ] = None
 
+    def process_tool_decisions(
+        self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
+    ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
+        """
+        Process tool approval decisions and execute approved tools.
+
+        Args:
+            messages: Current conversation messages
+            tool_decisions: List of ToolApprovalDecision objects
+
+        Returns:
+            Updated messages list with tool execution results
+        """
+        events: list[StreamMessage] = []
+        if not tool_decisions:
+            return messages, events
+
+        # Create decision lookup
+        decisions_by_tool_call_id = {
+            decision.tool_call_id: decision for decision in tool_decisions
+        }
+
+        pending_tool_calls: list[ToolCallWithDecision] = []
+
+        for i in reversed(range(len(messages))):
+            msg = messages[i]
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                message_tool_calls = msg.get("tool_calls", [])
+                for tool_call in message_tool_calls:
+                    decision = decisions_by_tool_call_id.get(tool_call.get("id"), None)
+                    if tool_call.get("pending_approval"):
+                        del tool_call[
+                            "pending_approval"
+                        ]  # Cleanup so that a pending approval is not tagged on message in a future response
+                        pending_tool_calls.append(
+                            ToolCallWithDecision(
+                                tool_call=ChatCompletionMessageToolCall(**tool_call),
+                                decision=decision,
+                                message_index=i,
+                            )
+                        )
+
+        if not pending_tool_calls:
+            error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
+            logging.error(error_message)
+            raise Exception(error_message)
+        for tool_call_with_decision in pending_tool_calls:
+            tool_call_message: dict
+            tool_call = tool_call_with_decision.tool_call
+            decision = tool_call_with_decision.decision
+            tool_result: Optional[ToolCallResult] = None
+            if decision and decision.approved:
+                tool_result = self._invoke_llm_tool_call(
+                    tool_to_call=tool_call,
+                    previous_tool_calls=[],
+                    trace_span=DummySpan(),  # TODO: replace with proper span
+                    tool_number=None,
+                    user_approved=True,
+                )
+            else:
+                # Tool was rejected or no decision found, add rejection message
+                tool_result = ToolCallResult(
+                    tool_call_id=tool_call.id,
+                    tool_name=tool_call.function.name,
+                    description=tool_call.function.name,
+                    result=StructuredToolResult(
+                        status=StructuredToolResultStatus.ERROR,
+                        error="Tool execution was denied by the user.",
+                    ),
+                )
+
+            events.append(
+                StreamMessage(
+                    event=StreamEvents.TOOL_RESULT,
+                    data=tool_result.as_streaming_tool_result_response(),
+                )
+            )
+
+            tool_call_message = tool_result.as_tool_call_message()
+
+            # It is expected that the tool call result directly follows the tool call request from the LLM
+            # The API call may contain a user ask which is appended to the messages so we can't just append
+            # tool call results; they need to be inserted right after the llm's message requesting tool calls
+            messages.insert(
+                tool_call_with_decision.message_index + 1, tool_call_message
+            )
+
+        return messages, events
+
     def prompt_call(
         self,
         system_prompt: str,
@@ -334,36 +309,35 @@ class ToolCallingLLM:
         trace_span=DummySpan(),
         tool_number_offset: int = 0,
     ) -> LLMResult:
-
-
+        tool_calls: list[
+            dict
+        ] = []  # Used for preventing repeated tool calls. potentially reset after compaction
+        all_tool_calls = []  # type: ignore
         costs = LLMCosts()
-
         tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         i = 0
-
+        metadata: Dict[Any, Any] = {}
        while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")
             # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None
 
-
-
-
-
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
 
-            if (
-
-
-
-
-                perf_timing.measure("truncate_messages_to_fit_context")
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
             logging.debug(f"sending messages={messages}\n\ntools={tools}")
 
@@ -381,7 +355,6 @@
                 # Extract and accumulate cost information
                 _process_cost_info(full_response, costs, "LLM call")
 
-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -405,9 +378,10 @@
 
                 if incorrect_tool_call:
                     logging.warning(
-                        "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-
+                        "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                     )
                     # disable structured output going forward and and retry
+                    sentry_helper.capture_structured_output_incorrect_tool_call()
                    response_format = None
                    max_steps = max_steps + 1
                    continue
@@ -424,8 +398,8 @@
                 hasattr(response_message, "reasoning_content")
                 and response_message.reasoning_content
             ):
-                logging.
-                    f"[
+                logging.info(
+                    f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
                 )
 
             if not tools_to_call:
@@ -443,23 +417,33 @@
                     )
                     costs.total_cost += post_processing_cost
 
-
+                    tokens = self.llm.count_tokens(messages=messages, tools=tools)
+
+                    add_token_count_to_metadata(
+                        tokens=tokens,
+                        full_llm_response=full_response,
+                        max_context_size=limit_result.max_context_size,
+                        maximum_output_token=limit_result.maximum_output_token,
+                        metadata=metadata,
+                    )
+
                     return LLMResult(
                         result=post_processed_response,
                         unprocessed_result=raw_response,
-                        tool_calls=
+                        tool_calls=all_tool_calls,
                         prompt=json.dumps(messages, indent=2),
                         messages=messages,
                         **costs.model_dump(),  # Include all cost fields
+                        metadata=metadata,
                    )
 
-                perf_timing.end(f"- completed in {i} iterations -")
                 return LLMResult(
                     result=text_response,
-                    tool_calls=
+                    tool_calls=all_tool_calls,
                     prompt=json.dumps(messages, indent=2),
                     messages=messages,
                     **costs.model_dump(),  # Include all cost fields
+                    metadata=metadata,
                 )
 
             if text_response and text_response.strip():
@@ -467,7 +451,6 @@
                 logging.info(
                     f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
                 )
-            perf_timing.measure("pre-tool-calls")
             with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
                 futures = []
                 futures_tool_numbers: dict[
@@ -477,6 +460,7 @@
                 for tool_index, t in enumerate(tools_to_call, 1):
                     logging.debug(f"Tool to call: {t}")
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,
@@ -495,14 +479,27 @@
                         if future in futures_tool_numbers
                         else None
                     )
-                    tool_call_result = self.handle_tool_call_approval(
-                        tool_call_result=tool_call_result, tool_number=tool_number
-                    )
 
-
+                    if (
+                        tool_call_result.result.status
+                        == StructuredToolResultStatus.APPROVAL_REQUIRED
+                    ):
+                        with trace_span.start_span(type="tool") as tool_span:
+                            tool_call_result = self._handle_tool_call_approval(
+                                tool_call_result=tool_call_result,
+                                tool_number=tool_number,
+                            )
+                            ToolCallingLLM._log_tool_call_result(
+                                tool_span, tool_call_result
+                            )
+
+                    tool_result_response_dict = (
+                        tool_call_result.as_tool_result_response()
+                    )
+                    tool_calls.append(tool_result_response_dict)
+                    all_tool_calls.append(tool_result_response_dict)
                     messages.append(tool_call_result.as_tool_call_message())
-
-                    perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+                    tokens = self.llm.count_tokens(messages=messages, tools=tools)
 
             # Update the tool number offset for the next iteration
             tool_number_offset += len(tools_to_call)
@@ -513,91 +510,52 @@
 
         raise Exception(f"Too many LLM calls - exceeded max_steps: {i}/{max_steps}")
 
-    def
+    def _directly_invoke_tool_call(
         self,
         tool_name: str,
         tool_params: dict,
         user_approved: bool,
-        trace_span=DummySpan(),
         tool_number: Optional[int] = None,
     ) -> StructuredToolResult:
-        tool_span = trace_span.start_span(name=tool_name, type="tool")
         tool = self.tool_executor.get_tool_by_name(tool_name)
-
+        if not tool:
+            logging.warning(
+                f"Skipping tool execution for {tool_name}: args: {tool_params}"
+            )
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=f"Failed to find tool {tool_name}",
+                params=tool_params,
+            )
+
         try:
-
-
-
-
-
-
-
-                params=tool_params,
-            )
-        else:
-            tool_response = tool.invoke(
-                tool_params, tool_number=tool_number, user_approved=user_approved
-            )
+            invoke_context = ToolInvokeContext(
+                tool_number=tool_number,
+                user_approved=user_approved,
+                llm=self.llm,
+                max_token_count=self.llm.get_max_token_count_for_single_tool(),
+            )
+            tool_response = tool.invoke(tool_params, context=invoke_context)
         except Exception as e:
             logging.error(
                 f"Tool call to {tool_name} failed with an Exception", exc_info=True
             )
             tool_response = StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Tool call failed: {e}",
                 params=tool_params,
             )
-
-            # Log error to trace span
-            tool_span.log(
-                input=tool_params, output=str(e), metadata={"status": "ERROR"}
-            )
-
-        tool_span.log(
-            input=tool_params,
-            output=tool_response.data,
-            metadata={
-                "status": tool_response.status.value,
-                "error": tool_response.error,
-                "description": tool.get_parameterized_one_liner(tool_params)
-                if tool
-                else "",
-                "structured_tool_result": tool_response,
-            },
-        )
-        tool_span.end()
-
         return tool_response
 
-    def
+    def _get_tool_call_result(
         self,
-
+        tool_call_id: str,
+        tool_name: str,
+        tool_arguments: str,
+        user_approved: bool,
         previous_tool_calls: list[dict],
-
-        tool_number=None,
+        tool_number: Optional[int] = None,
     ) -> ToolCallResult:
-        # Handle the union type - ChatCompletionMessageToolCall can be either
-        # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
-        # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
-        # We use hasattr to check for the 'function' attribute as it's more flexible
-        # and doesn't require importing the specific type.
-        if hasattr(tool_to_call, "function"):
-            tool_name = tool_to_call.function.name
-            tool_arguments = tool_to_call.function.arguments
-        else:
-            # This is a custom tool call - we don't support these currently
-            logging.error(f"Unsupported custom tool call: {tool_to_call}")
-            return ToolCallResult(
-                tool_call_id=tool_to_call.id,
-                tool_name="unknown",
-                description="NA",
-                result=StructuredToolResult(
-                    status=ToolResultStatus.ERROR,
-                    error="Custom tool calls are not supported",
-                    params=None,
-                ),
-            )
-
         tool_params = {}
         try:
             tool_params = json.loads(tool_arguments)
@@ -606,20 +564,19 @@
             f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
             )
 
-
-
-
-
-
-
-
+        tool_response = None
+        if not user_approved:
+            tool_response = prevent_overly_repeated_tool_call(
+                tool_name=tool_name,
+                tool_params=tool_params,
+                tool_calls=previous_tool_calls,
+            )
 
         if not tool_response:
-            tool_response = self.
+            tool_response = self._directly_invoke_tool_call(
                 tool_name=tool_name,
                 tool_params=tool_params,
-                user_approved=
-                trace_span=trace_span,
+                user_approved=user_approved,
                 tool_number=tool_number,
             )
 
@@ -629,38 +586,103 @@
             f"Tool {tool_name} return type is not StructuredToolResult. Nesting the tool result into StructuredToolResult..."
             )
             tool_response = StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=tool_response,
                 params=tool_params,
             )
 
         tool = self.tool_executor.get_tool_by_name(tool_name)
+
         return ToolCallResult(
             tool_call_id=tool_call_id,
             tool_name=tool_name,
-            description=tool.get_parameterized_one_liner(tool_params)
+            description=str(tool.get_parameterized_one_liner(tool_params))
+            if tool
+            else "",
             result=tool_response,
         )
 
-
-
+
+    @staticmethod
+    def _log_tool_call_result(tool_span, tool_call_result: ToolCallResult):
+        tool_span.set_attributes(name=tool_call_result.tool_name)
+        tool_span.log(
+            input=tool_call_result.result.params,
+            output=tool_call_result.result.data,
+            error=tool_call_result.result.error,
+            metadata={
+                "status": tool_call_result.result.status,
+                "description": tool_call_result.description,
+            },
+        )
+
+    def _invoke_llm_tool_call(
+        self,
+        tool_to_call: ChatCompletionMessageToolCall,
+        previous_tool_calls: list[dict],
+        trace_span=None,
+        tool_number=None,
+        user_approved: bool = False,
+    ) -> ToolCallResult:
+        if trace_span is None:
+            trace_span = DummySpan()
+        with trace_span.start_span(type="tool") as tool_span:
+            if not hasattr(tool_to_call, "function"):
+                # Handle the union type - ChatCompletionMessageToolCall can be either
+                # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
+                # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
+                # We use hasattr to check for the 'function' attribute as it's more flexible
+                # and doesn't require importing the specific type.
+                tool_name = "Unknown_Custom_Tool"
+                logging.error(f"Unsupported custom tool call: {tool_to_call}")
+                tool_call_result = ToolCallResult(
+                    tool_call_id=tool_to_call.id,
+                    tool_name=tool_name,
+                    description="NA",
+                    result=StructuredToolResult(
+                        status=StructuredToolResultStatus.ERROR,
+                        error="Custom tool calls are not supported",
+                        params=None,
+                    ),
+                )
+            else:
+                tool_name = tool_to_call.function.name
+                tool_arguments = tool_to_call.function.arguments
+                tool_id = tool_to_call.id
+                tool_call_result = self._get_tool_call_result(
+                    tool_id,
+                    tool_name,
+                    tool_arguments,
+                    previous_tool_calls=previous_tool_calls,
+                    tool_number=tool_number,
+                    user_approved=user_approved,
+                )
+
+            prevent_overly_big_tool_response(
+                tool_call_result=tool_call_result, llm=self.llm
+            )
+
+            ToolCallingLLM._log_tool_call_result(tool_span, tool_call_result)
+            return tool_call_result
+
+    def _handle_tool_call_approval(
+        self,
+        tool_call_result: ToolCallResult,
+        tool_number: Optional[int],
     ) -> ToolCallResult:
         """
         Handle approval for a single tool call if required.
 
         Args:
             tool_call_result: A single tool call result that may require approval
+            tool_number: The tool call number
 
         Returns:
             Updated tool call result with approved/denied status
         """
 
-        if tool_call_result.result.status != ToolResultStatus.APPROVAL_REQUIRED:
-            return tool_call_result
-
         # If no approval callback, convert to ERROR because it is assumed the client may not be able to handle approvals
         if not self.approval_callback:
-            tool_call_result.result.status =
+            tool_call_result.result.status = StructuredToolResultStatus.ERROR
             return tool_call_result
 
         # Get approval from user
@@ -670,19 +692,17 @@
             logging.debug(
                 f"User approved command: {tool_call_result.result.invocation}"
             )
-
-            new_response = self._directly_invoke_tool(
+            new_response = self._directly_invoke_tool_call(
                 tool_name=tool_call_result.tool_name,
                 tool_params=tool_call_result.result.params or {},
                 user_approved=True,
-                trace_span=DummySpan(),
                 tool_number=tool_number,
             )
             tool_call_result.result = new_response
         else:
             # User denied - update to error
             feedback_text = f" User feedback: {feedback}" if feedback else ""
-            tool_call_result.result.status =
+            tool_call_result.result.status = StructuredToolResultStatus.ERROR
             tool_call_result.result.error = (
                 f"User denied command execution.{feedback_text}"
             )
@@ -737,17 +757,6 @@
             logging.exception("Failed to run post processing", exc_info=True)
             return investigation, 0.0
 
-    @sentry_sdk.trace
-    def truncate_messages_to_fit_context(
-        self, messages: list, max_context_size: int, maximum_output_token: int
-    ) -> list:
-        return truncate_messages_to_fit_context(
-            messages,
-            max_context_size,
-            maximum_output_token,
-            self.llm.count_tokens_for_message,
-        )
-
     def call_stream(
         self,
         system_prompt: str = "",
@@ -755,47 +764,55 @@
         response_format: Optional[Union[dict, Type[BaseModel]]] = None,
         sections: Optional[InputSectionsDataType] = None,
         msgs: Optional[list[dict]] = None,
+        enable_tool_approval: bool = False,
+        tool_decisions: List[ToolApprovalDecision] | None = None,
     ):
         """
         This function DOES NOT call llm.completion(stream=true).
         This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
         """
-
+
+        # Process tool decisions if provided
+        if msgs and tool_decisions:
+            logging.info(f"Processing {len(tool_decisions)} tool decisions")
+            msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+            yield from events
+
+        messages: list[dict] = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
         if user_prompt:
             messages.append({"role": "user", "content": user_prompt})
         if msgs:
             messages.extend(msgs)
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
         tool_calls: list[dict] = []
         tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
+        metadata: Dict[Any, Any] = {}
         i = 0
         tool_number_offset = 0
 
         while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")
 
             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None
 
-
-
-
-
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            yield from limit_result.events
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
 
-            if (
-
-
-
-
-                perf_timing.measure("truncate_messages_to_fit_context")
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []
 
             logging.debug(f"sending messages={messages}\n\ntools={tools}")
             try:
@@ -812,7 +829,6 @@
                 # Log cost information for this iteration (no accumulation in streaming)
                 _process_cost_info(full_response, log_prefix="LLM iteration")
 
-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -834,9 +850,10 @@
 
                 if incorrect_tool_call:
                     logging.warning(
-                        "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-
+                        "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                     )
                     # disable structured output going forward and and retry
+                    sentry_helper.capture_structured_output_incorrect_tool_call()
                     response_format = None
                     max_steps = max_steps + 1
                     continue
@@ -847,11 +864,25 @@
                 )
             )
 
+            tokens = self.llm.count_tokens(messages=messages, tools=tools)
+            add_token_count_to_metadata(
+                tokens=tokens,
+                full_llm_response=full_response,
+                max_context_size=limit_result.max_context_size,
+                maximum_output_token=limit_result.maximum_output_token,
+                metadata=metadata,
+            )
+            yield build_stream_event_token_count(metadata=metadata)
+
             tools_to_call = getattr(response_message, "tool_calls", None)
             if not tools_to_call:
                 yield StreamMessage(
                     event=StreamEvents.ANSWER_END,
-                    data={
+                    data={
+                        "content": response_message.content,
+                        "messages": messages,
+                        "metadata": metadata,
+                    },
                 )
                 return
 
@@ -860,14 +891,22 @@
             if reasoning or message:
                 yield StreamMessage(
                     event=StreamEvents.AI_MESSAGE,
-                    data={
+                    data={
+                        "content": message,
+                        "reasoning": reasoning,
+                        "metadata": metadata,
+                    },
                 )
 
-
+            # Check if any tools require approval first
+            pending_approvals = []
+            approval_required_tools = []
+
             with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
                 futures = []
                 for tool_index, t in enumerate(tools_to_call, 1):  # type: ignore
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,  # type: ignore
@@ -884,15 +923,72 @@
                 for future in concurrent.futures.as_completed(futures):
                     tool_call_result: ToolCallResult = future.result()
 
-
-
+                    if (
+                        tool_call_result.result.status
+                        == StructuredToolResultStatus.APPROVAL_REQUIRED
+                    ):
+                        if enable_tool_approval:
+                            pending_approvals.append(
+                                PendingToolApproval(
+                                    tool_call_id=tool_call_result.tool_call_id,
+                                    tool_name=tool_call_result.tool_name,
+                                    description=tool_call_result.description,
+                                    params=tool_call_result.result.params or {},
+                                )
+                            )
+                            approval_required_tools.append(tool_call_result)
+
+                            yield StreamMessage(
+                                event=StreamEvents.TOOL_RESULT,
+                                data=tool_call_result.as_streaming_tool_result_response(),
+                            )
+                        else:
+                            tool_call_result.result.status = (
+                                StructuredToolResultStatus.ERROR
+                            )
+                            tool_call_result.result.error = f"Tool call rejected for security reasons: {tool_call_result.result.error}"
+
+                            tool_calls.append(
+                                tool_call_result.as_tool_result_response()
+                            )
+                            messages.append(tool_call_result.as_tool_call_message())
+
+                            yield StreamMessage(
+                                event=StreamEvents.TOOL_RESULT,
+                                data=tool_call_result.as_streaming_tool_result_response(),
+                            )
+
+                    else:
+                        tool_calls.append(tool_call_result.as_tool_result_response())
+                        messages.append(tool_call_result.as_tool_call_message())
+
+                        yield StreamMessage(
+                            event=StreamEvents.TOOL_RESULT,
+                            data=tool_call_result.as_streaming_tool_result_response(),
+                        )
 
-
+            # If we have approval required tools, end the stream with pending approvals
+            if pending_approvals:
+                # Add assistant message with pending tool calls
+                for result in approval_required_tools:
+                    tool_call = self.find_assistant_tool_call_request(
+                        tool_call_id=result.tool_call_id, messages=messages
+                    )
+                    tool_call["pending_approval"] = True
 
+                # End stream with approvals required
                 yield StreamMessage(
-                    event=StreamEvents.
-                    data=
+                    event=StreamEvents.APPROVAL_REQUIRED,
+                    data={
+                        "content": None,
+                        "messages": messages,
+                        "pending_approvals": [
+                            approval.model_dump() for approval in pending_approvals
+                        ],
+                        "requires_approval": True,
+                    },
                 )
+                return
 
             # Update the tool number offset for the next iteration
             tool_number_offset += len(tools_to_call)
@@ -901,6 +997,21 @@
                 f"Too many LLM calls - exceeded max_steps: {i}/{self.max_steps}"
             )
 
+    def find_assistant_tool_call_request(
+        self, tool_call_id: str, messages: list[dict[str, Any]]
+    ) -> dict[str, Any]:
+        for message in messages:
+            if message.get("role") == "assistant":
+                for tool_call in message.get("tool_calls", []):
+                    if tool_call.get("id") == tool_call_id:
+                        return tool_call
+
+        # Should not happen unless there is a bug.
+        # If we are here
+        raise Exception(
+            f"Failed to find assistant request for a tool_call in conversation history. tool_call_id={tool_call_id}"
+        )
+
 
 # TODO: consider getting rid of this entirely and moving templating into the cmds in holmes_cli.py
 class IssueInvestigator(ToolCallingLLM):
@@ -933,8 +1044,9 @@ class IssueInvestigator(ToolCallingLLM):
         post_processing_prompt: Optional[str] = None,
         sections: Optional[InputSectionsDataType] = None,
         trace_span=DummySpan(),
+        runbooks: Optional[RunbookCatalog] = None,
     ) -> LLMResult:
-
+        issue_runbooks = self.runbook_manager.get_instructions_for_issue(issue)
 
         request_structured_output_from_llm = True
         response_format = None
@@ -962,12 +1074,9 @@
         else:
             logging.info("Structured output is disabled for this request")
 
-        if instructions is not None and instructions.instructions:
-            runbooks.extend(instructions.instructions)
-
         if console and runbooks:
             console.print(
-                f"[bold]Analyzing with {len(
+                f"[bold]Analyzing with {len(issue_runbooks)} runbooks: {issue_runbooks}[/bold]"
             )
         elif console:
             console.print(
@@ -982,29 +1091,20 @@
                 "structured_output": request_structured_output_from_llm,
                 "toolsets": self.tool_executor.toolsets,
                 "cluster_name": self.cluster_name,
+                "runbooks_enabled": True if runbooks else False,
             },
         )
 
-        if instructions is not None and len(instructions.documents) > 0:
-            docPrompts = []
-            for document in instructions.documents:
-                docPrompts.append(
-                    f"* fetch information from this URL: {document.url}\n"
-                )
-            runbooks.extend(docPrompts)
-
         user_prompt = ""
-        if runbooks:
-            for runbook_str in runbooks:
-                user_prompt += f"* {runbook_str}\n"
-
-            user_prompt = f'My instructions to check \n"""{user_prompt}"""'
 
-        user_prompt =
-            user_prompt,
+        user_prompt = add_runbooks_to_user_prompt(
+            user_prompt,
+            runbook_catalog=runbooks,
+            global_instructions=global_instructions,
+            issue_instructions=issue_runbooks,
+            resource_instructions=instructions,
         )
-        user_prompt = f"{user_prompt}\n This is context from the issue
-
+        user_prompt = f"{user_prompt}\n #This is context from the issue:\n{issue.raw}"
         logging.debug(
             "Rendered system prompt:\n%s", textwrap.indent(system_prompt, " ")
         )
@@ -1018,5 +1118,5 @@ class IssueInvestigator(ToolCallingLLM):
             sections=sections,
             trace_span=trace_span,
         )
-        res.instructions =
+        res.instructions = issue_runbooks
         return res
```