holmesgpt 0.14.4a0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of holmesgpt might be problematic.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +12 -10
- holmes/common/env_vars.py +22 -0
- holmes/config.py +51 -4
- holmes/core/conversations.py +3 -2
- holmes/core/llm.py +226 -72
- holmes/core/openai_formatting.py +13 -0
- holmes/core/supabase_dal.py +33 -42
- holmes/core/tool_calling_llm.py +185 -282
- holmes/core/tools.py +21 -1
- holmes/core/tools_utils/token_counting.py +2 -1
- holmes/core/tools_utils/tool_context_window_limiter.py +32 -30
- holmes/core/truncation/compaction.py +59 -0
- holmes/core/truncation/input_context_window_limiter.py +218 -0
- holmes/interactive.py +17 -7
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
- holmes/plugins/toolsets/__init__.py +4 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +0 -1
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
- holmes/plugins/toolsets/grafana/grafana_api.py +1 -1
- holmes/plugins/toolsets/investigator/core_investigation.py +34 -24
- holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +1 -1
- holmes/plugins/toolsets/robusta/robusta.py +35 -8
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +4 -3
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -1
- holmes/utils/stream.py +31 -1
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/METADATA +6 -2
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/RECORD +36 -31
- holmes/core/performance_timing.py +0 -72
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.14.4a0.dist-info → holmesgpt-0.16.0.dist-info}/entry_points.txt +0 -0
holmes/core/tool_calling_llm.py
CHANGED
@@ -7,8 +7,6 @@ from typing import Dict, List, Optional, Type, Union, Callable, Any
 from holmes.core.models import (
     ToolApprovalDecision,
     ToolCallResult,
-    TruncationResult,
-    TruncationMetadata,
     PendingToolApproval,
 )

@@ -21,8 +19,8 @@ from pydantic import BaseModel, Field
 from rich.console import Console

 from holmes.common.env_vars import (
+    RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
     TEMPERATURE,
-    MAX_OUTPUT_TOKEN_RESERVATION,
     LOG_LLM_USAGE_RESPONSE,
 )

@@ -34,8 +32,7 @@ from holmes.core.investigation_structured_output import (
     is_response_an_incorrect_tool_call,
 )
 from holmes.core.issue import Issue
-from holmes.core.llm import LLM
-from holmes.core.performance_timing import PerformanceTiming
+from holmes.core.llm import LLM
 from holmes.core.resource_instruction import ResourceInstructions
 from holmes.core.runbooks import RunbookManager
 from holmes.core.safeguards import prevent_overly_repeated_tool_call
@@ -45,9 +42,11 @@ from holmes.core.tools import (
     ToolInvokeContext,
 )
 from holmes.core.tools_utils.tool_context_window_limiter import (
-    get_max_token_count_for_single_tool,
     prevent_overly_big_tool_response,
 )
+from holmes.core.truncation.input_context_window_limiter import (
+    limit_input_context_window,
+)
 from holmes.plugins.prompts import load_and_render_prompt
 from holmes.utils import sentry_helper
 from holmes.utils.global_instructions import (
@@ -58,15 +57,17 @@ from holmes.utils.tags import format_tags_in_string, parse_messages_tags
 from holmes.core.tools_utils.tool_executor import ToolExecutor
 from holmes.core.tracing import DummySpan
 from holmes.utils.colors import AI_COLOR
-from holmes.utils.stream import
+from holmes.utils.stream import (
+    StreamEvents,
+    StreamMessage,
+    add_token_count_to_metadata,
+    build_stream_event_token_count,
+)

 # Create a named logger for cost tracking
 cost_logger = logging.getLogger("holmes.costs")


-TRUNCATION_NOTICE = "\n\n[TRUNCATED]"
-
-
 class LLMCosts(BaseModel):
     """Tracks cost and token usage for LLM calls."""

@@ -138,109 +139,6 @@ def _process_cost_info(
         logging.debug(f"Could not extract cost information: {e}")


-# TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
-# However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
-# We should fix this in the future
-# TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
-# token truncation and not character truncation
-def truncate_messages_to_fit_context(
-    messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
-) -> TruncationResult:
-    """
-    Helper function to truncate tool messages to fit within context limits.
-
-    Args:
-        messages: List of message dictionaries with roles and content
-        max_context_size: Maximum context window size for the model
-        maximum_output_token: Maximum tokens reserved for model output
-        count_tokens_fn: Function to count tokens for a list of messages
-
-    Returns:
-        Modified list of messages with truncated tool responses
-
-    Raises:
-        Exception: If non-tool messages exceed available context space
-    """
-    messages_except_tools = [
-        message for message in messages if message["role"] != "tool"
-    ]
-    message_size_without_tools = count_tokens_fn(messages_except_tools)
-
-    tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
-    reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
-    if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
-        logging.error(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
-        )
-        raise Exception(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
-        )
-
-    if len(tool_call_messages) == 0:
-        return TruncationResult(truncated_messages=messages, truncations=[])
-
-    available_space = (
-        max_context_size - message_size_without_tools - reserved_for_output_tokens
-    )
-    remaining_space = available_space
-    tool_call_messages.sort(
-        key=lambda x: count_tokens_fn([{"role": "tool", "content": x["content"]}])
-    )
-
-    truncations = []
-
-    # Allocate space starting with small tools and going to larger tools, while maintaining fairness
-    # Small tools can often get exactly what they need, while larger tools may need to be truncated
-    # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
-    for i, msg in enumerate(tool_call_messages):
-        remaining_tools = len(tool_call_messages) - i
-        max_allocation = remaining_space // remaining_tools
-        needed_space = count_tokens_fn([{"role": "tool", "content": msg["content"]}])
-        allocated_space = min(needed_space, max_allocation)
-
-        if needed_space > allocated_space:
-            truncation_metadata = _truncate_tool_message(
-                msg, allocated_space, needed_space
-            )
-            truncations.append(truncation_metadata)
-
-        remaining_space -= allocated_space
-    return TruncationResult(truncated_messages=messages, truncations=truncations)
-
-
-def _truncate_tool_message(
-    msg: dict, allocated_space: int, needed_space: int
-) -> TruncationMetadata:
-    msg_content = msg["content"]
-    tool_call_id = msg["tool_call_id"]
-    tool_name = msg["name"]
-
-    # Ensure the indicator fits in the allocated space
-    if allocated_space > len(TRUNCATION_NOTICE):
-        original = msg_content if isinstance(msg_content, str) else str(msg_content)
-        msg["content"] = (
-            original[: allocated_space - len(TRUNCATION_NOTICE)] + TRUNCATION_NOTICE
-        )
-        end_index = allocated_space - len(TRUNCATION_NOTICE)
-    else:
-        msg["content"] = TRUNCATION_NOTICE[:allocated_space]
-        end_index = allocated_space
-
-    msg.pop("token_count", None)  # Remove token_count if present
-    logging.info(
-        f"Truncating tool message '{tool_name}' from {needed_space} to {allocated_space} tokens"
-    )
-    truncation_metadata = TruncationMetadata(
-        tool_call_id=tool_call_id,
-        start_index=0,
-        end_index=end_index,
-        tool_name=tool_name,
-        original_token_count=needed_space,
-    )
-    return truncation_metadata
-
-
 class LLMResult(LLMCosts):
     tool_calls: Optional[List[ToolCallResult]] = None
     result: Optional[str] = None
@@ -257,6 +155,12 @@ class LLMResult(LLMCosts):
     )


+class ToolCallWithDecision(BaseModel):
+    message_index: int
+    tool_call: ChatCompletionMessageToolCall
+    decision: Optional[ToolApprovalDecision]
+
+
 class ToolCallingLLM:
     llm: LLM

@@ -273,7 +177,7 @@ class ToolCallingLLM:

     def process_tool_decisions(
         self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
-    ) -> List[Dict[str, Any]]:
+    ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
         """
         Process tool approval decisions and execute approved tools.

@@ -284,85 +188,81 @@
         Returns:
             Updated messages list with tool execution results
         """
-
-
-
-        pending_message_idx = None
-        pending_tool_calls = None
-
-        for i in reversed(range(len(messages))):
-            msg = messages[i]
-            if msg.get("role") == "assistant" and msg.get("pending_approval"):
-                pending_message_idx = i
-                pending_tool_calls = msg.get("tool_calls", [])
-                break
-
-        if pending_message_idx is None or not pending_tool_calls:
-            # No pending approvals found
-            if tool_decisions:
-                logging.warning(
-                    f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
-                )
-            return messages
+        events: list[StreamMessage] = []
+        if not tool_decisions:
+            return messages, events

         # Create decision lookup
-
+        decisions_by_tool_call_id = {
             decision.tool_call_id: decision for decision in tool_decisions
         }

-
-        pending_tool_ids = {tool_call["id"] for tool_call in pending_tool_calls}
-        invalid_decisions = [
-            decision.tool_call_id
-            for decision in tool_decisions
-            if decision.tool_call_id not in pending_tool_ids
-        ]
-
-        if invalid_decisions:
-            logging.warning(
-                f"Received decisions for non-pending tool calls: {invalid_decisions}"
-            )
+        pending_tool_calls: list[ToolCallWithDecision] = []

-
-
-
-
+        for i in reversed(range(len(messages))):
+            msg = messages[i]
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                message_tool_calls = msg.get("tool_calls", [])
+                for tool_call in message_tool_calls:
+                    decision = decisions_by_tool_call_id.get(tool_call.get("id"), None)
+                    if tool_call.get("pending_approval"):
+                        del tool_call[
+                            "pending_approval"
+                        ]  # Cleanup so that a pending approval is not tagged on message in a future response
+                        pending_tool_calls.append(
+                            ToolCallWithDecision(
+                                tool_call=ChatCompletionMessageToolCall(**tool_call),
+                                decision=decision,
+                                message_index=i,
+                            )
+                        )

+        if not pending_tool_calls:
+            error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
+            logging.error(error_message)
+            raise Exception(error_message)
+        for tool_call_with_decision in pending_tool_calls:
+            tool_call_message: dict
+            tool_call = tool_call_with_decision.tool_call
+            decision = tool_call_with_decision.decision
+            tool_result: Optional[ToolCallResult] = None
             if decision and decision.approved:
-
-
-
-
-
-
-
-                )
-                messages.append(llm_tool_result.as_tool_call_message())
-
-            except Exception as e:
-                logging.error(
-                    f"Failed to execute approved tool {tool_call_id}: {e}"
-                )
-                messages.append(
-                    {
-                        "tool_call_id": tool_call_id,
-                        "role": "tool",
-                        "name": tool_call["function"]["name"],
-                        "content": f"Tool execution failed: {str(e)}",
-                    }
-                )
+                tool_result = self._invoke_llm_tool_call(
+                    tool_to_call=tool_call,
+                    previous_tool_calls=[],
+                    trace_span=DummySpan(),  # TODO: replace with proper span
+                    tool_number=None,
+                    user_approved=True,
+                )
             else:
                 # Tool was rejected or no decision found, add rejection message
-
-
-
-
-
-
-
+                tool_result = ToolCallResult(
+                    tool_call_id=tool_call.id,
+                    tool_name=tool_call.function.name,
+                    description=tool_call.function.name,
+                    result=StructuredToolResult(
+                        status=StructuredToolResultStatus.ERROR,
+                        error="Tool execution was denied by the user.",
+                    ),
                 )

-
+            events.append(
+                StreamMessage(
+                    event=StreamEvents.TOOL_RESULT,
+                    data=tool_result.as_streaming_tool_result_response(),
+                )
+            )
+
+            tool_call_message = tool_result.as_tool_call_message()
+
+            # It is expected that the tool call result directly follows the tool call request from the LLM
+            # The API call may contain a user ask which is appended to the messages so we can't just append
+            # tool call results; they need to be inserted right after the llm's message requesting tool calls
+            messages.insert(
+                tool_call_with_decision.message_index + 1, tool_call_message
+            )
+
+        return messages, events

     def prompt_call(
         self,
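
Note: as the comments added in this hunk explain, a tool result has to directly follow the assistant message that requested it, and the resumed conversation may already contain a newer user message, so results are inserted rather than appended. A small illustration with plain message dicts (not HolmesGPT objects):

# Hypothetical conversation state while an approval was pending.
messages = [
    {"role": "assistant", "tool_calls": [{"id": "call_1", "type": "function"}]},
    {"role": "user", "content": "any update?"},  # arrived before the decision
]
tool_result = {"role": "tool", "tool_call_id": "call_1", "content": "pods listed"}

request_index = 0  # index of the assistant message that holds the tool call
messages.insert(request_index + 1, tool_result)
# Order is now: assistant tool-call request, its tool result, then the later
# user message - the sequence the OpenAI-style chat format expects.
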
@@ -408,40 +308,35 @@
         trace_span=DummySpan(),
         tool_number_offset: int = 0,
     ) -> LLMResult:
-
-
+        tool_calls: list[
+            dict
+        ] = []  # Used for preventing repeated tool calls. potentially reset after compaction
+        all_tool_calls = []  # type: ignore
         costs = LLMCosts()
-
         tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         i = 0
         metadata: Dict[Any, Any] = {}
         while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")
             # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None

-
-
-
-
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata

-            if (
-
-
-
-
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-            perf_timing.measure("truncate_messages_to_fit_context")
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []

             logging.debug(f"sending messages={messages}\n\ntools={tools}")

@@ -459,7 +354,6 @@
                 # Extract and accumulate cost information
                 _process_cost_info(full_response, costs, "LLM call")

-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -483,7 +377,7 @@

             if incorrect_tool_call:
                 logging.warning(
-                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-
+                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                 )
                 # disable structured output going forward and and retry
                 sentry_helper.capture_structured_output_incorrect_tool_call()
@@ -503,8 +397,8 @@
                 hasattr(response_message, "reasoning_content")
                 and response_message.reasoning_content
             ):
-                logging.
-                    f"[
+                logging.info(
+                    f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
                 )

             if not tools_to_call:
@@ -522,25 +416,29 @@
                     )
                     costs.total_cost += post_processing_cost

-                self.llm.
-
-
-
-
+                tokens = self.llm.count_tokens(messages=messages, tools=tools)
+
+                add_token_count_to_metadata(
+                    tokens=tokens,
+                    full_llm_response=full_response,
+                    max_context_size=limit_result.max_context_size,
+                    maximum_output_token=limit_result.maximum_output_token,
+                    metadata=metadata,
+                )
+
                 return LLMResult(
                     result=post_processed_response,
                     unprocessed_result=raw_response,
-                    tool_calls=
+                    tool_calls=all_tool_calls,
                     prompt=json.dumps(messages, indent=2),
                     messages=messages,
                     **costs.model_dump(),  # Include all cost fields
                     metadata=metadata,
                 )

-            perf_timing.end(f"- completed in {i} iterations -")
             return LLMResult(
                 result=text_response,
-                tool_calls=
+                tool_calls=all_tool_calls,
                 prompt=json.dumps(messages, indent=2),
                 messages=messages,
                 **costs.model_dump(),  # Include all cost fields
@@ -552,7 +450,6 @@
             logging.info(
                 f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
             )
-            perf_timing.measure("pre-tool-calls")
             with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
                 futures = []
                 futures_tool_numbers: dict[
@@ -562,6 +459,7 @@
                 for tool_index, t in enumerate(tools_to_call, 1):
                     logging.debug(f"Tool to call: {t}")
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,
@@ -594,10 +492,13 @@
                         tool_span, tool_call_result
                     )

-
+                    tool_result_response_dict = (
+                        tool_call_result.as_tool_result_response()
+                    )
+                    tool_calls.append(tool_result_response_dict)
+                    all_tool_calls.append(tool_result_response_dict)
                     messages.append(tool_call_result.as_tool_call_message())
-
-                    perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+                    tokens = self.llm.count_tokens(messages=messages, tools=tools)

             # Update the tool number offset for the next iteration
             tool_number_offset += len(tools_to_call)
@@ -631,7 +532,7 @@
                 tool_number=tool_number,
                 user_approved=user_approved,
                 llm=self.llm,
-                max_token_count=
+                max_token_count=self.llm.get_max_token_count_for_single_tool(),
             )
             tool_response = tool.invoke(tool_params, context=invoke_context)
         except Exception as e:
@@ -650,6 +551,7 @@
         tool_call_id: str,
         tool_name: str,
         tool_arguments: str,
+        user_approved: bool,
         previous_tool_calls: list[dict],
         tool_number: Optional[int] = None,
     ) -> ToolCallResult:
@@ -661,17 +563,19 @@
                 f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
             )

-        tool_response =
-
-
-
-
+        tool_response = None
+        if not user_approved:
+            tool_response = prevent_overly_repeated_tool_call(
+                tool_name=tool_name,
+                tool_params=tool_params,
+                tool_calls=previous_tool_calls,
+            )

         if not tool_response:
             tool_response = self._directly_invoke_tool_call(
                 tool_name=tool_name,
                 tool_params=tool_params,
-                user_approved=
+                user_approved=user_approved,
                 tool_number=tool_number,
             )

@@ -716,6 +620,7 @@
         previous_tool_calls: list[dict],
         trace_span=None,
         tool_number=None,
+        user_approved: bool = False,
     ) -> ToolCallResult:
         if trace_span is None:
             trace_span = DummySpan()
@@ -748,6 +653,7 @@
             tool_arguments,
             previous_tool_calls=previous_tool_calls,
             tool_number=tool_number,
+            user_approved=user_approved,
         )

         prevent_overly_big_tool_response(
@@ -850,20 +756,6 @@
             logging.exception("Failed to run post processing", exc_info=True)
             return investigation, 0.0

-    @sentry_sdk.trace
-    def truncate_messages_to_fit_context(
-        self, messages: list, max_context_size: int, maximum_output_token: int
-    ) -> TruncationResult:
-        truncated_res = truncate_messages_to_fit_context(
-            messages,
-            max_context_size,
-            maximum_output_token,
-            self.llm.count_tokens_for_message,
-        )
-        if truncated_res.truncations:
-            sentry_helper.capture_tool_truncations(truncated_res.truncations)
-        return truncated_res
-
     def call_stream(
         self,
         system_prompt: str = "",
@@ -872,11 +764,19 @@
         sections: Optional[InputSectionsDataType] = None,
         msgs: Optional[list[dict]] = None,
         enable_tool_approval: bool = False,
+        tool_decisions: List[ToolApprovalDecision] | None = None,
     ):
         """
         This function DOES NOT call llm.completion(stream=true).
         This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
         """
+
+        # Process tool decisions if provided
+        if msgs and tool_decisions:
+            logging.info(f"Processing {len(tool_decisions)} tool decisions")
+            msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+            yield from events
+
         messages: list[dict] = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
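
Note: a hedged usage sketch of the new `tool_decisions` parameter, resuming a stream after the user has answered an approval prompt. `llm_caller` stands for an already constructed ToolCallingLLM and `saved_msgs` for the messages kept from the approvals-required event; only the `tool_call_id` and `approved` fields of ToolApprovalDecision are taken from this diff, and the full model may require more.

# Assumed: llm_caller is a ToolCallingLLM, saved_msgs the prior conversation.
decisions = [ToolApprovalDecision(tool_call_id="call_9", approved=True)]
for event in llm_caller.call_stream(msgs=saved_msgs, tool_decisions=decisions):
    # Each yielded item is a StreamMessage; TOOL_RESULT events for the
    # approved or denied calls are emitted before the next LLM iteration.
    print(event.event, event.data)
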
@@ -884,12 +784,10 @@
             messages.append({"role": "user", "content": user_prompt})
         if msgs:
             messages.extend(msgs)
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
         tool_calls: list[dict] = []
         tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         metadata: Dict[Any, Any] = {}
         i = 0
@@ -897,29 +795,23 @@

         while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")

             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None

-
-
-
-
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            yield from limit_result.events
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata

-            if (
-
-
-
-
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
-            else:
-                metadata["truncations"] = []
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []

             logging.debug(f"sending messages={messages}\n\ntools={tools}")
             try:
@@ -936,7 +828,6 @@
                 # Log cost information for this iteration (no accumulation in streaming)
                 _process_cost_info(full_response, log_prefix="LLM iteration")

-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -958,7 +849,7 @@

             if incorrect_tool_call:
                 logging.warning(
-                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-
+                    "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4.1' or other structured output compatible models. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                 )
                 # disable structured output going forward and and retry
                 sentry_helper.capture_structured_output_incorrect_tool_call()
@@ -972,12 +863,18 @@
                 )
             )

+            tokens = self.llm.count_tokens(messages=messages, tools=tools)
+            add_token_count_to_metadata(
+                tokens=tokens,
+                full_llm_response=full_response,
+                max_context_size=limit_result.max_context_size,
+                maximum_output_token=limit_result.maximum_output_token,
+                metadata=metadata,
+            )
+            yield build_stream_event_token_count(metadata=metadata)
+
             tools_to_call = getattr(response_message, "tool_calls", None)
             if not tools_to_call:
-                self.llm.count_tokens_for_message(messages)
-                metadata["usage"] = get_llm_usage(full_response)
-                metadata["max_tokens"] = max_context_size
-                metadata["max_output_tokens"] = maximum_output_token
                 yield StreamMessage(
                     event=StreamEvents.ANSWER_END,
                     data={
@@ -993,11 +890,13 @@
             if reasoning or message:
                 yield StreamMessage(
                     event=StreamEvents.AI_MESSAGE,
-                    data={
+                    data={
+                        "content": message,
+                        "reasoning": reasoning,
+                        "metadata": metadata,
+                    },
                 )

-            perf_timing.measure("pre-tool-calls")
-
             # Check if any tools require approval first
             pending_approvals = []
             approval_required_tools = []
@@ -1006,6 +905,7 @@
                 futures = []
                 for tool_index, t in enumerate(tools_to_call, 1):  # type: ignore
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,  # type: ignore
@@ -1069,23 +969,11 @@
             # If we have approval required tools, end the stream with pending approvals
             if pending_approvals:
                 # Add assistant message with pending tool calls
-
-
-
-
-
-                            "id": result.tool_call_id,
-                            "type": "function",
-                            "function": {
-                                "name": result.tool_name,
-                                "arguments": json.dumps(result.result.params or {}),
-                            },
-                        }
-                        for result in approval_required_tools
-                    ],
-                    "pending_approval": True,
-                }
-                messages.append(assistant_msg)
+                for result in approval_required_tools:
+                    tool_call = self.find_assistant_tool_call_request(
+                        tool_call_id=result.tool_call_id, messages=messages
+                    )
+                    tool_call["pending_approval"] = True

                 # End stream with approvals required
                 yield StreamMessage(
@@ -1108,6 +996,21 @@
                 f"Too many LLM calls - exceeded max_steps: {i}/{self.max_steps}"
             )

+    def find_assistant_tool_call_request(
+        self, tool_call_id: str, messages: list[dict[str, Any]]
+    ) -> dict[str, Any]:
+        for message in messages:
+            if message.get("role") == "assistant":
+                for tool_call in message.get("tool_calls", []):
+                    if tool_call.get("id") == tool_call_id:
+                        return tool_call
+
+        # Should not happen unless there is a bug.
+        # If we are here
+        raise Exception(
+            f"Failed to find assistant request for a tool_call in conversation history. tool_call_id={tool_call_id}"
+        )
+

 # TODO: consider getting rid of this entirely and moving templating into the cmds in holmes_cli.py
 class IssueInvestigator(ToolCallingLLM):