holmesgpt 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of holmesgpt might be problematic.
- holmes/__init__.py +1 -1
- holmes/common/env_vars.py +8 -0
- holmes/core/llm.py +28 -0
- holmes/core/supabase_dal.py +33 -42
- holmes/core/tool_calling_llm.py +92 -223
- holmes/core/tools_utils/tool_context_window_limiter.py +32 -39
- holmes/core/truncation/compaction.py +59 -0
- holmes/core/truncation/input_context_window_limiter.py +218 -0
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
- holmes/plugins/toolsets/investigator/core_investigation.py +20 -11
- holmes/plugins/toolsets/robusta/robusta.py +35 -8
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +4 -3
- holmes/utils/stream.py +1 -0
- {holmesgpt-0.15.0.dist-info → holmesgpt-0.16.0.dist-info}/METADATA +4 -2
- {holmesgpt-0.15.0.dist-info → holmesgpt-0.16.0.dist-info}/RECORD +18 -16
- holmes/core/performance_timing.py +0 -72
- {holmesgpt-0.15.0.dist-info → holmesgpt-0.16.0.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.15.0.dist-info → holmesgpt-0.16.0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.15.0.dist-info → holmesgpt-0.16.0.dist-info}/entry_points.txt +0 -0
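
The headline change in holmes/core/tool_calling_llm.py is that the old per-tool message truncation (truncate_messages_to_fit_context and its TRUNCATION_NOTICE marker) is replaced by a per-iteration input limiter, limit_input_context_window, backed by the new holmes/core/truncation modules and a conversation-history compaction prompt. The sketch below shows how the agent loop consumes the limiter's result, based on the call sites in the diffs that follow; the LimitResult class name, its defaults, and anything beyond the fields actually referenced in the diff (messages, metadata, events, conversation_history_compacted, max_context_size, maximum_output_token) are illustrative assumptions, not the real model defined in holmes/core/truncation/input_context_window_limiter.py.

# Illustrative sketch only: how the agent loop consumes the new input limiter.
# LimitResult is a hypothetical stand-in for the real result type.
from dataclasses import dataclass, field
from typing import Any


@dataclass
class LimitResult:
    messages: list[dict]
    metadata: dict[str, Any] = field(default_factory=dict)
    events: list[Any] = field(default_factory=list)  # streamed to clients in call_stream
    conversation_history_compacted: bool = False
    max_context_size: int = 0
    maximum_output_token: int = 0


def run_iteration(llm, messages, tools, tool_calls, metadata,
                  limit_input_context_window, reset_after_compaction: bool):
    """One agent-loop iteration, mirroring the pattern added in tool_calling_llm.py."""
    limit_result = limit_input_context_window(llm=llm, messages=messages, tools=tools)
    messages = limit_result.messages
    metadata = metadata | limit_result.metadata
    # After a compaction, the repeated-tool-call guard may be reset (gated by the
    # new RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION env var) so that calls
    # summarised away by compaction can legitimately run again.
    if limit_result.conversation_history_compacted and reset_after_compaction:
        tool_calls = []
    return messages, metadata, tool_calls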
holmes/core/tool_calling_llm.py
CHANGED
@@ -7,8 +7,6 @@ from typing import Dict, List, Optional, Type, Union, Callable, Any
 from holmes.core.models import (
     ToolApprovalDecision,
     ToolCallResult,
-    TruncationResult,
-    TruncationMetadata,
     PendingToolApproval,
 )

@@ -21,8 +19,8 @@ from pydantic import BaseModel, Field
 from rich.console import Console

 from holmes.common.env_vars import (
+    RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION,
     TEMPERATURE,
-    MAX_OUTPUT_TOKEN_RESERVATION,
     LOG_LLM_USAGE_RESPONSE,
 )

@@ -35,7 +33,6 @@ from holmes.core.investigation_structured_output import (
 )
 from holmes.core.issue import Issue
 from holmes.core.llm import LLM
-from holmes.core.performance_timing import PerformanceTiming
 from holmes.core.resource_instruction import ResourceInstructions
 from holmes.core.runbooks import RunbookManager
 from holmes.core.safeguards import prevent_overly_repeated_tool_call
@@ -45,9 +42,11 @@ from holmes.core.tools import (
     ToolInvokeContext,
 )
 from holmes.core.tools_utils.tool_context_window_limiter import (
-    get_max_token_count_for_single_tool,
     prevent_overly_big_tool_response,
 )
+from holmes.core.truncation.input_context_window_limiter import (
+    limit_input_context_window,
+)
 from holmes.plugins.prompts import load_and_render_prompt
 from holmes.utils import sentry_helper
 from holmes.utils.global_instructions import (
@@ -69,9 +68,6 @@ from holmes.utils.stream import (
 cost_logger = logging.getLogger("holmes.costs")


-TRUNCATION_NOTICE = "\n\n[TRUNCATED]"
-
-
 class LLMCosts(BaseModel):
     """Tracks cost and token usage for LLM calls."""

@@ -143,114 +139,6 @@ def _process_cost_info(
         logging.debug(f"Could not extract cost information: {e}")


-# TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
-# However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
-# We should fix this in the future
-# TODO: we truncate using character counts not token counts - this means we're overly agressive with truncation - improve it by considering
-# token truncation and not character truncation
-def truncate_messages_to_fit_context(
-    messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
-) -> TruncationResult:
-    """
-    Helper function to truncate tool messages to fit within context limits.
-
-    Args:
-        messages: List of message dictionaries with roles and content
-        max_context_size: Maximum context window size for the model
-        maximum_output_token: Maximum tokens reserved for model output
-        count_tokens_fn: Function to count tokens for a list of messages
-
-    Returns:
-        Modified list of messages with truncated tool responses
-
-    Raises:
-        Exception: If non-tool messages exceed available context space
-    """
-    messages_except_tools = [
-        message for message in messages if message["role"] != "tool"
-    ]
-    tokens = count_tokens_fn(messages_except_tools)
-    message_size_without_tools = tokens.total_tokens
-
-    tool_call_messages = [message for message in messages if message["role"] == "tool"]
-
-    reserved_for_output_tokens = min(maximum_output_token, MAX_OUTPUT_TOKEN_RESERVATION)
-    if message_size_without_tools >= (max_context_size - reserved_for_output_tokens):
-        logging.error(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the model's context window for input."
-        )
-        raise Exception(
-            f"The combined size of system_prompt and user_prompt ({message_size_without_tools} tokens) exceeds the maximum context size of {max_context_size - reserved_for_output_tokens} tokens available for input."
-        )
-
-    if len(tool_call_messages) == 0:
-        return TruncationResult(truncated_messages=messages, truncations=[])
-
-    available_space = (
-        max_context_size - message_size_without_tools - reserved_for_output_tokens
-    )
-    remaining_space = available_space
-    tool_call_messages.sort(
-        key=lambda x: count_tokens_fn(
-            [{"role": "tool", "content": x["content"]}]
-        ).total_tokens
-    )
-
-    truncations = []
-
-    # Allocate space starting with small tools and going to larger tools, while maintaining fairness
-    # Small tools can often get exactly what they need, while larger tools may need to be truncated
-    # We ensure fairness (no tool gets more than others that need it) and also maximize utilization (we don't leave space unused)
-    for i, msg in enumerate(tool_call_messages):
-        remaining_tools = len(tool_call_messages) - i
-        max_allocation = remaining_space // remaining_tools
-        needed_space = count_tokens_fn(
-            [{"role": "tool", "content": msg["content"]}]
-        ).total_tokens
-        allocated_space = min(needed_space, max_allocation)
-
-        if needed_space > allocated_space:
-            truncation_metadata = _truncate_tool_message(
-                msg, allocated_space, needed_space
-            )
-            truncations.append(truncation_metadata)
-
-        remaining_space -= allocated_space
-    return TruncationResult(truncated_messages=messages, truncations=truncations)
-
-
-def _truncate_tool_message(
-    msg: dict, allocated_space: int, needed_space: int
-) -> TruncationMetadata:
-    msg_content = msg["content"]
-    tool_call_id = msg["tool_call_id"]
-    tool_name = msg["name"]
-
-    # Ensure the indicator fits in the allocated space
-    if allocated_space > len(TRUNCATION_NOTICE):
-        original = msg_content if isinstance(msg_content, str) else str(msg_content)
-        msg["content"] = (
-            original[: allocated_space - len(TRUNCATION_NOTICE)] + TRUNCATION_NOTICE
-        )
-        end_index = allocated_space - len(TRUNCATION_NOTICE)
-    else:
-        msg["content"] = TRUNCATION_NOTICE[:allocated_space]
-        end_index = allocated_space
-
-    msg.pop("token_count", None)  # Remove token_count if present
-    logging.info(
-        f"Truncating tool message '{tool_name}' from {needed_space} to {allocated_space} tokens"
-    )
-    truncation_metadata = TruncationMetadata(
-        tool_call_id=tool_call_id,
-        start_index=0,
-        end_index=end_index,
-        tool_name=tool_name,
-        original_token_count=needed_space,
-    )
-    return truncation_metadata
-
-
 class LLMResult(LLMCosts):
     tool_calls: Optional[List[ToolCallResult]] = None
     result: Optional[str] = None
@@ -289,7 +177,7 @@ class ToolCallingLLM:

     def process_tool_decisions(
         self, messages: List[Dict[str, Any]], tool_decisions: List[ToolApprovalDecision]
-    ) -> List[Dict[str, Any]]:
+    ) -> tuple[List[Dict[str, Any]], list[StreamMessage]]:
         """
         Process tool approval decisions and execute approved tools.

@@ -300,8 +188,9 @@ class ToolCallingLLM:
         Returns:
             Updated messages list with tool execution results
         """
+        events: list[StreamMessage] = []
         if not tool_decisions:
-            return messages
+            return messages, events

         # Create decision lookup
         decisions_by_tool_call_id = {
@@ -332,40 +221,39 @@ class ToolCallingLLM:
             error_message = f"Received {len(tool_decisions)} tool decisions but no pending approvals found"
             logging.error(error_message)
             raise Exception(error_message)
-
         for tool_call_with_decision in pending_tool_calls:
             tool_call_message: dict
             tool_call = tool_call_with_decision.tool_call
             decision = tool_call_with_decision.decision
+            tool_result: Optional[ToolCallResult] = None
             if decision and decision.approved:
-
-
-
-
-
-
-
-                )
-                tool_call_message = llm_tool_result.as_tool_call_message()
-
-            except Exception as e:
-                logging.error(
-                    f"Failed to execute approved tool {tool_call.id}: {e}"
-                )
-                tool_call_message = {
-                    "tool_call_id": tool_call.id,
-                    "role": "tool",
-                    "name": tool_call.function.name,
-                    "content": f"Tool execution failed: {str(e)}",
-                }
+                tool_result = self._invoke_llm_tool_call(
+                    tool_to_call=tool_call,
+                    previous_tool_calls=[],
+                    trace_span=DummySpan(),  # TODO: replace with proper span
+                    tool_number=None,
+                    user_approved=True,
+                )
             else:
                 # Tool was rejected or no decision found, add rejection message
-
-
-
-
-
-
+                tool_result = ToolCallResult(
+                    tool_call_id=tool_call.id,
+                    tool_name=tool_call.function.name,
+                    description=tool_call.function.name,
+                    result=StructuredToolResult(
+                        status=StructuredToolResultStatus.ERROR,
+                        error="Tool execution was denied by the user.",
+                    ),
+                )
+
+            events.append(
+                StreamMessage(
+                    event=StreamEvents.TOOL_RESULT,
+                    data=tool_result.as_streaming_tool_result_response(),
+                )
+            )
+
+            tool_call_message = tool_result.as_tool_call_message()

             # It is expected that the tool call result directly follows the tool call request from the LLM
             # The API call may contain a user ask which is appended to the messages so we can't just append
@@ -374,7 +262,7 @@ class ToolCallingLLM:
                 tool_call_with_decision.message_index + 1, tool_call_message
             )

-        return messages
+        return messages, events

     def prompt_call(
         self,
@@ -420,40 +308,35 @@ class ToolCallingLLM:
         trace_span=DummySpan(),
         tool_number_offset: int = 0,
     ) -> LLMResult:
-
-
+        tool_calls: list[
+            dict
+        ] = []  # Used for preventing repeated tool calls. potentially reset after compaction
+        all_tool_calls = []  # type: ignore
         costs = LLMCosts()
-
         tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         i = 0
         metadata: Dict[Any, Any] = {}
         while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")
             # on the last step we don't allow tools - we want to force a reply, not a request to run another tool
             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None

-
-
-
-
-
-
-
-
-
-
-
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-            messages = truncated_res.truncated_messages
-            perf_timing.measure("truncate_messages_to_fit_context")
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata
+
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []

             logging.debug(f"sending messages={messages}\n\ntools={tools}")

@@ -471,7 +354,6 @@ class ToolCallingLLM:
                 # Extract and accumulate cost information
                 _process_cost_info(full_response, costs, "LLM call")

-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -515,8 +397,8 @@ class ToolCallingLLM:
                 hasattr(response_message, "reasoning_content")
                 and response_message.reasoning_content
             ):
-                logging.
-                    f"[
+                logging.info(
+                    f"[italic dim]AI reasoning:\n\n{response_message.reasoning_content}[/italic dim]\n"
                 )

             if not tools_to_call:
@@ -539,26 +421,24 @@ class ToolCallingLLM:
                     add_token_count_to_metadata(
                         tokens=tokens,
                         full_llm_response=full_response,
-                        max_context_size=max_context_size,
-                        maximum_output_token=maximum_output_token,
+                        max_context_size=limit_result.max_context_size,
+                        maximum_output_token=limit_result.maximum_output_token,
                         metadata=metadata,
                     )
-                    perf_timing.end(f"- completed in {i} iterations -")

                     return LLMResult(
                         result=post_processed_response,
                         unprocessed_result=raw_response,
-                        tool_calls=
+                        tool_calls=all_tool_calls,
                         prompt=json.dumps(messages, indent=2),
                         messages=messages,
                         **costs.model_dump(),  # Include all cost fields
                         metadata=metadata,
                     )

-                perf_timing.end(f"- completed in {i} iterations -")
                 return LLMResult(
                     result=text_response,
-                    tool_calls=
+                    tool_calls=all_tool_calls,
                     prompt=json.dumps(messages, indent=2),
                     messages=messages,
                     **costs.model_dump(),  # Include all cost fields
@@ -570,7 +450,6 @@ class ToolCallingLLM:
             logging.info(
                 f"The AI requested [bold]{len(tools_to_call) if tools_to_call else 0}[/bold] tool call(s)."
             )
-            perf_timing.measure("pre-tool-calls")
             with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
                 futures = []
                 futures_tool_numbers: dict[
@@ -580,6 +459,7 @@ class ToolCallingLLM:
                 for tool_index, t in enumerate(tools_to_call, 1):
                     logging.debug(f"Tool to call: {t}")
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,
@@ -612,10 +492,13 @@ class ToolCallingLLM:
                         tool_span, tool_call_result
                     )

-
+                    tool_result_response_dict = (
+                        tool_call_result.as_tool_result_response()
+                    )
+                    tool_calls.append(tool_result_response_dict)
+                    all_tool_calls.append(tool_result_response_dict)
                     messages.append(tool_call_result.as_tool_call_message())
-
-                    perf_timing.measure(f"tool completed {tool_call_result.tool_name}")
+                tokens = self.llm.count_tokens(messages=messages, tools=tools)

             # Update the tool number offset for the next iteration
             tool_number_offset += len(tools_to_call)
@@ -649,7 +532,7 @@ class ToolCallingLLM:
                 tool_number=tool_number,
                 user_approved=user_approved,
                 llm=self.llm,
-                max_token_count=
+                max_token_count=self.llm.get_max_token_count_for_single_tool(),
             )
             tool_response = tool.invoke(tool_params, context=invoke_context)
         except Exception as e:
@@ -680,11 +563,13 @@ class ToolCallingLLM:
                 f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
             )

-        tool_response =
-
-
-
-
+        tool_response = None
+        if not user_approved:
+            tool_response = prevent_overly_repeated_tool_call(
+                tool_name=tool_name,
+                tool_params=tool_params,
+                tool_calls=previous_tool_calls,
+            )

         if not tool_response:
             tool_response = self._directly_invoke_tool_call(
@@ -871,20 +756,6 @@ class ToolCallingLLM:
             logging.exception("Failed to run post processing", exc_info=True)
             return investigation, 0.0

-    @sentry_sdk.trace
-    def truncate_messages_to_fit_context(
-        self, messages: list, max_context_size: int, maximum_output_token: int
-    ) -> TruncationResult:
-        truncated_res = truncate_messages_to_fit_context(
-            messages,
-            max_context_size,
-            maximum_output_token,
-            self.llm.count_tokens,
-        )
-        if truncated_res.truncations:
-            sentry_helper.capture_tool_truncations(truncated_res.truncations)
-        return truncated_res
-
     def call_stream(
         self,
         system_prompt: str = "",
@@ -893,11 +764,19 @@ class ToolCallingLLM:
         sections: Optional[InputSectionsDataType] = None,
         msgs: Optional[list[dict]] = None,
         enable_tool_approval: bool = False,
+        tool_decisions: List[ToolApprovalDecision] | None = None,
     ):
         """
         This function DOES NOT call llm.completion(stream=true).
         This function streams holmes one iteration at a time instead of waiting for all iterations to complete.
         """
+
+        # Process tool decisions if provided
+        if msgs and tool_decisions:
+            logging.info(f"Processing {len(tool_decisions)} tool decisions")
+            msgs, events = self.process_tool_decisions(msgs, tool_decisions)
+            yield from events
+
         messages: list[dict] = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
@@ -905,12 +784,10 @@ class ToolCallingLLM:
             messages.append({"role": "user", "content": user_prompt})
         if msgs:
             messages.extend(msgs)
-        perf_timing = PerformanceTiming("tool_calling_llm.call")
         tool_calls: list[dict] = []
        tools = self.tool_executor.get_all_tools_openai_format(
             target_model=self.llm.model
         )
-        perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         metadata: Dict[Any, Any] = {}
         i = 0
@@ -918,29 +795,23 @@ class ToolCallingLLM:

         while i < max_steps:
             i += 1
-            perf_timing.measure(f"start iteration {i}")
             logging.debug(f"running iteration {i}")

             tools = None if i == max_steps else tools
             tool_choice = "auto" if tools else None

-
-
-
-
+            limit_result = limit_input_context_window(
+                llm=self.llm, messages=messages, tools=tools
+            )
+            yield from limit_result.events
+            messages = limit_result.messages
+            metadata = metadata | limit_result.metadata

-            if (
-
-
-
-
-                metadata["truncations"] = [
-                    t.model_dump() for t in truncated_res.truncations
-                ]
-                messages = truncated_res.truncated_messages
-                perf_timing.measure("truncate_messages_to_fit_context")
-            else:
-                metadata["truncations"] = []
+            if (
+                limit_result.conversation_history_compacted
+                and RESET_REPEATED_TOOL_CALL_CHECK_AFTER_COMPACTION
+            ):
+                tool_calls = []

             logging.debug(f"sending messages={messages}\n\ntools={tools}")
             try:
@@ -957,7 +828,6 @@ class ToolCallingLLM:
                 # Log cost information for this iteration (no accumulation in streaming)
                 _process_cost_info(full_response, log_prefix="LLM iteration")

-                perf_timing.measure("llm.completion")
             # catch a known error that occurs with Azure and replace the error message with something more obvious to the user
             except BadRequestError as e:
                 if "Unrecognized request arguments supplied: tool_choice, tools" in str(
@@ -997,8 +867,8 @@ class ToolCallingLLM:
                     add_token_count_to_metadata(
                         tokens=tokens,
                         full_llm_response=full_response,
-                        max_context_size=max_context_size,
-                        maximum_output_token=maximum_output_token,
+                        max_context_size=limit_result.max_context_size,
+                        maximum_output_token=limit_result.maximum_output_token,
                         metadata=metadata,
                     )
                     yield build_stream_event_token_count(metadata=metadata)
@@ -1027,8 +897,6 @@ class ToolCallingLLM:
                 },
             )

-            perf_timing.measure("pre-tool-calls")
-
             # Check if any tools require approval first
             pending_approvals = []
             approval_required_tools = []
@@ -1037,6 +905,7 @@ class ToolCallingLLM:
                 futures = []
                 for tool_index, t in enumerate(tools_to_call, 1):  # type: ignore
                     tool_number = tool_number_offset + tool_index
+
                     future = executor.submit(
                         self._invoke_llm_tool_call,
                         tool_to_call=t,  # type: ignore
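
call_stream now accepts tool approval decisions directly: when prior messages and decisions are passed in, process_tool_decisions executes approved tools (with user_approved=True), records denials as ERROR tool results, and the resulting TOOL_RESULT stream events are yielded before the next LLM iteration. A minimal caller sketch, assuming tool_calling_llm is a configured ToolCallingLLM and that saved_msgs / decisions were captured from an earlier streamed round that paused on approvals; resume_after_approvals is a hypothetical wrapper, while the call_stream parameters are the ones added or used in this diff.

# Minimal caller sketch for the new tool_decisions parameter.
def resume_after_approvals(tool_calling_llm, saved_msgs, decisions):
    for event in tool_calling_llm.call_stream(
        msgs=saved_msgs,            # prior conversation, including the pending tool calls
        tool_decisions=decisions,   # new in 0.16.0: approvals/denials are applied first
        enable_tool_approval=True,
    ):
        # TOOL_RESULT events for the decided tools are yielded before the next
        # LLM iteration; the remaining stream events follow as before.
        yield event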
holmes/core/tools_utils/tool_context_window_limiter.py
CHANGED

@@ -1,14 +1,16 @@
 from typing import Optional
-from
-    TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT,
-    TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_TOKENS,
-)
+from pydantic import BaseModel
 from holmes.core.llm import LLM
 from holmes.core.tools import StructuredToolResultStatus
 from holmes.core.models import ToolCallResult
 from holmes.utils import sentry_helper


+class ToolCallSizeMetadata(BaseModel):
+    messages_token: int
+    max_tokens_allowed: int
+
+
 def get_pct_token_count(percent_of_total_context_window: float, llm: LLM) -> int:
     context_window_size = llm.get_context_window_size()

@@ -18,47 +20,38 @@ def get_pct_token_count(percent_of_total_context_window: float, llm: LLM) -> int
         return context_window_size


-def
-
-
-
-
-
-)
-
-
-
-
-
-
-
-
-    tokens = llm.count_tokens(messages=[message])
-    messages_token = tokens.total_tokens
-
-    if messages_token > max_tokens_allowed:
-        relative_pct = ((messages_token - max_tokens_allowed) / messages_token) * 100
-
-        error_message: Optional[str] = (
-            f"The tool call result is too large to return: {messages_token} tokens.\nThe maximum allowed tokens is {max_tokens_allowed} which is {format(relative_pct, '.1f')}% smaller.\nInstructions for the LLM: try to repeat the query but proactively narrow down the result so that the tool answer fits within the allowed number of tokens."
+def is_tool_call_too_big(
+    tool_call_result: ToolCallResult, llm: LLM
+) -> tuple[bool, Optional[ToolCallSizeMetadata]]:
+    if tool_call_result.result.status == StructuredToolResultStatus.SUCCESS:
+        message = tool_call_result.as_tool_call_message()
+
+        tokens = llm.count_tokens(messages=[message])
+        max_tokens_allowed = llm.get_max_token_count_for_single_tool()
+        return (
+            tokens.total_tokens > max_tokens_allowed,
+            ToolCallSizeMetadata(
+                messages_token=tokens.total_tokens,
+                max_tokens_allowed=max_tokens_allowed,
+            ),
         )
+    return False, None

-    if tool_call_result.result.status == StructuredToolResultStatus.NO_DATA:
-        error_message = None
-        # tool_call_result.result.data is set to None below which is expected to fix the issue
-    elif tool_call_result.result.status == StructuredToolResultStatus.ERROR:
-        original_error = (
-            tool_call_result.result.error
-            or tool_call_result.result.data
-            or "Unknown error"
-        )
-        truncated_error = str(original_error)[:100]
-        error_message = f"The tool call returned an error it is too large to return\nThe following original error is truncated:\n{truncated_error}"

+def prevent_overly_big_tool_response(tool_call_result: ToolCallResult, llm: LLM):
+    tool_call_result_is_too_big, metadata = is_tool_call_too_big(
+        tool_call_result=tool_call_result, llm=llm
+    )
+    if tool_call_result_is_too_big and metadata:
+        relative_pct = (
+            (metadata.messages_token - metadata.max_tokens_allowed)
+            / metadata.messages_token
+        ) * 100
+        error_message = f"The tool call result is too large to return: {metadata.messages_token} tokens.\nThe maximum allowed tokens is {metadata.max_tokens_allowed} which is {format(relative_pct, '.1f')}% smaller.\nInstructions for the LLM: try to repeat the query but proactively narrow down the result so that the tool answer fits within the allowed number of tokens."
         tool_call_result.result.status = StructuredToolResultStatus.ERROR
         tool_call_result.result.data = None
         tool_call_result.result.error = error_message

         sentry_helper.capture_toolcall_contains_too_many_tokens(
-            tool_call_result, messages_token, max_tokens_allowed
+            tool_call_result, metadata.messages_token, metadata.max_tokens_allowed
         )