holmesgpt 0.13.3a0__py3-none-any.whl → 0.14.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt might be problematic.

Files changed (82)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +10 -2
  3. holmes/common/env_vars.py +8 -1
  4. holmes/config.py +66 -139
  5. holmes/core/investigation.py +1 -2
  6. holmes/core/llm.py +256 -51
  7. holmes/core/models.py +2 -0
  8. holmes/core/safeguards.py +4 -4
  9. holmes/core/supabase_dal.py +14 -8
  10. holmes/core/tool_calling_llm.py +193 -176
  11. holmes/core/tools.py +260 -25
  12. holmes/core/tools_utils/data_types.py +81 -0
  13. holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
  14. holmes/core/tools_utils/tool_executor.py +2 -2
  15. holmes/core/toolset_manager.py +150 -3
  16. holmes/core/tracing.py +6 -1
  17. holmes/core/transformers/__init__.py +23 -0
  18. holmes/core/transformers/base.py +62 -0
  19. holmes/core/transformers/llm_summarize.py +174 -0
  20. holmes/core/transformers/registry.py +122 -0
  21. holmes/core/transformers/transformer.py +31 -0
  22. holmes/main.py +5 -0
  23. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  24. holmes/plugins/toolsets/aks.yaml +64 -0
  25. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
  26. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
  27. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
  28. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
  29. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  30. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
  31. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
  32. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
  33. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
  34. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
  35. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
  36. holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
  37. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  38. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  39. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +16 -17
  40. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +9 -10
  41. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +21 -22
  42. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +8 -8
  43. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +18 -19
  44. holmes/plugins/toolsets/git.py +22 -22
  45. holmes/plugins/toolsets/grafana/common.py +14 -2
  46. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +473 -0
  47. holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
  48. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +3 -3
  49. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  50. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +662 -290
  51. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  52. holmes/plugins/toolsets/internet/internet.py +3 -3
  53. holmes/plugins/toolsets/internet/notion.py +3 -3
  54. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  55. holmes/plugins/toolsets/kafka.py +18 -18
  56. holmes/plugins/toolsets/kubernetes.yaml +58 -0
  57. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  58. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  59. holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
  60. holmes/plugins/toolsets/newrelic.py +8 -8
  61. holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
  62. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  63. holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
  64. holmes/plugins/toolsets/prometheus/prometheus.py +172 -39
  65. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +25 -0
  66. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  67. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
  68. holmes/plugins/toolsets/robusta/robusta.py +10 -10
  69. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
  70. holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
  71. holmes/plugins/toolsets/utils.py +88 -0
  72. holmes/utils/config_utils.py +91 -0
  73. holmes/utils/env.py +7 -0
  74. holmes/utils/holmes_status.py +2 -1
  75. holmes/utils/sentry_helper.py +41 -0
  76. holmes/utils/stream.py +9 -0
  77. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/METADATA +10 -14
  78. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/RECORD +81 -71
  79. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  80. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/LICENSE.txt +0 -0
  81. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/WHEEL +0 -0
  82. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1a0.dist-info}/entry_points.txt +0 -0
holmes/core/tool_calling_llm.py
@@ -2,7 +2,7 @@ import concurrent.futures
 import json
 import logging
 import textwrap
-from typing import Dict, List, Optional, Type, Union, Callable
+from typing import Dict, List, Optional, Type, Union, Callable, Any


 import sentry_sdk
@@ -32,14 +32,23 @@ from holmes.core.performance_timing import PerformanceTiming
 from holmes.core.resource_instruction import ResourceInstructions
 from holmes.core.runbooks import RunbookManager
 from holmes.core.safeguards import prevent_overly_repeated_tool_call
-from holmes.core.tools import StructuredToolResult, ToolResultStatus
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
+from holmes.core.tools_utils.tool_context_window_limiter import (
+    prevent_overly_big_tool_response,
+)
 from holmes.plugins.prompts import load_and_render_prompt
+from holmes.utils import sentry_helper
 from holmes.utils.global_instructions import (
     Instructions,
     add_global_instructions_to_user_prompt,
 )
 from holmes.utils.tags import format_tags_in_string, parse_messages_tags
 from holmes.core.tools_utils.tool_executor import ToolExecutor
+from holmes.core.tools_utils.data_types import (
+    TruncationResult,
+    ToolCallResult,
+    TruncationMetadata,
+)
 from holmes.core.tracing import DummySpan
 from holmes.utils.colors import AI_COLOR
 from holmes.utils.stream import StreamEvents, StreamMessage
@@ -48,6 +57,9 @@ from holmes.utils.stream import StreamEvents, StreamMessage
 cost_logger = logging.getLogger("holmes.costs")


+TRUNCATION_NOTICE = "\n\n[TRUNCATED]"
+
+
 class LLMCosts(BaseModel):
     """Tracks cost and token usage for LLM calls."""

@@ -119,23 +131,6 @@ def _process_cost_info(
         logging.debug(f"Could not extract cost information: {e}")


-def format_tool_result_data(tool_result: StructuredToolResult) -> str:
-    tool_response = tool_result.data
-    if isinstance(tool_result.data, str):
-        tool_response = tool_result.data
-    else:
-        try:
-            if isinstance(tool_result.data, BaseModel):
-                tool_response = tool_result.data.model_dump_json(indent=2)
-            else:
-                tool_response = json.dumps(tool_result.data, indent=2)
-        except Exception:
-            tool_response = str(tool_result.data)
-    if tool_result.status == ToolResultStatus.ERROR:
-        tool_response = f"{tool_result.error or 'Tool execution failed'}:\n\n{tool_result.data or ''}".strip()
-    return tool_response
-
-
 # TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
 # However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
 # We should fix this in the future
@@ -143,7 +138,7 @@ def format_tool_result_data(tool_result: StructuredToolResult) -> str:
 # token truncation and not character truncation
 def truncate_messages_to_fit_context(
     messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
-) -> list:
+) -> TruncationResult:
     """
     Helper function to truncate tool messages to fit within context limits.

@@ -176,13 +171,17 @@ def truncate_messages_to_fit_context(
     )

     if len(tool_call_messages) == 0:
-        return messages
+        return TruncationResult(truncated_messages=messages, truncations=[])

     available_space = (
-        max_context_size - message_size_without_tools - maximum_output_token
+        max_context_size - message_size_without_tools - reserved_for_output_tokens
     )
     remaining_space = available_space
-    tool_call_messages.sort(key=lambda x: len(x["content"]))
+    tool_call_messages.sort(
+        key=lambda x: count_tokens_fn([{"role": "tool", "content": x["content"]}])
+    )
+
+    truncations = []

     # Allocate space starting with small tools and going to larger tools, while maintaining fairness
     # Small tools can often get exactly what they need, while larger tools may need to be truncated
@@ -190,75 +189,49 @@ def truncate_messages_to_fit_context(
     for i, msg in enumerate(tool_call_messages):
         remaining_tools = len(tool_call_messages) - i
         max_allocation = remaining_space // remaining_tools
-        needed_space = len(msg["content"])
+        needed_space = count_tokens_fn([{"role": "tool", "content": msg["content"]}])
         allocated_space = min(needed_space, max_allocation)

         if needed_space > allocated_space:
-            truncation_notice = "\n\n[TRUNCATED]"
-            # Ensure the indicator fits in the allocated space
-            if allocated_space > len(truncation_notice):
-                msg["content"] = (
-                    msg["content"][: allocated_space - len(truncation_notice)]
-                    + truncation_notice
-                )
-                logging.info(
-                    f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space-len(truncation_notice)} tokens"
-                )
-            else:
-                msg["content"] = truncation_notice[:allocated_space]
-                logging.info(
-                    f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space} tokens"
-                )
-            msg.pop("token_count", None)  # Remove token_count if present
+            truncation_metadata = _truncate_tool_message(
+                msg, allocated_space, needed_space
+            )
+            truncations.append(truncation_metadata)

         remaining_space -= allocated_space
-    return messages
-
-
-class ToolCallResult(BaseModel):
-    tool_call_id: str
-    tool_name: str
-    description: str
-    result: StructuredToolResult
-    size: Optional[int] = None
-
-    def as_tool_call_message(self):
-        content = format_tool_result_data(self.result)
-        if self.result.params:
-            content = (
-                f"Params used for the tool call: {json.dumps(self.result.params)}. The tool call output follows on the next line.\n"
-                + content
-            )
-        return {
-            "tool_call_id": self.tool_call_id,
-            "role": "tool",
-            "name": self.tool_name,
-            "content": content,
-        }
-
-    def as_tool_result_response(self):
-        result_dump = self.result.model_dump()
-        result_dump["data"] = self.result.get_stringified_data()
-
-        return {
-            "tool_call_id": self.tool_call_id,
-            "tool_name": self.tool_name,
-            "description": self.description,
-            "role": "tool",
-            "result": result_dump,
-        }
-
-    def as_streaming_tool_result_response(self):
-        result_dump = self.result.model_dump()
-        result_dump["data"] = self.result.get_stringified_data()
-
-        return {
-            "tool_call_id": self.tool_call_id,
-            "role": "tool",
-            "description": self.description,
-            "name": self.tool_name,
-            "result": result_dump,
-        }
+    return TruncationResult(truncated_messages=messages, truncations=truncations)
+
+
+def _truncate_tool_message(
+    msg: dict, allocated_space: int, needed_space: int
+) -> TruncationMetadata:
+    msg_content = msg["content"]
+    tool_call_id = msg["tool_call_id"]
+    tool_name = msg["name"]
+
+    # Ensure the indicator fits in the allocated space
+    if allocated_space > len(TRUNCATION_NOTICE):
+        original = msg_content if isinstance(msg_content, str) else str(msg_content)
+        msg["content"] = (
+            original[: allocated_space - len(TRUNCATION_NOTICE)] + TRUNCATION_NOTICE
+        )
+        end_index = allocated_space - len(TRUNCATION_NOTICE)
+    else:
+        msg["content"] = TRUNCATION_NOTICE[:allocated_space]
+        end_index = allocated_space
+
+    msg.pop("token_count", None)  # Remove token_count if present
+    logging.info(
+        f"Truncating tool message '{tool_name}' from {needed_space} to {allocated_space} tokens"
+    )
+    truncation_metadata = TruncationMetadata(
+        tool_call_id=tool_call_id,
+        start_index=0,
+        end_index=end_index,
+        tool_name=tool_name,
+        original_token_count=needed_space,
+    )
+    return truncation_metadata


 class LLMResult(LLMCosts):
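
The allocation loop above hands each remaining tool message an equal share of the leftover token budget, so small responses usually survive intact while the largest ones absorb the truncation. A minimal standalone sketch of the same fair-share rule (allocate_fairly is a hypothetical name; plain integers stand in for count_tokens_fn results):

def allocate_fairly(sizes, budget):
    """Grant each item min(its size, an equal share of the remaining budget)."""
    grants = []
    remaining = budget
    for i, needed in enumerate(sorted(sizes)):
        share = remaining // (len(sizes) - i)  # equal split of what is left
        granted = min(needed, share)
        grants.append(granted)
        remaining -= granted
    return grants

# A 1000-token budget over responses needing 100, 200 and 5000 tokens:
# the two small ones fit in full; the big one is cut to the leftover share.
print(allocate_fairly([5000, 100, 200], 1000))  # -> [100, 200, 700]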
@@ -269,6 +242,7 @@ class LLMResult(LLMCosts):
     # TODO: clean up these two
     prompt: Optional[str] = None
     messages: Optional[List[dict]] = None
+    metadata: Optional[Dict[Any, Any]] = None

     def get_tool_usage_summary(self):
         return "AI used info from issue and " + ",".join(
@@ -344,7 +318,7 @@ class ToolCallingLLM:
         perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
         i = 0
-
+        metadata: Dict[Any, Any] = {}
         while i < max_steps:
             i += 1
             perf_timing.measure(f"start iteration {i}")
@@ -360,9 +334,13 @@

             if (total_tokens + maximum_output_token) > max_context_size:
                 logging.warning("Token limit exceeded. Truncating tool responses.")
-                messages = self.truncate_messages_to_fit_context(
+                truncated_res = self.truncate_messages_to_fit_context(
                     messages, max_context_size, maximum_output_token
                 )
+                metadata["truncations"] = [
+                    t.model_dump() for t in truncated_res.truncations
+                ]
+                messages = truncated_res.truncated_messages
                 perf_timing.measure("truncate_messages_to_fit_context")

             logging.debug(f"sending messages={messages}\n\ntools={tools}")
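
TruncationResult and TruncationMetadata, used in the hunks above, come from the new holmes/core/tools_utils/data_types.py (entry 12 in the file list), which this excerpt does not show. A plausible reconstruction, inferred only from how the fields are used in this diff; the real definitions may differ:

from typing import List
from pydantic import BaseModel


class TruncationMetadata(BaseModel):
    tool_call_id: str
    tool_name: str
    start_index: int  # first kept character; always 0 in _truncate_tool_message
    end_index: int  # position where the content was cut
    original_token_count: int


class TruncationResult(BaseModel):
    truncated_messages: List[dict]
    truncations: List[TruncationMetadata]  # model_dump()-ed into result metadata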
@@ -408,6 +386,7 @@
                     "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                 )
                 # disable structured output going forward and and retry
+                sentry_helper.capture_structured_output_incorrect_tool_call()
                 response_format = None
                 max_steps = max_steps + 1
                 continue
@@ -451,6 +430,7 @@
                 prompt=json.dumps(messages, indent=2),
                 messages=messages,
                 **costs.model_dump(),  # Include all cost fields
+                metadata=metadata,
             )

             perf_timing.end(f"- completed in {i} iterations -")
@@ -460,6 +440,7 @@
                 prompt=json.dumps(messages, indent=2),
                 messages=messages,
                 **costs.model_dump(),  # Include all cost fields
+                metadata=metadata,
             )

         if text_response and text_response.strip():
@@ -495,9 +476,19 @@
                         if future in futures_tool_numbers
                         else None
                     )
-                    tool_call_result = self.handle_tool_call_approval(
-                        tool_call_result=tool_call_result, tool_number=tool_number
-                    )
+
+                    if (
+                        tool_call_result.result.status
+                        == StructuredToolResultStatus.APPROVAL_REQUIRED
+                    ):
+                        with trace_span.start_span(type="tool") as tool_span:
+                            tool_call_result = self._handle_tool_call_approval(
+                                tool_call_result=tool_call_result,
+                                tool_number=tool_number,
+                            )
+                            ToolCallingLLM._log_tool_call_result(
+                                tool_span, tool_call_result
+                            )

                     tool_calls.append(tool_call_result.as_tool_result_response())
                     messages.append(tool_call_result.as_tool_call_message())
@@ -513,91 +504,47 @@ class ToolCallingLLM:

         raise Exception(f"Too many LLM calls - exceeded max_steps: {i}/{max_steps}")

-    def _directly_invoke_tool(
+    def _directly_invoke_tool_call(
         self,
         tool_name: str,
         tool_params: dict,
         user_approved: bool,
-        trace_span=DummySpan(),
         tool_number: Optional[int] = None,
     ) -> StructuredToolResult:
-        tool_span = trace_span.start_span(name=tool_name, type="tool")
         tool = self.tool_executor.get_tool_by_name(tool_name)
-        tool_response = None
+        if not tool:
+            logging.warning(
+                f"Skipping tool execution for {tool_name}: args: {tool_params}"
+            )
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=f"Failed to find tool {tool_name}",
+                params=tool_params,
+            )
+
         try:
-            if (not tool) or (tool_params is None):
-                logging.warning(
-                    f"Skipping tool execution for {tool_name}: args: {tool_params}"
-                )
-                tool_response = StructuredToolResult(
-                    status=ToolResultStatus.ERROR,
-                    error=f"Failed to find tool {tool_name}",
-                    params=tool_params,
-                )
-            else:
-                tool_response = tool.invoke(
-                    tool_params, tool_number=tool_number, user_approved=user_approved
-                )
+            tool_response = tool.invoke(
+                tool_params, tool_number=tool_number, user_approved=user_approved
+            )
         except Exception as e:
             logging.error(
                 f"Tool call to {tool_name} failed with an Exception", exc_info=True
             )
             tool_response = StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Tool call failed: {e}",
                 params=tool_params,
             )
-
-            # Log error to trace span
-            tool_span.log(
-                input=tool_params, output=str(e), metadata={"status": "ERROR"}
-            )
-
-        tool_span.log(
-            input=tool_params,
-            output=tool_response.data,
-            metadata={
-                "status": tool_response.status.value,
-                "error": tool_response.error,
-                "description": tool.get_parameterized_one_liner(tool_params)
-                if tool
-                else "",
-                "structured_tool_result": tool_response,
-            },
-        )
-        tool_span.end()
-
         return tool_response

-    def _invoke_llm_tool_call(
+    def _get_tool_call_result(
         self,
-        tool_to_call: ChatCompletionMessageToolCall,
+        tool_call_id: str,
+        tool_name: str,
+        tool_arguments: str,
         previous_tool_calls: list[dict],
-        trace_span=DummySpan(),
-        tool_number=None,
+        tool_number: Optional[int] = None,
     ) -> ToolCallResult:
-        # Handle the union type - ChatCompletionMessageToolCall can be either
-        # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
-        # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
-        # We use hasattr to check for the 'function' attribute as it's more flexible
-        # and doesn't require importing the specific type.
-        if hasattr(tool_to_call, "function"):
-            tool_name = tool_to_call.function.name
-            tool_arguments = tool_to_call.function.arguments
-        else:
-            # This is a custom tool call - we don't support these currently
-            logging.error(f"Unsupported custom tool call: {tool_to_call}")
-            return ToolCallResult(
-                tool_call_id=tool_to_call.id,
-                tool_name="unknown",
-                description="NA",
-                result=StructuredToolResult(
-                    status=ToolResultStatus.ERROR,
-                    error="Custom tool calls are not supported",
-                    params=None,
-                ),
-            )
-
         tool_params = {}
         try:
             tool_params = json.loads(tool_arguments)
@@ -606,8 +553,6 @@
                 f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
             )

-        tool_call_id = tool_to_call.id
-
         tool_response = prevent_overly_repeated_tool_call(
             tool_name=tool_name,
             tool_params=tool_params,
@@ -615,11 +560,10 @@
         )

         if not tool_response:
-            tool_response = self._directly_invoke_tool(
+            tool_response = self._directly_invoke_tool_call(
                 tool_name=tool_name,
                 tool_params=tool_params,
                 user_approved=False,
-                trace_span=trace_span,
                 tool_number=tool_number,
             )

@@ -629,12 +573,13 @@
                 f"Tool {tool_name} return type is not StructuredToolResult. Nesting the tool result into StructuredToolResult..."
             )
             tool_response = StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=tool_response,
                 params=tool_params,
             )

         tool = self.tool_executor.get_tool_by_name(tool_name)
+
         return ToolCallResult(
             tool_call_id=tool_call_id,
             tool_name=tool_name,
@@ -642,25 +587,85 @@
             result=tool_response,
         )

-    def handle_tool_call_approval(
-        self, tool_call_result: ToolCallResult, tool_number: Optional[int]
+    @staticmethod
+    def _log_tool_call_result(tool_span, tool_call_result: ToolCallResult):
+        tool_span.set_attributes(name=tool_call_result.tool_name)
+        tool_span.log(
+            input=tool_call_result.result.params,
+            output=tool_call_result.result.data,
+            error=tool_call_result.result.error,
+            metadata={
+                "status": tool_call_result.result.status,
+                "description": tool_call_result.description,
+            },
+        )
+
+    def _invoke_llm_tool_call(
+        self,
+        tool_to_call: ChatCompletionMessageToolCall,
+        previous_tool_calls: list[dict],
+        trace_span=None,
+        tool_number=None,
+    ) -> ToolCallResult:
+        if trace_span is None:
+            trace_span = DummySpan()
+        with trace_span.start_span(type="tool") as tool_span:
+            if not hasattr(tool_to_call, "function"):
+                # Handle the union type - ChatCompletionMessageToolCall can be either
+                # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
+                # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
+                # We use hasattr to check for the 'function' attribute as it's more flexible
+                # and doesn't require importing the specific type.
+                tool_name = "Unknown_Custom_Tool"
+                logging.error(f"Unsupported custom tool call: {tool_to_call}")
+                tool_call_result = ToolCallResult(
+                    tool_call_id=tool_to_call.id,
+                    tool_name=tool_name,
+                    description="NA",
+                    result=StructuredToolResult(
+                        status=StructuredToolResultStatus.ERROR,
+                        error="Custom tool calls are not supported",
+                        params=None,
+                    ),
+                )
+            else:
+                tool_name = tool_to_call.function.name
+                tool_arguments = tool_to_call.function.arguments
+                tool_id = tool_to_call.id
+                tool_call_result = self._get_tool_call_result(
+                    tool_id,
+                    tool_name,
+                    tool_arguments,
+                    previous_tool_calls=previous_tool_calls,
+                    tool_number=tool_number,
+                )
+
+            prevent_overly_big_tool_response(
+                tool_call_result=tool_call_result, llm=self.llm
+            )
+
+            ToolCallingLLM._log_tool_call_result(tool_span, tool_call_result)
+            return tool_call_result
+
+    def _handle_tool_call_approval(
+        self,
+        tool_call_result: ToolCallResult,
+        tool_number: Optional[int],
     ) -> ToolCallResult:
         """
         Handle approval for a single tool call if required.

         Args:
             tool_call_result: A single tool call result that may require approval
+            tool_number: The tool call number

         Returns:
             Updated tool call result with approved/denied status
         """

-        if tool_call_result.result.status != ToolResultStatus.APPROVAL_REQUIRED:
-            return tool_call_result
-
         # If no approval callback, convert to ERROR because it is assumed the client may not be able to handle approvals
         if not self.approval_callback:
-            tool_call_result.result.status = ToolResultStatus.ERROR
+            tool_call_result.result.status = StructuredToolResultStatus.ERROR
             return tool_call_result

         # Get approval from user
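
The rewritten _invoke_llm_tool_call wraps the entire tool call in a span context manager (with trace_span.start_span(type="tool") as tool_span) rather than pairing start_span() with a manual tool_span.end(), so the span closes even when the tool raises. A no-op sketch of the span interface the new code relies on; the real DummySpan lives in holmes/core/tracing.py and is not shown in this diff:

class NoOpSpan:
    """Hypothetical stand-in illustrating the expected span interface."""

    def start_span(self, name=None, type=None):
        return NoOpSpan()  # child span, usable as a context manager

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        return False  # close the span but let exceptions propagate

    def set_attributes(self, **attrs):
        pass  # a real tracer would attach these (e.g. the tool name)

    def log(self, **fields):
        pass  # a real tracer would record input/output/error/metadata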
@@ -670,19 +675,17 @@
             logging.debug(
                 f"User approved command: {tool_call_result.result.invocation}"
             )
-
-            new_response = self._directly_invoke_tool(
+            new_response = self._directly_invoke_tool_call(
                 tool_name=tool_call_result.tool_name,
                 tool_params=tool_call_result.result.params or {},
                 user_approved=True,
-                trace_span=DummySpan(),
                 tool_number=tool_number,
             )
             tool_call_result.result = new_response
         else:
             # User denied - update to error
             feedback_text = f" User feedback: {feedback}" if feedback else ""
-            tool_call_result.result.status = ToolResultStatus.ERROR
+            tool_call_result.result.status = StructuredToolResultStatus.ERROR
             tool_call_result.result.error = (
                 f"User denied command execution.{feedback_text}"
             )
@@ -740,13 +743,16 @@
     @sentry_sdk.trace
     def truncate_messages_to_fit_context(
         self, messages: list, max_context_size: int, maximum_output_token: int
-    ) -> list:
-        return truncate_messages_to_fit_context(
+    ) -> TruncationResult:
+        truncated_res = truncate_messages_to_fit_context(
             messages,
             max_context_size,
             maximum_output_token,
             self.llm.count_tokens_for_message,
        )
+        if truncated_res.truncations:
+            sentry_helper.capture_tool_truncations(truncated_res.truncations)
+        return truncated_res

     def call_stream(
         self,
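
The truncate_messages_to_fit_context wrapper above now returns the full TruncationResult and reports any cuts through sentry_helper.capture_tool_truncations. A hedged usage sketch of the module-level helper it delegates to, with a toy whitespace counter standing in for self.llm.count_tokens_for_message:

from holmes.core.tool_calling_llm import truncate_messages_to_fit_context


def toy_count_tokens(messages: list) -> int:
    # crude stand-in: one "token" per whitespace-separated word
    return sum(len(str(m.get("content", "")).split()) for m in messages)


messages = [
    {"role": "user", "content": "why is the checkout pod crashlooping?"},
    {"role": "tool", "tool_call_id": "call_1", "name": "kubectl_logs",
     "content": "long log line " * 2000},
]
result = truncate_messages_to_fit_context(
    messages, max_context_size=1000, maximum_output_token=200,
    count_tokens_fn=toy_count_tokens,
)
for t in result.truncations:
    print(t.tool_name, t.original_token_count)  # which tools were cut, by how much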
@@ -774,6 +780,7 @@
         )
         perf_timing.measure("get_all_tools_openai_format")
         max_steps = self.max_steps
+        metadata: Dict[Any, Any] = {}
         i = 0
         tool_number_offset = 0

@@ -792,10 +799,16 @@

             if (total_tokens + maximum_output_token) > max_context_size:
                 logging.warning("Token limit exceeded. Truncating tool responses.")
-                messages = self.truncate_messages_to_fit_context(
+                truncated_res = self.truncate_messages_to_fit_context(
                     messages, max_context_size, maximum_output_token
                 )
+                metadata["truncations"] = [
+                    t.model_dump() for t in truncated_res.truncations
+                ]
+                messages = truncated_res.truncated_messages
                 perf_timing.measure("truncate_messages_to_fit_context")
+            else:
+                metadata["truncations"] = []

             logging.debug(f"sending messages={messages}\n\ntools={tools}")
             try:
@@ -837,6 +850,7 @@
                     "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                 )
                 # disable structured output going forward and and retry
+                sentry_helper.capture_structured_output_incorrect_tool_call()
                 response_format = None
                 max_steps = max_steps + 1
                 continue
@@ -851,7 +865,11 @@
             if not tools_to_call:
                 yield StreamMessage(
                     event=StreamEvents.ANSWER_END,
-                    data={"content": response_message.content, "messages": messages},
+                    data={
+                        "content": response_message.content,
+                        "messages": messages,
+                        "metadata": metadata,
+                    },
                 )
                 return

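With the metadata threading above, streaming clients learn exactly what was truncated: the final ANSWER_END event now carries the truncation report. An illustrative payload shape (all values made up):

{
    "content": "The pod is crashlooping because ...",
    "messages": ["..."],  # full conversation, including tool messages
    "metadata": {
        "truncations": [  # [] when nothing was cut
            {
                "tool_call_id": "call_1",
                "tool_name": "kubectl_logs",
                "start_index": 0,
                "end_index": 4990,
                "original_token_count": 48210,
            }
        ]
    },
}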
@@ -883,7 +901,6 @@

             for future in concurrent.futures.as_completed(futures):
                 tool_call_result: ToolCallResult = future.result()
-
                 tool_calls.append(tool_call_result.as_tool_result_response())
                 messages.append(tool_call_result.as_tool_call_message())