holmesgpt 0.13.3a0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt might be problematic.

Files changed (86)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +15 -4
  3. holmes/common/env_vars.py +8 -1
  4. holmes/config.py +66 -139
  5. holmes/core/investigation.py +1 -2
  6. holmes/core/llm.py +295 -52
  7. holmes/core/models.py +2 -0
  8. holmes/core/safeguards.py +4 -4
  9. holmes/core/supabase_dal.py +14 -8
  10. holmes/core/tool_calling_llm.py +202 -177
  11. holmes/core/tools.py +260 -25
  12. holmes/core/tools_utils/data_types.py +81 -0
  13. holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
  14. holmes/core/tools_utils/tool_executor.py +2 -2
  15. holmes/core/toolset_manager.py +150 -3
  16. holmes/core/tracing.py +6 -1
  17. holmes/core/transformers/__init__.py +23 -0
  18. holmes/core/transformers/base.py +62 -0
  19. holmes/core/transformers/llm_summarize.py +174 -0
  20. holmes/core/transformers/registry.py +122 -0
  21. holmes/core/transformers/transformer.py +31 -0
  22. holmes/main.py +5 -0
  23. holmes/plugins/prompts/_fetch_logs.jinja2 +10 -1
  24. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  25. holmes/plugins/toolsets/aks.yaml +64 -0
  26. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
  27. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
  28. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
  29. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
  30. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  31. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
  32. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
  33. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
  34. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
  35. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
  36. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
  37. holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
  38. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  39. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  40. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  41. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  42. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +345 -207
  43. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  44. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +96 -32
  45. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +10 -10
  46. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +21 -22
  47. holmes/plugins/toolsets/git.py +22 -22
  48. holmes/plugins/toolsets/grafana/common.py +14 -2
  49. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +473 -0
  50. holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
  51. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +5 -4
  52. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  53. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +662 -290
  54. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  55. holmes/plugins/toolsets/internet/internet.py +3 -3
  56. holmes/plugins/toolsets/internet/notion.py +3 -3
  57. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  58. holmes/plugins/toolsets/kafka.py +18 -18
  59. holmes/plugins/toolsets/kubernetes.yaml +58 -0
  60. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  61. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  62. holmes/plugins/toolsets/logging_utils/logging_api.py +1 -1
  63. holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
  64. holmes/plugins/toolsets/newrelic.py +8 -8
  65. holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
  66. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  67. holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
  68. holmes/plugins/toolsets/prometheus/prometheus.py +841 -351
  69. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +39 -2
  70. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  71. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
  72. holmes/plugins/toolsets/robusta/robusta.py +10 -10
  73. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
  74. holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
  75. holmes/plugins/toolsets/utils.py +88 -0
  76. holmes/utils/config_utils.py +91 -0
  77. holmes/utils/env.py +7 -0
  78. holmes/utils/holmes_status.py +2 -1
  79. holmes/utils/sentry_helper.py +41 -0
  80. holmes/utils/stream.py +9 -0
  81. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/METADATA +11 -15
  82. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/RECORD +85 -75
  83. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  84. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/LICENSE.txt +0 -0
  85. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/WHEEL +0 -0
  86. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/entry_points.txt +0 -0
Diff of holmes/core/tool_calling_llm.py (+202 -177):

@@ -2,7 +2,7 @@ import concurrent.futures
  import json
  import logging
  import textwrap
- from typing import Dict, List, Optional, Type, Union, Callable
+ from typing import Dict, List, Optional, Type, Union, Callable, Any


  import sentry_sdk
@@ -27,19 +27,28 @@ from holmes.core.investigation_structured_output import (
      is_response_an_incorrect_tool_call,
  )
  from holmes.core.issue import Issue
- from holmes.core.llm import LLM
+ from holmes.core.llm import LLM, get_llm_usage
  from holmes.core.performance_timing import PerformanceTiming
  from holmes.core.resource_instruction import ResourceInstructions
  from holmes.core.runbooks import RunbookManager
  from holmes.core.safeguards import prevent_overly_repeated_tool_call
- from holmes.core.tools import StructuredToolResult, ToolResultStatus
+ from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
+ from holmes.core.tools_utils.tool_context_window_limiter import (
+     prevent_overly_big_tool_response,
+ )
  from holmes.plugins.prompts import load_and_render_prompt
+ from holmes.utils import sentry_helper
  from holmes.utils.global_instructions import (
      Instructions,
      add_global_instructions_to_user_prompt,
  )
  from holmes.utils.tags import format_tags_in_string, parse_messages_tags
  from holmes.core.tools_utils.tool_executor import ToolExecutor
+ from holmes.core.tools_utils.data_types import (
+     TruncationResult,
+     ToolCallResult,
+     TruncationMetadata,
+ )
  from holmes.core.tracing import DummySpan
  from holmes.utils.colors import AI_COLOR
  from holmes.utils.stream import StreamEvents, StreamMessage
@@ -48,6 +57,9 @@ from holmes.utils.stream import StreamEvents, StreamMessage
  cost_logger = logging.getLogger("holmes.costs")


+ TRUNCATION_NOTICE = "\n\n[TRUNCATED]"
+
+
  class LLMCosts(BaseModel):
      """Tracks cost and token usage for LLM calls."""

@@ -119,23 +131,6 @@ def _process_cost_info(
          logging.debug(f"Could not extract cost information: {e}")


- def format_tool_result_data(tool_result: StructuredToolResult) -> str:
-     tool_response = tool_result.data
-     if isinstance(tool_result.data, str):
-         tool_response = tool_result.data
-     else:
-         try:
-             if isinstance(tool_result.data, BaseModel):
-                 tool_response = tool_result.data.model_dump_json(indent=2)
-             else:
-                 tool_response = json.dumps(tool_result.data, indent=2)
-         except Exception:
-             tool_response = str(tool_result.data)
-     if tool_result.status == ToolResultStatus.ERROR:
-         tool_response = f"{tool_result.error or 'Tool execution failed'}:\n\n{tool_result.data or ''}".strip()
-     return tool_response
-
-
  # TODO: I think there's a bug here because we don't account for the 'role' or json structure like '{...}' when counting tokens
  # However, in practice it works because we reserve enough space for the output tokens that the minor inconsistency does not matter
  # We should fix this in the future
@@ -143,7 +138,7 @@ def format_tool_result_data(tool_result: StructuredToolResult) -> str:
  # token truncation and not character truncation
  def truncate_messages_to_fit_context(
      messages: list, max_context_size: int, maximum_output_token: int, count_tokens_fn
- ) -> list:
+ ) -> TruncationResult:
      """
      Helper function to truncate tool messages to fit within context limits.

@@ -176,13 +171,17 @@ def truncate_messages_to_fit_context(
      )

      if len(tool_call_messages) == 0:
-         return messages
+         return TruncationResult(truncated_messages=messages, truncations=[])

      available_space = (
-         max_context_size - message_size_without_tools - maximum_output_token
+         max_context_size - message_size_without_tools - reserved_for_output_tokens
      )
      remaining_space = available_space
-     tool_call_messages.sort(key=lambda x: len(x["content"]))
+     tool_call_messages.sort(
+         key=lambda x: count_tokens_fn([{"role": "tool", "content": x["content"]}])
+     )
+
+     truncations = []

      # Allocate space starting with small tools and going to larger tools, while maintaining fairness
      # Small tools can often get exactly what they need, while larger tools may need to be truncated
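
Note: TruncationResult and TruncationMetadata come from the new holmes/core/tools_utils/data_types.py, which this diff does not show. A minimal sketch of what they plausibly look like, inferred only from the field names used at the call sites above and in _truncate_tool_message below (the real definitions may differ):

    # Hypothetical reconstruction, inferred from usage in this file only.
    from typing import List
    from pydantic import BaseModel

    class TruncationMetadata(BaseModel):
        tool_call_id: str
        tool_name: str
        start_index: int           # first character kept (always 0 in this diff)
        end_index: int             # last character kept before the notice
        original_token_count: int  # token size of the untruncated content

    class TruncationResult(BaseModel):
        truncated_messages: list   # the (possibly modified) message list
        truncations: List[TruncationMetadata]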
@@ -190,75 +189,49 @@ def truncate_messages_to_fit_context(
      for i, msg in enumerate(tool_call_messages):
          remaining_tools = len(tool_call_messages) - i
          max_allocation = remaining_space // remaining_tools
-         needed_space = len(msg["content"])
+         needed_space = count_tokens_fn([{"role": "tool", "content": msg["content"]}])
          allocated_space = min(needed_space, max_allocation)

          if needed_space > allocated_space:
-             truncation_notice = "\n\n[TRUNCATED]"
-             # Ensure the indicator fits in the allocated space
-             if allocated_space > len(truncation_notice):
-                 msg["content"] = (
-                     msg["content"][: allocated_space - len(truncation_notice)]
-                     + truncation_notice
-                 )
-                 logging.info(
-                     f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space-len(truncation_notice)} tokens"
-                 )
-             else:
-                 msg["content"] = truncation_notice[:allocated_space]
-                 logging.info(
-                     f"Truncating tool message '{msg['name']}' from {needed_space} to {allocated_space} tokens"
-                 )
-             msg.pop("token_count", None)  # Remove token_count if present
+             truncation_metadata = _truncate_tool_message(
+                 msg, allocated_space, needed_space
+             )
+             truncations.append(truncation_metadata)

          remaining_space -= allocated_space
-     return messages
-
-
- class ToolCallResult(BaseModel):
-     tool_call_id: str
-     tool_name: str
-     description: str
-     result: StructuredToolResult
-     size: Optional[int] = None
-
-     def as_tool_call_message(self):
-         content = format_tool_result_data(self.result)
-         if self.result.params:
-             content = (
-                 f"Params used for the tool call: {json.dumps(self.result.params)}. The tool call output follows on the next line.\n"
-                 + content
-             )
-         return {
-             "tool_call_id": self.tool_call_id,
-             "role": "tool",
-             "name": self.tool_name,
-             "content": content,
-         }
-
-     def as_tool_result_response(self):
-         result_dump = self.result.model_dump()
-         result_dump["data"] = self.result.get_stringified_data()
-
-         return {
-             "tool_call_id": self.tool_call_id,
-             "tool_name": self.tool_name,
-             "description": self.description,
-             "role": "tool",
-             "result": result_dump,
-         }
-
-     def as_streaming_tool_result_response(self):
-         result_dump = self.result.model_dump()
-         result_dump["data"] = self.result.get_stringified_data()
-
-         return {
-             "tool_call_id": self.tool_call_id,
-             "role": "tool",
-             "description": self.description,
-             "name": self.tool_name,
-             "result": result_dump,
-         }
+     return TruncationResult(truncated_messages=messages, truncations=truncations)
+
+
+ def _truncate_tool_message(
+     msg: dict, allocated_space: int, needed_space: int
+ ) -> TruncationMetadata:
+     msg_content = msg["content"]
+     tool_call_id = msg["tool_call_id"]
+     tool_name = msg["name"]
+
+     # Ensure the indicator fits in the allocated space
+     if allocated_space > len(TRUNCATION_NOTICE):
+         original = msg_content if isinstance(msg_content, str) else str(msg_content)
+         msg["content"] = (
+             original[: allocated_space - len(TRUNCATION_NOTICE)] + TRUNCATION_NOTICE
+         )
+         end_index = allocated_space - len(TRUNCATION_NOTICE)
+     else:
+         msg["content"] = TRUNCATION_NOTICE[:allocated_space]
+         end_index = allocated_space
+
+     msg.pop("token_count", None)  # Remove token_count if present
+     logging.info(
+         f"Truncating tool message '{tool_name}' from {needed_space} to {allocated_space} tokens"
+     )
+     truncation_metadata = TruncationMetadata(
+         tool_call_id=tool_call_id,
+         start_index=0,
+         end_index=end_index,
+         tool_name=tool_name,
+         original_token_count=needed_space,
+     )
+     return truncation_metadata


  class LLMResult(LLMCosts):
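
Note: for intuition about the fair-share allocation loop above, here is a small worked example with invented numbers: three tool messages of 100, 500, and 2000 tokens (already sorted ascending) and an available_space of 1200 tokens.

    # Illustrative only; numbers chosen for clarity, not taken from the package.
    # i=0: remaining_space=1200, remaining_tools=3 -> max_allocation=400
    #      needed=100  -> allocated=100, small tool fits untouched
    # i=1: remaining_space=1100, remaining_tools=2 -> max_allocation=550
    #      needed=500  -> allocated=500, still fits
    # i=2: remaining_space=600,  remaining_tools=1 -> max_allocation=600
    #      needed=2000 -> allocated=600, message is truncated and a
    #      TruncationMetadata entry is appended to `truncations`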
@@ -269,6 +242,7 @@ class LLMResult(LLMCosts):
      # TODO: clean up these two
      prompt: Optional[str] = None
      messages: Optional[List[dict]] = None
+     metadata: Optional[Dict[Any, Any]] = None

      def get_tool_usage_summary(self):
          return "AI used info from issue and " + ",".join(
@@ -344,7 +318,7 @@ class ToolCallingLLM:
          perf_timing.measure("get_all_tools_openai_format")
          max_steps = self.max_steps
          i = 0
-
+         metadata: Dict[Any, Any] = {}
          while i < max_steps:
              i += 1
              perf_timing.measure(f"start iteration {i}")
@@ -360,9 +334,13 @@

              if (total_tokens + maximum_output_token) > max_context_size:
                  logging.warning("Token limit exceeded. Truncating tool responses.")
-                 messages = self.truncate_messages_to_fit_context(
+                 truncated_res = self.truncate_messages_to_fit_context(
                      messages, max_context_size, maximum_output_token
                  )
+                 metadata["truncations"] = [
+                     t.model_dump() for t in truncated_res.truncations
+                 ]
+                 messages = truncated_res.truncated_messages
                  perf_timing.measure("truncate_messages_to_fit_context")

              logging.debug(f"sending messages={messages}\n\ntools={tools}")
@@ -408,6 +386,7 @@
                      "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                  )
                  # disable structured output going forward and and retry
+                 sentry_helper.capture_structured_output_incorrect_tool_call()
                  response_format = None
                  max_steps = max_steps + 1
                  continue
@@ -443,7 +422,11 @@
              )
              costs.total_cost += post_processing_cost

+         self.llm.count_tokens_for_message(messages)
          perf_timing.end(f"- completed in {i} iterations -")
+         metadata["usage"] = get_llm_usage(full_response)
+         metadata["max_tokens"] = max_context_size
+         metadata["max_output_tokens"] = maximum_output_token
          return LLMResult(
              result=post_processed_response,
              unprocessed_result=raw_response,
@@ -451,6 +434,7 @@
              prompt=json.dumps(messages, indent=2),
              messages=messages,
              **costs.model_dump(),  # Include all cost fields
+             metadata=metadata,
          )

          perf_timing.end(f"- completed in {i} iterations -")
@@ -460,6 +444,7 @@
              prompt=json.dumps(messages, indent=2),
              messages=messages,
              **costs.model_dump(),  # Include all cost fields
+             metadata=metadata,
          )

          if text_response and text_response.strip():
@@ -495,9 +480,19 @@
                          if future in futures_tool_numbers
                          else None
                      )
-                     tool_call_result = self.handle_tool_call_approval(
-                         tool_call_result=tool_call_result, tool_number=tool_number
-                     )
+
+                     if (
+                         tool_call_result.result.status
+                         == StructuredToolResultStatus.APPROVAL_REQUIRED
+                     ):
+                         with trace_span.start_span(type="tool") as tool_span:
+                             tool_call_result = self._handle_tool_call_approval(
+                                 tool_call_result=tool_call_result,
+                                 tool_number=tool_number,
+                             )
+                             ToolCallingLLM._log_tool_call_result(
+                                 tool_span, tool_call_result
+                             )

                      tool_calls.append(tool_call_result.as_tool_result_response())
                      messages.append(tool_call_result.as_tool_call_message())
@@ -513,91 +508,47 @@ class ToolCallingLLM:

          raise Exception(f"Too many LLM calls - exceeded max_steps: {i}/{max_steps}")

-     def _directly_invoke_tool(
+     def _directly_invoke_tool_call(
          self,
          tool_name: str,
          tool_params: dict,
          user_approved: bool,
-         trace_span=DummySpan(),
          tool_number: Optional[int] = None,
      ) -> StructuredToolResult:
-         tool_span = trace_span.start_span(name=tool_name, type="tool")
          tool = self.tool_executor.get_tool_by_name(tool_name)
-         tool_response = None
+         if not tool:
+             logging.warning(
+                 f"Skipping tool execution for {tool_name}: args: {tool_params}"
+             )
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.ERROR,
+                 error=f"Failed to find tool {tool_name}",
+                 params=tool_params,
+             )
+
          try:
-             if (not tool) or (tool_params is None):
-                 logging.warning(
-                     f"Skipping tool execution for {tool_name}: args: {tool_params}"
-                 )
-                 tool_response = StructuredToolResult(
-                     status=ToolResultStatus.ERROR,
-                     error=f"Failed to find tool {tool_name}",
-                     params=tool_params,
-                 )
-             else:
-                 tool_response = tool.invoke(
-                     tool_params, tool_number=tool_number, user_approved=user_approved
-                 )
+             tool_response = tool.invoke(
+                 tool_params, tool_number=tool_number, user_approved=user_approved
+             )
          except Exception as e:
              logging.error(
                  f"Tool call to {tool_name} failed with an Exception", exc_info=True
              )
              tool_response = StructuredToolResult(
-                 status=ToolResultStatus.ERROR,
+                 status=StructuredToolResultStatus.ERROR,
                  error=f"Tool call failed: {e}",
                  params=tool_params,
              )
-
-             # Log error to trace span
-             tool_span.log(
-                 input=tool_params, output=str(e), metadata={"status": "ERROR"}
-             )
-
-         tool_span.log(
-             input=tool_params,
-             output=tool_response.data,
-             metadata={
-                 "status": tool_response.status.value,
-                 "error": tool_response.error,
-                 "description": tool.get_parameterized_one_liner(tool_params)
-                 if tool
-                 else "",
-                 "structured_tool_result": tool_response,
-             },
-         )
-         tool_span.end()
-
          return tool_response

-     def _invoke_llm_tool_call(
+     def _get_tool_call_result(
          self,
-         tool_to_call: ChatCompletionMessageToolCall,
+         tool_call_id: str,
+         tool_name: str,
+         tool_arguments: str,
          previous_tool_calls: list[dict],
-         trace_span=DummySpan(),
-         tool_number=None,
+         tool_number: Optional[int] = None,
      ) -> ToolCallResult:
-         # Handle the union type - ChatCompletionMessageToolCall can be either
-         # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
-         # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
-         # We use hasattr to check for the 'function' attribute as it's more flexible
-         # and doesn't require importing the specific type.
-         if hasattr(tool_to_call, "function"):
-             tool_name = tool_to_call.function.name
-             tool_arguments = tool_to_call.function.arguments
-         else:
-             # This is a custom tool call - we don't support these currently
-             logging.error(f"Unsupported custom tool call: {tool_to_call}")
-             return ToolCallResult(
-                 tool_call_id=tool_to_call.id,
-                 tool_name="unknown",
-                 description="NA",
-                 result=StructuredToolResult(
-                     status=ToolResultStatus.ERROR,
-                     error="Custom tool calls are not supported",
-                     params=None,
-                 ),
-             )
-
          tool_params = {}
          try:
              tool_params = json.loads(tool_arguments)
@@ -606,8 +557,6 @@
                  f"Failed to parse arguments for tool: {tool_name}. args: {tool_arguments}"
              )

-         tool_call_id = tool_to_call.id
-
          tool_response = prevent_overly_repeated_tool_call(
              tool_name=tool_name,
              tool_params=tool_params,
@@ -615,11 +564,10 @@
          )

          if not tool_response:
-             tool_response = self._directly_invoke_tool(
+             tool_response = self._directly_invoke_tool_call(
                  tool_name=tool_name,
                  tool_params=tool_params,
                  user_approved=False,
-                 trace_span=trace_span,
                  tool_number=tool_number,
              )
@@ -629,12 +577,13 @@
                  f"Tool {tool_name} return type is not StructuredToolResult. Nesting the tool result into StructuredToolResult..."
              )
              tool_response = StructuredToolResult(
-                 status=ToolResultStatus.SUCCESS,
+                 status=StructuredToolResultStatus.SUCCESS,
                  data=tool_response,
                  params=tool_params,
              )

          tool = self.tool_executor.get_tool_by_name(tool_name)
+
          return ToolCallResult(
              tool_call_id=tool_call_id,
              tool_name=tool_name,
@@ -642,25 +591,85 @@
              result=tool_response,
          )

-     def handle_tool_call_approval(
-         self, tool_call_result: ToolCallResult, tool_number: Optional[int]
+     @staticmethod
+     def _log_tool_call_result(tool_span, tool_call_result: ToolCallResult):
+         tool_span.set_attributes(name=tool_call_result.tool_name)
+         tool_span.log(
+             input=tool_call_result.result.params,
+             output=tool_call_result.result.data,
+             error=tool_call_result.result.error,
+             metadata={
+                 "status": tool_call_result.result.status,
+                 "description": tool_call_result.description,
+             },
+         )
+
+     def _invoke_llm_tool_call(
+         self,
+         tool_to_call: ChatCompletionMessageToolCall,
+         previous_tool_calls: list[dict],
+         trace_span=None,
+         tool_number=None,
+     ) -> ToolCallResult:
+         if trace_span is None:
+             trace_span = DummySpan()
+         with trace_span.start_span(type="tool") as tool_span:
+             if not hasattr(tool_to_call, "function"):
+                 # Handle the union type - ChatCompletionMessageToolCall can be either
+                 # ChatCompletionMessageFunctionToolCall (with 'function' field and type='function')
+                 # or ChatCompletionMessageCustomToolCall (with 'custom' field and type='custom').
+                 # We use hasattr to check for the 'function' attribute as it's more flexible
+                 # and doesn't require importing the specific type.
+                 tool_name = "Unknown_Custom_Tool"
+                 logging.error(f"Unsupported custom tool call: {tool_to_call}")
+                 tool_call_result = ToolCallResult(
+                     tool_call_id=tool_to_call.id,
+                     tool_name=tool_name,
+                     description="NA",
+                     result=StructuredToolResult(
+                         status=StructuredToolResultStatus.ERROR,
+                         error="Custom tool calls are not supported",
+                         params=None,
+                     ),
+                 )
+             else:
+                 tool_name = tool_to_call.function.name
+                 tool_arguments = tool_to_call.function.arguments
+                 tool_id = tool_to_call.id
+                 tool_call_result = self._get_tool_call_result(
+                     tool_id,
+                     tool_name,
+                     tool_arguments,
+                     previous_tool_calls=previous_tool_calls,
+                     tool_number=tool_number,
+                 )
+
+             prevent_overly_big_tool_response(
+                 tool_call_result=tool_call_result, llm=self.llm
+             )
+
+             ToolCallingLLM._log_tool_call_result(tool_span, tool_call_result)
+             return tool_call_result
+
+     def _handle_tool_call_approval(
+         self,
+         tool_call_result: ToolCallResult,
+         tool_number: Optional[int],
      ) -> ToolCallResult:
          """
          Handle approval for a single tool call if required.

          Args:
              tool_call_result: A single tool call result that may require approval
+             tool_number: The tool call number

          Returns:
              Updated tool call result with approved/denied status
          """

-         if tool_call_result.result.status != ToolResultStatus.APPROVAL_REQUIRED:
-             return tool_call_result
-
          # If no approval callback, convert to ERROR because it is assumed the client may not be able to handle approvals
          if not self.approval_callback:
-             tool_call_result.result.status = ToolResultStatus.ERROR
+             tool_call_result.result.status = StructuredToolResultStatus.ERROR
              return tool_call_result

          # Get approval from user
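
Note: this refactor moves span handling from explicit start_span()/end() pairs to a with block, so the span is closed even when tool invocation raises. Any span object (including the DummySpan fallback) must therefore support the context-manager protocol; a minimal illustrative stand-in, not the package's actual tracing code:

    class NoOpSpan:
        """Illustrative no-op span compatible with the `with ... start_span(...)` usage above."""
        def start_span(self, type=None, name=None):
            return NoOpSpan()
        def set_attributes(self, **kwargs):
            pass
        def log(self, **kwargs):
            pass
        def __enter__(self):
            return self
        def __exit__(self, exc_type, exc, tb):
            return False  # never swallow exceptions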
@@ -670,19 +679,17 @@
              logging.debug(
                  f"User approved command: {tool_call_result.result.invocation}"
              )
-
-             new_response = self._directly_invoke_tool(
+             new_response = self._directly_invoke_tool_call(
                  tool_name=tool_call_result.tool_name,
                  tool_params=tool_call_result.result.params or {},
                  user_approved=True,
-                 trace_span=DummySpan(),
                  tool_number=tool_number,
              )
              tool_call_result.result = new_response
          else:
              # User denied - update to error
              feedback_text = f" User feedback: {feedback}" if feedback else ""
-             tool_call_result.result.status = ToolResultStatus.ERROR
+             tool_call_result.result.status = StructuredToolResultStatus.ERROR
              tool_call_result.result.error = (
                  f"User denied command execution.{feedback_text}"
              )
@@ -740,13 +747,16 @@
      @sentry_sdk.trace
      def truncate_messages_to_fit_context(
          self, messages: list, max_context_size: int, maximum_output_token: int
-     ) -> list:
-         return truncate_messages_to_fit_context(
+     ) -> TruncationResult:
+         truncated_res = truncate_messages_to_fit_context(
              messages,
              max_context_size,
              maximum_output_token,
              self.llm.count_tokens_for_message,
          )
+         if truncated_res.truncations:
+             sentry_helper.capture_tool_truncations(truncated_res.truncations)
+         return truncated_res

      def call_stream(
          self,
@@ -774,6 +784,7 @@
              )
              perf_timing.measure("get_all_tools_openai_format")
              max_steps = self.max_steps
+             metadata: Dict[Any, Any] = {}
              i = 0
              tool_number_offset = 0

@@ -792,10 +803,16 @@

              if (total_tokens + maximum_output_token) > max_context_size:
                  logging.warning("Token limit exceeded. Truncating tool responses.")
-                 messages = self.truncate_messages_to_fit_context(
+                 truncated_res = self.truncate_messages_to_fit_context(
                      messages, max_context_size, maximum_output_token
                  )
+                 metadata["truncations"] = [
+                     t.model_dump() for t in truncated_res.truncations
+                 ]
+                 messages = truncated_res.truncated_messages
                  perf_timing.measure("truncate_messages_to_fit_context")
+             else:
+                 metadata["truncations"] = []

              logging.debug(f"sending messages={messages}\n\ntools={tools}")
              try:
@@ -837,6 +854,7 @@
                      "Detected incorrect tool call. Structured output will be disabled. This can happen on models that do not support tool calling. For Azure AI, make sure the model name contains 'gpt-4o'. To disable this holmes behaviour, set REQUEST_STRUCTURED_OUTPUT_FROM_LLM to `false`."
                  )
                  # disable structured output going forward and and retry
+                 sentry_helper.capture_structured_output_incorrect_tool_call()
                  response_format = None
                  max_steps = max_steps + 1
                  continue
@@ -849,9 +867,17 @@

              tools_to_call = getattr(response_message, "tool_calls", None)
              if not tools_to_call:
+                 self.llm.count_tokens_for_message(messages)
+                 metadata["usage"] = get_llm_usage(full_response)
+                 metadata["max_tokens"] = max_context_size
+                 metadata["max_output_tokens"] = maximum_output_token
                  yield StreamMessage(
                      event=StreamEvents.ANSWER_END,
-                     data={"content": response_message.content, "messages": messages},
+                     data={
+                         "content": response_message.content,
+                         "messages": messages,
+                         "metadata": metadata,
+                     },
                  )
                  return
@@ -883,7 +909,6 @@

              for future in concurrent.futures.as_completed(futures):
                  tool_call_result: ToolCallResult = future.result()
-
                  tool_calls.append(tool_call_result.as_tool_result_response())
                  messages.append(tool_call_result.as_tool_call_message())
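
Note: with these changes, streaming clients receive the same metadata on the final ANSWER_END event. A minimal illustrative consumer; the instance and argument names are assumptions, only the StreamMessage/StreamEvents shapes are taken from the diff:

    # Illustrative only - call_stream's actual signature is not shown in this diff.
    for message in tool_calling_llm.call_stream(system_prompt, user_prompt):
        if message.event == StreamEvents.ANSWER_END:
            meta = message.data.get("metadata", {})
            print(meta.get("usage"), len(meta.get("truncations", [])))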