inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_display/textual/widgets/samples.py +3 -3
  3. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  4. inspect_ai/_eval/eval.py +19 -2
  5. inspect_ai/_eval/evalset.py +4 -1
  6. inspect_ai/_eval/run.py +41 -0
  7. inspect_ai/_eval/task/generate.py +38 -44
  8. inspect_ai/_eval/task/log.py +26 -28
  9. inspect_ai/_eval/task/run.py +23 -27
  10. inspect_ai/_util/answer.py +26 -0
  11. inspect_ai/_util/constants.py +0 -1
  12. inspect_ai/_util/local_server.py +398 -0
  13. inspect_ai/_util/working.py +10 -4
  14. inspect_ai/_view/www/dist/assets/index.css +173 -159
  15. inspect_ai/_view/www/dist/assets/index.js +1417 -1142
  16. inspect_ai/_view/www/log-schema.json +379 -3
  17. inspect_ai/_view/www/package.json +1 -1
  18. inspect_ai/_view/www/src/@types/log.d.ts +93 -14
  19. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  20. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  21. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  22. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  23. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  24. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  25. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  26. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  27. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  28. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  29. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  30. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  31. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  32. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  33. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  34. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  35. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  36. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  37. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  39. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  40. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  41. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  42. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  43. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  44. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  45. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  46. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  47. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  48. inspect_ai/_view/www/src/components/Card.css +0 -1
  49. inspect_ai/_view/www/src/constants.ts +2 -0
  50. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  51. inspect_ai/agent/_agent.py +3 -3
  52. inspect_ai/agent/_as_solver.py +22 -12
  53. inspect_ai/agent/_as_tool.py +20 -6
  54. inspect_ai/agent/_handoff.py +12 -1
  55. inspect_ai/agent/_react.py +4 -3
  56. inspect_ai/agent/_run.py +16 -3
  57. inspect_ai/agent/_types.py +9 -0
  58. inspect_ai/dataset/_dataset.py +6 -3
  59. inspect_ai/log/__init__.py +14 -0
  60. inspect_ai/log/_convert.py +4 -9
  61. inspect_ai/log/_file.py +56 -0
  62. inspect_ai/log/_log.py +99 -0
  63. inspect_ai/log/_recorders/__init__.py +2 -0
  64. inspect_ai/log/_recorders/buffer/database.py +12 -11
  65. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  66. inspect_ai/log/_recorders/buffer/types.py +2 -2
  67. inspect_ai/log/_recorders/eval.py +20 -65
  68. inspect_ai/log/_recorders/file.py +28 -6
  69. inspect_ai/log/_recorders/recorder.py +7 -0
  70. inspect_ai/log/_recorders/types.py +1 -23
  71. inspect_ai/log/_samples.py +14 -25
  72. inspect_ai/log/_transcript.py +84 -36
  73. inspect_ai/log/_tree.py +118 -0
  74. inspect_ai/log/_util.py +52 -0
  75. inspect_ai/model/__init__.py +5 -1
  76. inspect_ai/model/_call_tools.py +72 -44
  77. inspect_ai/model/_generate_config.py +14 -8
  78. inspect_ai/model/_model.py +66 -88
  79. inspect_ai/model/_model_output.py +25 -0
  80. inspect_ai/model/_openai.py +2 -0
  81. inspect_ai/model/_providers/anthropic.py +13 -23
  82. inspect_ai/model/_providers/hf.py +27 -1
  83. inspect_ai/model/_providers/openai_o1.py +8 -2
  84. inspect_ai/model/_providers/providers.py +18 -4
  85. inspect_ai/model/_providers/sglang.py +247 -0
  86. inspect_ai/model/_providers/vllm.py +211 -400
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/__init__.py +7 -2
  89. inspect_ai/solver/_basic_agent.py +3 -10
  90. inspect_ai/solver/_chain.py +1 -1
  91. inspect_ai/solver/_fork.py +1 -1
  92. inspect_ai/solver/_multiple_choice.py +5 -22
  93. inspect_ai/solver/_plan.py +2 -2
  94. inspect_ai/solver/_task_state.py +26 -88
  95. inspect_ai/solver/_transcript.py +6 -7
  96. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  97. inspect_ai/tool/_mcp/_mcp.py +8 -5
  98. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  99. inspect_ai/tool/_mcp/server.py +3 -1
  100. inspect_ai/tool/_tool_call.py +4 -1
  101. inspect_ai/tool/_tool_support_helpers.py +51 -12
  102. inspect_ai/tool/_tools/_bash_session.py +190 -68
  103. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  104. inspect_ai/tool/_tools/_execute.py +4 -1
  105. inspect_ai/tool/_tools/_text_editor.py +4 -3
  106. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  107. inspect_ai/util/__init__.py +16 -0
  108. inspect_ai/util/_anyio.py +11 -0
  109. inspect_ai/util/_collect.py +50 -0
  110. inspect_ai/util/_limit.py +393 -0
  111. inspect_ai/util/_limited_conversation.py +57 -0
  112. inspect_ai/util/_span.py +58 -0
  113. inspect_ai/util/_subtask.py +27 -42
  114. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  117. inspect_ai/_display/core/group.py +0 -79
  118. inspect_ai/solver/_limit.py +0 -39
  119. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  120. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  121. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  122. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  123. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  124. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  125. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  126. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  127. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  128. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  129. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  130. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  131. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  132. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  133. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  134. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  135. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  136. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  137. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  138. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  139. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  140. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  141. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  142. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  143. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  144. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  145. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  146. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  147. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  148. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  149. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0

inspect_ai/log/_tree.py
@@ -0,0 +1,118 @@
+ from dataclasses import dataclass, field
+ from logging import getLogger
+ from typing import Iterable, Sequence, TypeAlias
+
+ from ._transcript import Event, SpanBeginEvent, SpanEndEvent
+
+ logger = getLogger(__name__)
+
+ EventNode: TypeAlias = "SpanNode" | Event
+ """Node in an event tree."""
+
+ EventTree: TypeAlias = list[EventNode]
+ """Tree of events (has individual events and event spans)."""
+
+
+ @dataclass
+ class SpanNode:
+     """Event tree node representing a span of events."""
+
+     id: str
+     """Span id."""
+
+     parent_id: str | None
+     """Parent span id."""
+
+     type: str | None
+     """Optional 'type' field for span."""
+
+     name: str
+     """Span name."""
+
+     begin: SpanBeginEvent
+     """Span begin event."""
+
+     end: SpanEndEvent | None = None
+     """Span end event (if any)."""
+
+     children: list[EventNode] = field(default_factory=list)
+     """Children in the span."""
+
+
+ def event_tree(events: Sequence[Event]) -> EventTree:
+     """Build a tree representation of a sequence of events.
+
+     Organize events hierarchically into event spans.
+
+     Args:
+         events: Sequence of `Event`.
+
+     Returns:
+         Event tree.
+     """
+     # Convert one flat list of (possibly interleaved) events into a *forest*
+     # (list of root-level items).
+
+     # Pre-create one node per span so we can attach events no matter when they
+     # arrive in the file. A single forward scan guarantees that the order of
+     # `children` inside every span reflects the order in which things appeared
+     # in the transcript.
+     nodes: dict[str, SpanNode] = {
+         ev.id: SpanNode(
+             id=ev.id, parent_id=ev.parent_id, type=ev.type, name=ev.name, begin=ev
+         )
+         for ev in events
+         if isinstance(ev, SpanBeginEvent)
+     }
+
+     roots: list[EventNode] = []
+
+     # Where should an event with `span_id` go?
+     def bucket(span_id: str | None) -> list[EventNode]:
+         if span_id and span_id in nodes:
+             return nodes[span_id].children
+         return roots  # root level
+
+     # Single pass in original order
+     for ev in events:
+         if isinstance(ev, SpanBeginEvent):  # span starts
+             bucket(ev.parent_id).append(nodes[ev.id])
+
+         elif isinstance(ev, SpanEndEvent):  # span ends
+             if n := nodes.get(ev.id):
+                 n.end = ev
+             else:
+                 logger.warning(f"Span end event (id: {ev.id}) with no span begin")
+
+         else:  # ordinary event
+             bucket(ev.span_id).append(ev)
+
+     return roots
+
+
+ def event_sequence(tree: EventTree) -> Iterable[Event]:
+     """Flatten a span forest back into a properly ordered sequence.
+
+     Args:
+         tree: Event tree
+
+     Returns:
+         Sequence of events.
+     """
+     for item in tree:
+         if isinstance(item, SpanNode):
+             yield item.begin
+             yield from event_sequence(item.children)
+             if item.end:
+                 yield item.end
+         else:
+             yield item
+
+
+ def _print_event_tree(tree: EventTree, indent: str = "") -> None:
+     for item in tree:
+         if isinstance(item, SpanNode):
+             print(f"{indent}span ({item.type}): {item.name}")
+             _print_event_tree(item.children, f"{indent}  ")
+         else:
+             print(f"{indent}{item.event}")
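
The two helpers above form a round trip: event_tree nests events under spans using parent_id/span_id, and event_sequence flattens the forest back into transcript order with each span's begin/end events wrapping its children. A minimal sketch of how they might be used to inspect the transcript of a completed eval (the log path is hypothetical, and importing from the private inspect_ai.log._tree module rather than a public export is an assumption made for illustration):

from inspect_ai.log import read_eval_log
from inspect_ai.log._tree import SpanNode, event_sequence, event_tree

# read a completed eval log (path is illustrative)
log = read_eval_log("logs/example.eval")

if log.samples:
    events = log.samples[0].events

    # group the flat event list into nested spans
    tree = event_tree(events)
    for node in tree:
        label = node.name if isinstance(node, SpanNode) else node.event
        print(label)

    # flatten back into a single ordered sequence of events
    flat = list(event_sequence(tree))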

inspect_ai/log/_util.py
@@ -0,0 +1,52 @@
+ import textwrap
+ from datetime import date, datetime, time
+ from typing import Any
+
+ from inspect_ai._util.content import (
+     ContentAudio,
+     ContentImage,
+     ContentReasoning,
+     ContentText,
+     ContentVideo,
+ )
+ from inspect_ai.model._chat_message import ChatMessage
+
+
+ def text_input_only(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
+     # Clean the input of any images
+     if isinstance(inputs, list):
+         input: list[ChatMessage] = []
+         for message in inputs:
+             if not isinstance(message.content, str):
+                 filtered_content: list[
+                     ContentText
+                     | ContentReasoning
+                     | ContentImage
+                     | ContentAudio
+                     | ContentVideo
+                 ] = []
+                 for content in message.content:
+                     if content.type == "text":
+                         filtered_content.append(content)
+                     else:
+                         filtered_content.append(
+                             ContentText(text=f"({content.type.capitalize()})")
+                         )
+                 message.content = filtered_content
+                 input.append(message)
+             else:
+                 input.append(message)
+
+         return input
+     else:
+         return inputs
+
+
+ def thin_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
+     thinned: dict[str, Any] = {}
+     for key, value in metadata.items():
+         if isinstance(value, int | float | bool | date | time | datetime):
+             thinned[key] = value
+         elif isinstance(value, str):
+             thinned[key] = textwrap.shorten(value, width=1024, placeholder="...")
+     return thinned
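
These helpers exist to slim down what gets recorded: text_input_only replaces non-text content with a short placeholder such as "(Image)", and thin_metadata keeps only scalar values, truncating strings to at most 1024 characters. A small sketch of the metadata behaviour (importing from the private inspect_ai.log._util module is an assumption made for illustration):

from datetime import datetime

from inspect_ai.log._util import thin_metadata

metadata = {
    "difficulty": 3,                             # kept: int
    "created": datetime(2025, 1, 1),             # kept: datetime
    "notes": "a very long description " * 200,   # shortened to <= 1024 chars
    "tags": ["easy", "math"],                    # dropped: lists are not retained
}

thinned = thin_metadata(metadata)
assert "tags" not in thinned
assert len(thinned["notes"]) <= 1024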

inspect_ai/model/__init__.py
@@ -28,7 +28,11 @@ from ._chat_message import (
      ChatMessageUser,
  )
  from ._conversation import ModelConversation
- from ._generate_config import GenerateConfig, GenerateConfigArgs, ResponseSchema
+ from ._generate_config import (
+     GenerateConfig,
+     GenerateConfigArgs,
+     ResponseSchema,
+ )
  from ._model import (
      Model,
      ModelAPI,

inspect_ai/model/_call_tools.py
@@ -60,6 +60,8 @@ from inspect_ai.tool._tool_info import parse_docstring
  from inspect_ai.tool._tool_params import ToolParams
  from inspect_ai.util import OutputLimitExceededError
  from inspect_ai.util._anyio import inner_exception
+ from inspect_ai.util._limit import LimitExceededError, apply_limits
+ from inspect_ai.util._span import span

  from ._chat_message import (
      ChatMessage,
@@ -108,26 +110,18 @@ async def execute_tools(
      """
      message = messages[-1]
      if isinstance(message, ChatMessageAssistant) and message.tool_calls:
-         from inspect_ai.log._transcript import (
-             ToolEvent,
-             Transcript,
-             init_transcript,
-             track_store_changes,
-             transcript,
-         )
+         from inspect_ai.log._transcript import ToolEvent, transcript

          tdefs = await tool_defs(tools)

          async def call_tool_task(
              call: ToolCall,
+             event: ToolEvent,
              conversation: list[ChatMessage],
              send_stream: MemoryObjectSendStream[
                  tuple[ExecuteToolsResult, ToolEvent, Exception | None]
              ],
          ) -> None:
-             # create a transript for this call
-             init_transcript(Transcript(name=call.function))
-
              result: ToolResult = ""
              messages: list[ChatMessage] = []
              output: ModelOutput | None = None
@@ -135,15 +129,14 @@ async def execute_tools(
              tool_error: ToolCallError | None = None
              tool_exception: Exception | None = None
              try:
-                 with track_store_changes():
-                     try:
-                         result, messages, output, agent = await call_tool(
-                             tdefs, message.text, call, conversation
-                         )
-                     # unwrap exception group
-                     except Exception as ex:
-                         inner_ex = inner_exception(ex)
-                         raise inner_ex.with_traceback(inner_ex.__traceback__)
+                 try:
+                     result, messages, output, agent = await call_tool(
+                         tdefs, message.text, call, event, conversation
+                     )
+                 # unwrap exception group
+                 except Exception as ex:
+                     inner_ex = inner_exception(ex)
+                     raise inner_ex.with_traceback(inner_ex.__traceback__)

              except TimeoutError:
                  tool_error = ToolCallError(
@@ -171,10 +164,15 @@ async def execute_tools(
                  tool_error = ToolCallError("is_a_directory", err)
              except OutputLimitExceededError as ex:
                  tool_error = ToolCallError(
-                     "output_limit",
-                     f"The tool output limit of {ex.limit_str} was exceeded.",
+                     "limit",
+                     f"The tool exceeded its output limit of {ex.limit_str}.",
                  )
                  result = ex.truncated_output or ""
+             except LimitExceededError as ex:
+                 tool_error = ToolCallError(
+                     "limit",
+                     f"The tool exceeded its {ex.type} limit of {ex.limit}.",
+                 )
              except ToolParsingError as ex:
                  tool_error = ToolCallError("parsing", ex.message)
              except ToolApprovalError as ex:
@@ -221,7 +219,6 @@ async def execute_tools(
                  truncated=truncated,
                  view=call.view,
                  error=tool_error,
-                 events=list(transcript().events),
                  agent=agent,
              )

@@ -264,7 +261,6 @@ async def execute_tools(
                  internal=call.internal,
                  pending=True,
              )
-             transcript()._event(event)

              # execute the tool call. if the operator cancels the
              # tool call then synthesize the appropriate message/event
@@ -274,7 +270,7 @@ async def execute_tools(

              result_exception = None
              async with anyio.create_task_group() as tg:
-                 tg.start_soon(call_tool_task, call, messages, send_stream)
+                 tg.start_soon(call_tool_task, call, event, messages, send_stream)
                  event._set_cancel_fn(tg.cancel_scope.cancel)
                  async with receive_stream:
                      (
@@ -300,7 +296,6 @@ async def execute_tools(
                      truncated=None,
                      view=call.view,
                      error=tool_message.error,
-                     events=[],
                  )
                  transcript().info(
                      f"Tool call '{call.function}' was cancelled by operator."
@@ -320,7 +315,6 @@ async def execute_tools(
                  result=result_event.result,
                  truncated=result_event.truncated,
                  error=result_event.error,
-                 events=result_event.events,
                  waiting_time=waiting_time_end - waiting_time_start,
                  agent=result_event.agent,
                  failed=True if result_exception else None,
@@ -341,18 +335,34 @@ async def execute_tools(


  async def call_tool(
-     tools: list[ToolDef], message: str, call: ToolCall, conversation: list[ChatMessage]
+     tools: list[ToolDef],
+     message: str,
+     call: ToolCall,
+     event: BaseModel,
+     conversation: list[ChatMessage],
  ) -> tuple[ToolResult, list[ChatMessage], ModelOutput | None, str | None]:
      from inspect_ai.agent._handoff import AgentTool
+     from inspect_ai.log._transcript import SampleLimitEvent, ToolEvent, transcript
+
+     # dodge circular import
+     assert isinstance(event, ToolEvent)
+
+     # this function is responsible for transcript events so that it can
+     # put them in the right enclosure (e.g. handoff/agent/tool). This
+     # means that if we throw early we need to do the enclosure when raising.
+     async def record_tool_parsing_error(error: str) -> Exception:
+         async with span(name=call.function, type="tool"):
+             transcript()._event(event)
+         return ToolParsingError(error)

      # if there was an error parsing the ToolCall, raise that
      if call.parse_error:
-         raise ToolParsingError(call.parse_error)
+         raise await record_tool_parsing_error(call.parse_error)

      # find the tool
      tool_def = next((tool for tool in tools if tool.name == call.function), None)
      if tool_def is None:
-         raise ToolParsingError(f"Tool {call.function} not found")
+         raise await record_tool_parsing_error(f"Tool {call.function} not found")

      # if we have a tool approver, apply it now
      from inspect_ai.approval._apply import apply_tool_approval
@@ -362,14 +372,11 @@ async def call_tool(
      )
      if not approved:
          if approval and approval.decision == "terminate":
-             from inspect_ai.solver._limit import SampleLimitExceededError
-
-             raise SampleLimitExceededError(
-                 "operator",
-                 value=1,
-                 limit=1,
-                 message="Tool call approver requested termination.",
+             message = "Tool call approver requested termination."
+             transcript()._event(
+                 SampleLimitEvent(type="operator", limit=1, message=message)
              )
+             raise LimitExceededError("operator", value=1, limit=1, message=message)
          else:
              raise ToolApprovalError(approval.explanation if approval else None)
      if approval and approval.modified:
@@ -378,7 +385,7 @@ async def call_tool(
      # validate the schema of the passed object
      validation_errors = validate_tool_input(call.arguments, tool_def.parameters)
      if validation_errors:
-         raise ToolParsingError(validation_errors)
+         raise await record_tool_parsing_error(validation_errors)

      # get arguments (with creation of dataclasses, pydantic objects, etc.)
      arguments = tool_params(call.arguments, tool_def.tool)
@@ -387,14 +394,18 @@ async def call_tool(
      with trace_action(
          logger, "Tool Call", format_function_call(tool_def.name, arguments, width=1000)
      ):
-         # agent tools get special handling
          if isinstance(tool_def.tool, AgentTool):
-             return await agent_handoff(tool_def, call, conversation)
+             async with span(tool_def.tool.name, type="handoff"):
+                 async with span(name=call.function, type="tool"):
+                     transcript()._event(event)
+                     return await agent_handoff(tool_def, call, conversation)

          # normal tool call
          else:
-             result: ToolResult = await tool_def.tool(**arguments)
-             return result, [], None, None
+             async with span(name=call.function, type="tool"):
+                 transcript()._event(event)
+                 result: ToolResult = await tool_def.tool(**arguments)
+                 return result, [], None, None


  async def agent_handoff(
@@ -454,9 +465,15 @@ async def agent_handoff(
      arguments = tool_params(arguments, agent_tool.agent)
      del arguments["state"]

-     # make the call
+     # run the agent with limits
+     limit_error: LimitExceededError | None = None
      agent_state = AgentState(messages=copy(agent_conversation))
-     agent_state = await agent_tool.agent(agent_state, **arguments)
+     try:
+         with apply_limits(agent_tool.limits):
+             async with span(name=agent_name, type="agent"):
+                 agent_state = await agent_tool.agent(agent_state, **arguments)
+     except LimitExceededError as ex:
+         limit_error = ex

      # determine which messages are new and return only those (but exclude new
      # system messages as they an internal matter for the handed off to agent.
@@ -474,9 +491,20 @@ async def agent_handoff(
      if agent_tool.output_filter is not None:
          agent_messages = await agent_tool.output_filter(agent_messages)

+     if limit_error is not None:
+         agent_messages.append(
+             ChatMessageUser(
+                 content=(
+                     f"The {agent_name} exceeded its {limit_error.type} limit of "
+                     f"{limit_error.limit}."
+                 )
+             )
+         )
      # if we end with an assistant message then add a user message
      # so that the calling agent carries on
-     if len(agent_messages) == 0 or isinstance(agent_messages[-1], ChatMessageAssistant):
+     elif len(agent_messages) == 0 or isinstance(
+         agent_messages[-1], ChatMessageAssistant
+     ):
          agent_messages.append(
              ChatMessageUser(content=f"The {agent_name} agent has completed its work.")
          )
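
The upshot of these _call_tools.py changes: tool events are now recorded inside explicit spans (with agent handoffs additionally wrapped in "handoff" and "agent" spans), and a LimitExceededError raised while a handed-off agent runs is caught and reported back to the calling model as a message rather than aborting the sample. A rough sketch of the apply_limits/span pattern the diff introduces, with agent_name, limits, and run_agent as illustrative placeholders rather than inspect_ai APIs:

from inspect_ai.util._limit import LimitExceededError, apply_limits
from inspect_ai.util._span import span


async def run_with_limits(agent_name, limits, run_agent):
    # mirrors the agent_handoff() pattern above: run the agent inside an
    # "agent" span, enforce the configured limits, and convert a limit
    # violation into an explanatory message instead of an exception
    limit_error: LimitExceededError | None = None
    result = None
    try:
        with apply_limits(limits):
            async with span(name=agent_name, type="agent"):
                result = await run_agent()
    except LimitExceededError as ex:
        limit_error = ex

    if limit_error is not None:
        return f"The {agent_name} exceeded its {limit_error.type} limit of {limit_error.limit}."
    return result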

inspect_ai/model/_generate_config.py
@@ -106,6 +106,9 @@ class GenerateConfigArgs(TypedDict, total=False):
      response_schema: ResponseSchema | None
      """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and Mistral only."""

+     extra_body: dict[str, Any] | None
+     """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+

  class GenerateConfig(BaseModel):
      """Model generation options."""
@@ -138,28 +141,28 @@ class GenerateConfig(BaseModel):
      """Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token). vLLM only."""

      frequency_penalty: float | None = Field(default=None)
-     """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, and vLLM only."""
+     """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""

      presence_penalty: float | None = Field(default=None)
-     """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, and vLLM only."""
+     """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""

      logit_bias: dict[int, float] | None = Field(default=None)
-     """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, and Grok only."""
+     """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, Grok, and vLLM only."""

      seed: int | None = Field(default=None)
      """Random seed. OpenAI, Google, Mistral, Groq, HuggingFace, and vLLM only."""

      top_k: int | None = Field(default=None)
-     """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, and vLLM only."""
+     """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, vLLM, and SGLang only."""

      num_choices: int | None = Field(default=None)
-     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, and vLLM only."""
+     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, vLLM, and SGLang only."""

      logprobs: bool | None = Field(default=None)
-     """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""
+     """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, vLLM, and SGLang only."""

      top_logprobs: int | None = Field(default=None)
-     """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, and vLLM only."""
+     """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, vLLM, and SGLang only."""

      parallel_tool_calls: bool | None = Field(default=None)
      """Whether to enable parallel function calling during tool use (defaults to True). OpenAI and Groq only."""
@@ -190,7 +193,10 @@ class GenerateConfig(BaseModel):
      """Include reasoning in chat message history sent to generate."""

      response_schema: ResponseSchema | None = Field(default=None)
-     """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and Mistral only."""
+     """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, Mistral, vLLM, and SGLang only."""
+
+     extra_body: dict[str, Any] | None = Field(default=None)
+     """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""

      # migrate reasoning_history as a bool
      @model_validator(mode="before")
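
The new extra_body option is a pass-through: whatever dictionary is supplied is sent along with requests to OpenAI-compatible servers, which is how server-specific features can be reached without dedicated config fields. A hedged sketch of how it might be used (the vLLM guided_regex field and the model name are illustrative assumptions, not part of this diff):

from inspect_ai.model import GenerateConfig, get_model

config = GenerateConfig(
    temperature=0.0,
    # forwarded verbatim in the request body to the OpenAI-compatible server
    extra_body={"guided_regex": r"[A-D]"},
)

model = get_model("vllm/meta-llama/Llama-3.1-8B-Instruct", config=config)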