inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only.
- inspect_ai/_cli/eval.py +27 -0
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +23 -27
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/local_server.py +398 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +173 -159
- inspect_ai/_view/www/dist/assets/index.js +1417 -1142
- inspect_ai/_view/www/log-schema.json +379 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +93 -14
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +22 -12
- inspect_ai/agent/_as_tool.py +20 -6
- inspect_ai/agent/_handoff.py +12 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +16 -3
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +14 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +14 -25
- inspect_ai/log/_transcript.py +84 -36
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +72 -44
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +66 -88
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +247 -0
- inspect_ai/model/_providers/vllm.py +211 -400
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +5 -22
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +8 -5
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +16 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
inspect_ai/log/_tree.py
ADDED
@@ -0,0 +1,118 @@
+from dataclasses import dataclass, field
+from logging import getLogger
+from typing import Iterable, Sequence, TypeAlias, Union
+
+from ._transcript import Event, SpanBeginEvent, SpanEndEvent
+
+logger = getLogger(__name__)
+
+EventNode: TypeAlias = Union["SpanNode", Event]
+"""Node in an event tree."""
+
+EventTree: TypeAlias = list[EventNode]
+"""Tree of events (has individual events and event spans)."""
+
+
+@dataclass
+class SpanNode:
+    """Event tree node representing a span of events."""
+
+    id: str
+    """Span id."""
+
+    parent_id: str | None
+    """Parent span id."""
+
+    type: str | None
+    """Optional 'type' field for span."""
+
+    name: str
+    """Span name."""
+
+    begin: SpanBeginEvent
+    """Span begin event."""
+
+    end: SpanEndEvent | None = None
+    """Span end event (if any)."""
+
+    children: list[EventNode] = field(default_factory=list)
+    """Children in the span."""
+
+
+def event_tree(events: Sequence[Event]) -> EventTree:
+    """Build a tree representation of a sequence of events.
+
+    Organize events hierarchically into event spans.
+
+    Args:
+       events: Sequence of `Event`.
+
+    Returns:
+       Event tree.
+    """
+    # Convert one flat list of (possibly interleaved) events into *forest*
+    # (list of root-level items).
+
+    # Pre-create one node per span so we can attach events no matter when they
+    # arrive in the file. A single forward scan guarantees that the order of
+    # `children` inside every span reflects the order in which things appeared
+    # in the transcript.
+    nodes: dict[str, SpanNode] = {
+        ev.id: SpanNode(
+            id=ev.id, parent_id=ev.parent_id, type=ev.type, name=ev.name, begin=ev
+        )
+        for ev in events
+        if isinstance(ev, SpanBeginEvent)
+    }
+
+    roots: list[EventNode] = []
+
+    # Where should an event with `span_id` go?
+    def bucket(span_id: str | None) -> list[EventNode]:
+        if span_id and span_id in nodes:
+            return nodes[span_id].children
+        return roots  # root level
+
+    # Single pass in original order
+    for ev in events:
+        if isinstance(ev, SpanBeginEvent):  # span starts
+            bucket(ev.parent_id).append(nodes[ev.id])
+
+        elif isinstance(ev, SpanEndEvent):  # span ends
+            if n := nodes.get(ev.id):
+                n.end = ev
+            else:
+                logger.warning(f"Span end event (id: {ev.id}) with no span begin")
+
+        else:  # ordinary event
+            bucket(ev.span_id).append(ev)
+
+    return roots
+
+
+def event_sequence(tree: EventTree) -> Iterable[Event]:
+    """Flatten a span forest back into a properly ordered sequence.
+
+    Args:
+       tree: Event tree
+
+    Returns:
+       Sequence of events.
+    """
+    for item in tree:
+        if isinstance(item, SpanNode):
+            yield item.begin
+            yield from event_sequence(item.children)
+            if item.end:
+                yield item.end
+        else:
+            yield item
+
+
+def _print_event_tree(tree: EventTree, indent: str = "") -> None:
+    for item in tree:
+        if isinstance(item, SpanNode):
+            print(f"{indent}span ({item.type}): {item.name}")
+            _print_event_tree(item.children, f"{indent}  ")
+        else:
+            print(f"{indent}{item.event}")
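Note: a minimal usage sketch (not part of the diff) for the new tree helpers, assuming SpanBeginEvent and SpanEndEvent can be constructed with just the fields referenced above:

    # hypothetical round-trip through event_tree / event_sequence
    from inspect_ai.log._transcript import SpanBeginEvent, SpanEndEvent
    from inspect_ai.log._tree import event_sequence, event_tree

    events = [
        SpanBeginEvent(id="s1", parent_id=None, type="agent", name="main"),
        SpanBeginEvent(id="s2", parent_id="s1", type="tool", name="bash"),
        SpanEndEvent(id="s2"),
        SpanEndEvent(id="s1"),
    ]
    tree = event_tree(events)  # one root SpanNode ("main") containing "bash"
    assert list(event_sequence(tree)) == events  # flattening restores the order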
inspect_ai/log/_util.py
ADDED
@@ -0,0 +1,52 @@
+import textwrap
+from datetime import date, datetime, time
+from typing import Any
+
+from inspect_ai._util.content import (
+    ContentAudio,
+    ContentImage,
+    ContentReasoning,
+    ContentText,
+    ContentVideo,
+)
+from inspect_ai.model._chat_message import ChatMessage
+
+
+def text_input_only(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
+    # Clean the input of any images
+    if isinstance(inputs, list):
+        input: list[ChatMessage] = []
+        for message in inputs:
+            if not isinstance(message.content, str):
+                filtered_content: list[
+                    ContentText
+                    | ContentReasoning
+                    | ContentImage
+                    | ContentAudio
+                    | ContentVideo
+                ] = []
+                for content in message.content:
+                    if content.type == "text":
+                        filtered_content.append(content)
+                    else:
+                        filtered_content.append(
+                            ContentText(text=f"({content.type.capitalize()})")
+                        )
+                message.content = filtered_content
+                input.append(message)
+            else:
+                input.append(message)
+
+        return input
+    else:
+        return inputs
+
+
+def thin_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
+    thinned: dict[str, Any] = {}
+    for key, value in metadata.items():
+        if isinstance(value, int | float | bool | date | time | datetime):
+            thinned[key] = value
+        elif isinstance(value, str):
+            thinned[key] = textwrap.shorten(value, width=1024, placeholder="...")
+    return thinned
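Note: an illustrative call (not part of the diff) showing the intended behavior of thin_metadata, which keeps primitive values, shortens long strings to at most 1024 characters, and drops everything else:

    from inspect_ai.log._util import thin_metadata

    thinned = thin_metadata({"runs": 3, "notes": "lorem ipsum " * 500, "raw": [1, 2]})
    # thinned["runs"] == 3, thinned["notes"] ends with "...", and "raw" is dropped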
inspect_ai/model/__init__.py
CHANGED
@@ -28,7 +28,11 @@ from ._chat_message import (
     ChatMessageUser,
 )
 from ._conversation import ModelConversation
-from ._generate_config import GenerateConfig, GenerateConfigArgs, ResponseSchema
+from ._generate_config import (
+    GenerateConfig,
+    GenerateConfigArgs,
+    ResponseSchema,
+)
 from ._model import (
     Model,
     ModelAPI,
inspect_ai/model/_call_tools.py
CHANGED
@@ -60,6 +60,8 @@ from inspect_ai.tool._tool_info import parse_docstring
 from inspect_ai.tool._tool_params import ToolParams
 from inspect_ai.util import OutputLimitExceededError
 from inspect_ai.util._anyio import inner_exception
+from inspect_ai.util._limit import LimitExceededError, apply_limits
+from inspect_ai.util._span import span
 
 from ._chat_message import (
     ChatMessage,
@@ -108,26 +110,18 @@ async def execute_tools(
     """
     message = messages[-1]
     if isinstance(message, ChatMessageAssistant) and message.tool_calls:
-        from inspect_ai.log._transcript import (
-            ToolEvent,
-            Transcript,
-            init_transcript,
-            track_store_changes,
-            transcript,
-        )
+        from inspect_ai.log._transcript import ToolEvent, transcript
 
         tdefs = await tool_defs(tools)
 
         async def call_tool_task(
             call: ToolCall,
+            event: ToolEvent,
             conversation: list[ChatMessage],
             send_stream: MemoryObjectSendStream[
                 tuple[ExecuteToolsResult, ToolEvent, Exception | None]
             ],
         ) -> None:
-            # create a transcript for this call
-            init_transcript(Transcript(name=call.function))
-
             result: ToolResult = ""
             messages: list[ChatMessage] = []
             output: ModelOutput | None = None
@@ -135,15 +129,14 @@ async def execute_tools(
             tool_error: ToolCallError | None = None
             tool_exception: Exception | None = None
             try:
-                with track_store_changes():
-                    try:
-                        result, messages, output, agent = await call_tool(
-                            tdefs, message.text, call, conversation
-                        )
-                    # unwrap exception group
-                    except Exception as ex:
-                        inner_ex = inner_exception(ex)
-                        raise inner_ex.with_traceback(inner_ex.__traceback__)
+                try:
+                    result, messages, output, agent = await call_tool(
+                        tdefs, message.text, call, event, conversation
+                    )
+                # unwrap exception group
+                except Exception as ex:
+                    inner_ex = inner_exception(ex)
+                    raise inner_ex.with_traceback(inner_ex.__traceback__)
 
             except TimeoutError:
                 tool_error = ToolCallError(
@@ -171,10 +164,15 @@ async def execute_tools(
                 tool_error = ToolCallError("is_a_directory", err)
             except OutputLimitExceededError as ex:
                 tool_error = ToolCallError(
-                    "limit",
-                    f"The tool output limit of {ex.limit_str} was exceeded.",
+                    "limit",
+                    f"The tool exceeded its output limit of {ex.limit_str}.",
                 )
                 result = ex.truncated_output or ""
+            except LimitExceededError as ex:
+                tool_error = ToolCallError(
+                    "limit",
+                    f"The tool exceeded its {ex.type} limit of {ex.limit}.",
+                )
             except ToolParsingError as ex:
                 tool_error = ToolCallError("parsing", ex.message)
             except ToolApprovalError as ex:
@@ -221,7 +219,6 @@ async def execute_tools(
                 truncated=truncated,
                 view=call.view,
                 error=tool_error,
-                events=list(transcript().events),
                 agent=agent,
             )
 
@@ -264,7 +261,6 @@ async def execute_tools(
             internal=call.internal,
             pending=True,
         )
-        transcript()._event(event)
 
         # execute the tool call. if the operator cancels the
         # tool call then synthesize the appropriate message/event
@@ -274,7 +270,7 @@ async def execute_tools(
 
         result_exception = None
         async with anyio.create_task_group() as tg:
-            tg.start_soon(call_tool_task, call, messages, send_stream)
+            tg.start_soon(call_tool_task, call, event, messages, send_stream)
             event._set_cancel_fn(tg.cancel_scope.cancel)
             async with receive_stream:
                 (
@@ -300,7 +296,6 @@ async def execute_tools(
                         truncated=None,
                         view=call.view,
                         error=tool_message.error,
-                        events=[],
                     )
                     transcript().info(
                         f"Tool call '{call.function}' was cancelled by operator."
@@ -320,7 +315,6 @@ async def execute_tools(
                     result=result_event.result,
                     truncated=result_event.truncated,
                     error=result_event.error,
-                    events=result_event.events,
                     waiting_time=waiting_time_end - waiting_time_start,
                     agent=result_event.agent,
                     failed=True if result_exception else None,
@@ -341,18 +335,34 @@ async def execute_tools(
 
 
 async def call_tool(
-    tools: list[ToolDef], message: str, call: ToolCall, conversation: list[ChatMessage]
+    tools: list[ToolDef],
+    message: str,
+    call: ToolCall,
+    event: BaseModel,
+    conversation: list[ChatMessage],
 ) -> tuple[ToolResult, list[ChatMessage], ModelOutput | None, str | None]:
     from inspect_ai.agent._handoff import AgentTool
+    from inspect_ai.log._transcript import SampleLimitEvent, ToolEvent, transcript
+
+    # dodge circular import
+    assert isinstance(event, ToolEvent)
+
+    # this function is responsible for transcript events so that it can
+    # put them in the right enclosure (e.g. handoff/agent/tool). This
+    # means that if we throw early we need to do the enclosure when raising.
+    async def record_tool_parsing_error(error: str) -> Exception:
+        async with span(name=call.function, type="tool"):
+            transcript()._event(event)
+        return ToolParsingError(error)
 
     # if there was an error parsing the ToolCall, raise that
     if call.parse_error:
-        raise ToolParsingError(call.parse_error)
+        raise await record_tool_parsing_error(call.parse_error)
 
     # find the tool
     tool_def = next((tool for tool in tools if tool.name == call.function), None)
     if tool_def is None:
-        raise ToolParsingError(f"Tool {call.function} not found")
+        raise await record_tool_parsing_error(f"Tool {call.function} not found")
 
     # if we have a tool approver, apply it now
     from inspect_ai.approval._apply import apply_tool_approval
@@ -362,14 +372,11 @@ async def call_tool(
         )
         if not approved:
             if approval and approval.decision == "terminate":
-                from inspect_ai.solver._limit import SampleLimitExceededError
-
-                raise SampleLimitExceededError(
-                    "operator",
-                    value=1,
-                    limit=1,
-                    message="Tool call approver requested termination.",
+                message = "Tool call approver requested termination."
+                transcript()._event(
+                    SampleLimitEvent(type="operator", limit=1, message=message)
                 )
+                raise LimitExceededError("operator", value=1, limit=1, message=message)
             else:
                 raise ToolApprovalError(approval.explanation if approval else None)
         if approval and approval.modified:
@@ -378,7 +385,7 @@ async def call_tool(
         # validate the schema of the passed object
         validation_errors = validate_tool_input(call.arguments, tool_def.parameters)
         if validation_errors:
-            raise ToolParsingError(validation_errors)
+            raise await record_tool_parsing_error(validation_errors)
 
         # get arguments (with creation of dataclasses, pydantic objects, etc.)
         arguments = tool_params(call.arguments, tool_def.tool)
@@ -387,14 +394,18 @@ async def call_tool(
     with trace_action(
         logger, "Tool Call", format_function_call(tool_def.name, arguments, width=1000)
     ):
-        # agent tools get special handling
         if isinstance(tool_def.tool, AgentTool):
-            return await agent_handoff(tool_def, call, conversation)
+            async with span(tool_def.tool.name, type="handoff"):
+                async with span(name=call.function, type="tool"):
+                    transcript()._event(event)
+                return await agent_handoff(tool_def, call, conversation)
 
         # normal tool call
         else:
-            result: ToolResult = await tool_def.tool(**arguments)
-            return result, [], None, None
+            async with span(name=call.function, type="tool"):
+                transcript()._event(event)
+                result: ToolResult = await tool_def.tool(**arguments)
+                return result, [], None, None
 
 
 async def agent_handoff(
@@ -454,9 +465,15 @@ async def agent_handoff(
     arguments = tool_params(arguments, agent_tool.agent)
     del arguments["state"]
 
-    # run the agent
+    # run the agent with limits
+    limit_error: LimitExceededError | None = None
     agent_state = AgentState(messages=copy(agent_conversation))
-    agent_state = await agent_tool.agent(agent_state, **arguments)
+    try:
+        with apply_limits(agent_tool.limits):
+            async with span(name=agent_name, type="agent"):
+                agent_state = await agent_tool.agent(agent_state, **arguments)
+    except LimitExceededError as ex:
+        limit_error = ex
 
     # determine which messages are new and return only those (but exclude new
     # system messages as they are an internal matter for the handed off to agent.
@@ -474,9 +491,20 @@ async def agent_handoff(
     if agent_tool.output_filter is not None:
         agent_messages = await agent_tool.output_filter(agent_messages)
 
+    if limit_error is not None:
+        agent_messages.append(
+            ChatMessageUser(
+                content=(
+                    f"The {agent_name} exceeded its {limit_error.type} limit of "
+                    f"{limit_error.limit}."
+                )
+            )
+        )
     # if we end with an assistant message then add a user message
     # so that the calling agent carries on
-    if len(agent_messages) == 0 or isinstance(agent_messages[-1], ChatMessageAssistant):
+    elif len(agent_messages) == 0 or isinstance(
+        agent_messages[-1], ChatMessageAssistant
+    ):
         agent_messages.append(
             ChatMessageUser(content=f"The {agent_name} agent has completed its work.")
         )
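Note: a rough sketch (not part of the diff) of the limit pattern the handoff changes rely on; it assumes apply_limits, LimitExceededError, and message_limit are among the new inspect_ai.util exports (see inspect_ai/util/__init__.py and inspect_ai/util/_limit.py in the file list above):

    from inspect_ai.util import LimitExceededError, apply_limits, message_limit

    try:
        with apply_limits([message_limit(10)]):
            ...  # run the handed-off agent here
    except LimitExceededError as ex:
        print(f"agent exceeded its {ex.type} limit of {ex.limit}")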
inspect_ai/model/_generate_config.py
CHANGED
@@ -106,6 +106,9 @@ class GenerateConfigArgs(TypedDict, total=False):
     response_schema: ResponseSchema | None
     """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and Mistral only."""
 
+    extra_body: dict[str, Any] | None
+    """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
 
 class GenerateConfig(BaseModel):
     """Model generation options."""
@@ -138,28 +141,28 @@ class GenerateConfig(BaseModel):
     """Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token). vLLM only."""
 
     frequency_penalty: float | None = Field(default=None)
-    """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, and vLLM only."""
+    """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
 
     presence_penalty: float | None = Field(default=None)
-    """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, and vLLM only."""
+    """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
 
     logit_bias: dict[int, float] | None = Field(default=None)
-    """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, and Grok only."""
+    """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, and vLLM only."""
 
     seed: int | None = Field(default=None)
     """Random seed. OpenAI, Google, Mistral, Groq, HuggingFace, and vLLM only."""
 
     top_k: int | None = Field(default=None)
-    """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, and vLLM only."""
+    """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, vLLM, and SGLang only."""
 
     num_choices: int | None = Field(default=None)
-    """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, and vLLM only."""
+    """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, vLLM, and SGLang only."""
 
     logprobs: bool | None = Field(default=None)
-    """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""
+    """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, vLLM, and SGLang only."""
 
     top_logprobs: int | None = Field(default=None)
-    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, and vLLM only."""
+    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, vLLM, and SGLang only."""
 
     parallel_tool_calls: bool | None = Field(default=None)
     """Whether to enable parallel function calling during tool use (defaults to True). OpenAI and Groq only."""
@@ -190,7 +193,10 @@ class GenerateConfig(BaseModel):
     """Include reasoning in chat message history sent to generate."""
 
     response_schema: ResponseSchema | None = Field(default=None)
-    """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and Mistral only."""
+    """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, Mistral, vLLM, and SGLang only."""
+
+    extra_body: dict[str, Any] | None = Field(default=None)
+    """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
 
     # migrate reasoning_history as a bool
     @model_validator(mode="before")
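Note: a hypothetical use of the new extra_body option, passing a provider-specific request field through to an OpenAI-compatible server (the model name and field values here are illustrative only):

    from inspect_ai.model import GenerateConfig, get_model

    model = get_model(
        "vllm/Qwen/Qwen3-8B",
        config=GenerateConfig(
            extra_body={"chat_template_kwargs": {"enable_thinking": False}}
        ),
    )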