inspect-ai 0.3.82__py3-none-any.whl → 0.3.84__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_display/textual/app.py +14 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +9 -3
- inspect_ai/_display/textual/widgets/task_detail.py +3 -4
- inspect_ai/_display/textual/widgets/tasks.py +17 -1
- inspect_ai/_display/textual/widgets/vscode.py +48 -0
- inspect_ai/_eval/eval.py +36 -24
- inspect_ai/_eval/evalset.py +17 -18
- inspect_ai/_eval/loader.py +34 -11
- inspect_ai/_eval/run.py +8 -13
- inspect_ai/_eval/score.py +13 -3
- inspect_ai/_eval/task/generate.py +8 -9
- inspect_ai/_eval/task/log.py +2 -0
- inspect_ai/_eval/task/task.py +23 -9
- inspect_ai/_util/file.py +13 -0
- inspect_ai/_util/json.py +2 -1
- inspect_ai/_util/registry.py +1 -0
- inspect_ai/_util/vscode.py +37 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +304 -128
- inspect_ai/_view/www/dist/assets/index.js +47495 -27519
- inspect_ai/_view/www/log-schema.json +124 -31
- inspect_ai/_view/www/package.json +3 -0
- inspect_ai/_view/www/src/App.tsx +12 -0
- inspect_ai/_view/www/src/appearance/icons.ts +1 -0
- inspect_ai/_view/www/src/components/Card.tsx +6 -4
- inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
- inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
- inspect_ai/_view/www/src/components/LiveVirtualList.tsx +1 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +113 -23
- inspect_ai/_view/www/src/components/Modal.module.css +38 -0
- inspect_ai/_view/www/src/components/Modal.tsx +77 -0
- inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
- inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
- inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +7 -0
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +7 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +11 -34
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +6 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +2 -2
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +12 -0
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +2 -0
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -0
- inspect_ai/_view/www/src/samples/chat/messages.ts +3 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +1 -0
- inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +9 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
- inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
- inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -11
- inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +2 -1
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +7 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +25 -8
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +1 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +11 -22
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
- inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
- inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
- inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +25 -4
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +29 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +0 -1
- inspect_ai/_view/www/src/state/hooks.ts +5 -3
- inspect_ai/_view/www/src/state/logPolling.ts +5 -1
- inspect_ai/_view/www/src/state/logSlice.ts +10 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +4 -1
- inspect_ai/_view/www/src/state/sampleSlice.ts +13 -0
- inspect_ai/_view/www/src/types/log.d.ts +34 -26
- inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
- inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +18 -16
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +68 -71
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
- inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +1 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +18 -0
- inspect_ai/_view/www/yarn.lock +94 -1
- inspect_ai/agent/__init__.py +36 -0
- inspect_ai/agent/_agent.py +268 -0
- inspect_ai/agent/_as_solver.py +72 -0
- inspect_ai/agent/_as_tool.py +122 -0
- inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
- inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
- inspect_ai/agent/_filter.py +46 -0
- inspect_ai/agent/_handoff.py +93 -0
- inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
- inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
- inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
- inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
- inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
- inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
- inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
- inspect_ai/agent/_react.py +241 -0
- inspect_ai/agent/_run.py +36 -0
- inspect_ai/agent/_types.py +81 -0
- inspect_ai/log/_log.py +11 -2
- inspect_ai/log/_transcript.py +13 -9
- inspect_ai/model/__init__.py +7 -1
- inspect_ai/model/_call_tools.py +256 -52
- inspect_ai/model/_chat_message.py +7 -4
- inspect_ai/model/_conversation.py +13 -62
- inspect_ai/model/_display.py +85 -0
- inspect_ai/model/_model.py +113 -14
- inspect_ai/model/_model_output.py +14 -9
- inspect_ai/model/_openai.py +16 -4
- inspect_ai/model/_openai_computer_use.py +162 -0
- inspect_ai/model/_openai_responses.py +319 -165
- inspect_ai/model/_providers/anthropic.py +20 -21
- inspect_ai/model/_providers/azureai.py +24 -13
- inspect_ai/model/_providers/bedrock.py +1 -7
- inspect_ai/model/_providers/cloudflare.py +3 -3
- inspect_ai/model/_providers/goodfire.py +2 -6
- inspect_ai/model/_providers/google.py +11 -10
- inspect_ai/model/_providers/groq.py +6 -3
- inspect_ai/model/_providers/hf.py +7 -3
- inspect_ai/model/_providers/mistral.py +7 -10
- inspect_ai/model/_providers/openai.py +47 -17
- inspect_ai/model/_providers/openai_o1.py +11 -4
- inspect_ai/model/_providers/openai_responses.py +12 -14
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/together.py +12 -2
- inspect_ai/model/_providers/util/chatapi.py +7 -2
- inspect_ai/model/_providers/util/hf_handler.py +4 -2
- inspect_ai/model/_providers/util/llama31.py +4 -2
- inspect_ai/model/_providers/vertex.py +11 -9
- inspect_ai/model/_providers/vllm.py +4 -4
- inspect_ai/scorer/__init__.py +2 -0
- inspect_ai/scorer/_metrics/__init__.py +2 -0
- inspect_ai/scorer/_metrics/grouped.py +84 -0
- inspect_ai/scorer/_score.py +26 -6
- inspect_ai/solver/__init__.py +2 -2
- inspect_ai/solver/_basic_agent.py +22 -9
- inspect_ai/solver/_bridge.py +31 -0
- inspect_ai/solver/_chain.py +20 -12
- inspect_ai/solver/_fork.py +5 -1
- inspect_ai/solver/_human_agent.py +52 -0
- inspect_ai/solver/_prompt.py +3 -1
- inspect_ai/solver/_run.py +59 -0
- inspect_ai/solver/_solver.py +14 -4
- inspect_ai/solver/_task_state.py +5 -3
- inspect_ai/tool/_tool_call.py +15 -8
- inspect_ai/tool/_tool_def.py +17 -12
- inspect_ai/tool/_tool_support_helpers.py +2 -2
- inspect_ai/tool/_tool_with.py +14 -11
- inspect_ai/tool/_tools/_bash_session.py +11 -2
- inspect_ai/tool/_tools/_computer/_common.py +18 -2
- inspect_ai/tool/_tools/_computer/_computer.py +18 -2
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
- inspect_ai/tool/_tools/_think.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +100 -61
- inspect_ai/util/__init__.py +2 -0
- inspect_ai/util/_anyio.py +27 -0
- inspect_ai/util/_sandbox/__init__.py +2 -1
- inspect_ai/util/_sandbox/context.py +32 -7
- inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_sandbox/docker/docker.py +12 -1
- inspect_ai/util/_store_model.py +30 -7
- inspect_ai/util/_subprocess.py +13 -3
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/RECORD +179 -153
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -167
- /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
- /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.82.dist-info → inspect_ai-0.3.84.dist-info}/top_level.txt +0 -0
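The headline change in 0.3.84 is a new top-level inspect_ai.agent module (visible in the file list above: _react.py, _handoff.py, _as_tool.py, _as_solver.py, _run.py), with the human agent relocated from inspect_ai/solver/_human_agent to inspect_ai/agent/_human. As a rough orientation sketch only — the exact exports are assumptions inferred from the file names, not confirmed by the diff:

from inspect_ai.agent import as_solver, as_tool, handoff, react

critic = react(
    name="critic",
    description="Critiques a draft answer",
)

# the same agent can then be surfaced three ways:
solver = as_solver(critic)     # as a task solver
critic_tool = as_tool(critic)  # as an ordinary tool
transfer = handoff(critic)     # as a handoff tool (transfer_to_critic)

The per-file diffs below show the key additions in full.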
inspect_ai/agent/_human/state.py
RENAMED
@@ -1,4 +1,4 @@
-
+import time as python_time
 
 from pydantic import BaseModel, Field
 
@@ -25,7 +25,7 @@ class HumanAgentState(StoreModel):
         """Set current running state."""
         # if we are flipping to running mode then update started running
         if not self.running_state and running:
-            self.started_running =
+            self.started_running = python_time.time()
 
         # if we are exiting running mode then update accumulated time
         if self.running_state and not running:
@@ -37,7 +37,7 @@ class HumanAgentState(StoreModel):
     @property
     def time(self) -> float:
         """Total time spend on task."""
-        running_time =
+        running_time = python_time.time() - self.started_running if self.running else 0
         return self.accumulated_time + running_time
 
     scorings: list[IntermediateScoring] = Field(default_factory=list)
@@ -50,6 +50,6 @@ class HumanAgentState(StoreModel):
     """Session logs generated by `script` """
 
     # internal state variables used by running and time properties
-    running_state: bool = Field(default=
-    started_running: float = Field(default_factory=
+    running_state: bool = Field(default=False)
+    started_running: float = Field(default_factory=python_time.time)
     accumulated_time: float = Field(default=0.0)
inspect_ai/agent/_react.py
ADDED
@@ -0,0 +1,241 @@
+from logging import getLogger
+
+from inspect_ai._util._async import is_callable_coroutine
+from inspect_ai.model._call_tools import execute_tools
+from inspect_ai.model._chat_message import (
+    ChatMessage,
+    ChatMessageSystem,
+    ChatMessageUser,
+)
+from inspect_ai.model._model import Model, get_model
+from inspect_ai.scorer._score import score
+from inspect_ai.tool._tool import Tool, ToolResult, tool
+from inspect_ai.tool._tool_call import ToolCall
+from inspect_ai.tool._tool_info import parse_tool_info
+from inspect_ai.tool._tool_with import tool_with
+
+from ._agent import Agent, AgentState, agent, agent_with
+from ._handoff import has_handoff
+from ._types import (
+    AgentAttempts,
+    AgentContinue,
+    AgentPrompt,
+    AgentSubmit,
+)
+
+logger = getLogger(__name__)
+
+
+@agent
+def react(
+    *,
+    name: str | None = None,
+    description: str | None = None,
+    prompt: str | AgentPrompt | None = AgentPrompt(),
+    tools: list[Tool] | None = None,
+    model: str | Model | Agent | None = None,
+    attempts: int | AgentAttempts = 1,
+    submit: AgentSubmit = AgentSubmit(),
+    on_continue: str | AgentContinue | None = None,
+) -> Agent:
+    """Extensible ReAct agent based on the paper [ReAct: Synergizing Reasoning and Acting in Language Models](https://arxiv.org/abs/2210.03629).
+
+    Provide a `name` and `description` for the agent if you plan on using it
+    in a multi-agent system (this is so other agents can clearly identify
+    its name and purpose). These fields are not required when using `react()`
+    as a top-level solver.
+
+    The agent runs a tool use loop until the model submits an answer using the
+    `submit()` tool. Use `instructions` to tailor the agent's system message
+    (the default `instructions` provides a basic ReAct prompt).
+
+    Use the `attempts` option to enable additional submissions if the initial
+    submission(s) are incorrect (by default, no additional attempts are permitted).
+
+    By default, the model will be urged to continue if it fails to call
+    a tool. Customise this behavior using the `on_continue` option.
+
+    Args:
+        name: Agent name (required when using with `handoff()` or `as_tool()`)
+        description: Agent description (required when using with `handoff()` or `as_tool()`)
+        prompt: Prompt for agent. Includes agent-specific contextual `instructions`
+            as well as an optional `assistant_prompt` and `handoff_prompt` (for agents
+            that use handoffs; both are provided by default but can be removed or
+            customized). Pass `str` to specify the instructions and use the defaults
+            for handoff and prompt messages.
+        tools: Tools available for the agent.
+        model: Model to use for agent (defaults to currently evaluated model).
+        attempts: Configure agent to make multiple attempts.
+        submit: Configure submit tool used by agent.
+        on_continue: Message to play back to the model to urge it to continue.
+            Optionally, can also be an async function to call to determine whether
+            the loop should continue (executed on every turn) and what message
+            to play back.
+
+    Returns:
+        ReAct agent.
+    """
+    # resolve prompt / system message
+    prompt = AgentPrompt(prompt) if isinstance(prompt, str) else prompt
+    if prompt:
+        prompt_lines: list[str] = []
+        if prompt.instructions:
+            prompt_lines.append(prompt.instructions)
+        if prompt.handoff_prompt and has_handoff(tools):
+            prompt_lines.append(prompt.handoff_prompt)
+        if prompt.assistant_prompt:
+            prompt_lines.append(prompt.assistant_prompt)
+        prompt_content = "\n\n".join(prompt_lines).format(submit=submit.name)
+        system_message: ChatMessage | None = ChatMessageSystem(content=prompt_content)
+    else:
+        system_message = None
+
+    # resolve on_continue
+    if on_continue is None:
+        on_continue = "If you believe you have completed the task, please call the `submit()` tool with your answer."
+    if isinstance(on_continue, str):
+        no_tools_continue_message = on_continue
+
+        async def no_tools_continue(state: AgentState) -> bool | str:
+            if state.output is None or not state.output.message.tool_calls:
+                return no_tools_continue_message
+            else:
+                return True
+
+        on_continue = no_tools_continue
+
+    # validate that on_continue is async
+    if not is_callable_coroutine(on_continue):
+        raise ValueError("The on_continue function must be async.")
+
+    # resolve attempts
+    attempts = AgentAttempts(attempts) if isinstance(attempts, int) else attempts
+
+    # submission tool
+    @tool
+    def submit_tool() -> Tool:
+        async def execute(answer: str) -> ToolResult:
+            """Submit an answer for evaluation.
+
+            Args:
+                answer (str): Submitted answer
+            """
+            return answer
+
+        return execute
+
+    # helper to see if there is a submit tool call
+    def submitted_answer(tool_calls: list[ToolCall] | None) -> str | None:
+        for tool_call in tool_calls or []:
+            if tool_call.function == submit.name and tool_call.parse_error is None:
+                return str(tool_call.arguments["answer"])
+        return None
+
+    # resolve tools
+    tools = tools or []
+    tools.append(tool_with(submit_tool(), submit.name, submit.description))
+
+    async def execute(state: AgentState) -> AgentState:
+        # prepend system message if we have one
+        if system_message:
+            state.messages.insert(0, system_message)
+
+        # track attempts
+        attempt_count = 0
+
+        # main loop = will terminate after submit (subject to max_attempts)
+        # or if a message or token limit is hit
+        while True:
+            # generate output and append assistant message
+            state = await _agent_generate(model, state, tools)
+
+            # check for context window overflow
+            if state.output.stop_reason == "model_length":
+                from inspect_ai.log._transcript import transcript
+
+                transcript().info("Agent terminated: model context window exceeded")
+                break
+
+            # check for a submission
+            answer = submitted_answer(state.output.message.tool_calls)
+            if answer is not None:
+                # remove the tool call and set the output to the answer for scoring
+                state.output.message.tool_calls = None
+                state.output.completion = (
+                    f"{state.output.completion}\n\n{answer}".strip()
+                )
+
+                # exit if we are at max_attempts
+                attempt_count += 1
+                if attempt_count >= attempts.attempts:
+                    break
+
+                # exit if the submission is successful
+                answer_scores = await score(state)
+                if attempts.score_value(answer_scores[0].value) == 1.0:
+                    break
+
+                # otherwise notify the model that it was incorrect and continue
+                else:
+                    if callable(attempts.incorrect_message):
+                        if not is_callable_coroutine(attempts.incorrect_message):
+                            raise ValueError(
+                                "The incorrect_message function must be async."
+                            )
+                        response_message: str = await attempts.incorrect_message(
+                            state, answer_scores
+                        )
+                    else:
+                        response_message = attempts.incorrect_message
+
+                    state.messages.append(ChatMessageUser(content=response_message))
+
+            # no submitted answer, call tools and evaluate whether we should continue
+            else:
+                if state.output.message.tool_calls:
+                    # call tool functions
+                    messages, output = await execute_tools(state.messages, tools)
+                    state.messages.extend(messages)
+                    if output:
+                        state.output = output
+
+                # check if we should continue....
+                do_continue = await on_continue(state)
+                if isinstance(do_continue, str):
+                    state.messages.append(ChatMessageUser(content=do_continue))
+                elif do_continue is False:
+                    break
+
+        return state
+
+    if name is not None or description is not None:
+        return agent_with(execute, name=name, description=description)
+    else:
+        return execute
+
+
+async def _agent_generate(
+    model: str | Model | Agent | None, state: AgentState, tools: list[Tool]
+) -> AgentState:
+    # convert model to agent
+    if isinstance(model, str | Model) or model is None:
+        model = _model_generate(model)
+
+    # confirm we have a tools param
+    agent_tool_info = parse_tool_info(model)
+    if "tools" not in agent_tool_info.parameters.properties:
+        raise ValueError(
+            "Agent passed as model for react agent must have a tools parameter."
+        )
+
+    # call the agent
+    return await model(state, tools)
+
+
+def _model_generate(model: str | Model | None) -> Agent:
+    async def generate(state: AgentState, tools: list[Tool]) -> AgentState:
+        state.output = await get_model(model).generate(state.messages, tools)
+        state.messages.append(state.output.message)
+        return state
+
+    return generate
inspect_ai/agent/_run.py
ADDED
@@ -0,0 +1,36 @@
+from copy import copy
+from typing import Any
+
+from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+
+from ._agent import Agent, AgentState
+
+
+async def run(
+    agent: Agent, input: str | list[ChatMessage] | AgentState, **agent_kwargs: Any
+) -> AgentState:
+    """Run an agent.
+
+    The input message(s) will be copied prior to running so they are
+    not modified in place.
+
+    Args:
+        agent: Agent to run.
+        input: Agent input (string, list of messages, or an `AgentState`).
+        **agent_kwargs: Additional arguments to pass to agent.
+
+    Returns:
+        AgentState: Messages and generated output.
+    """
+    # copy input so we don't mutate it in place
+    input = copy(input)
+
+    # resolve str
+    if isinstance(input, str):
+        input = [ChatMessageUser(content=input)]
+
+    # create state
+    state = AgentState(messages=input) if isinstance(input, list) else input
+
+    # run the agent
+    return await agent(state, **agent_kwargs)
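A sketch of driving a sub-agent with run() (the agent and question are illustrative):

from inspect_ai.agent import react, run


async def ask_researcher(question: str) -> str:
    researcher = react(
        name="researcher",
        description="Answers questions with brief research summaries",
    )
    # run() copies the input, so the caller's own message history
    # is never mutated
    state = await run(researcher, question)
    return state.output.completion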
inspect_ai/agent/_types.py
ADDED
@@ -0,0 +1,81 @@
+from typing import Awaitable, Callable, NamedTuple, TypeAlias
+
+from inspect_ai.agent._agent import AgentState
+from inspect_ai.scorer._metric import Score, ValueToFloat, value_to_float
+
+DEFAULT_HANDOFF_PROMPT = """
+You are part of a multi-agent system designed to make agent coordination and
+execution easy. Agents uses two primary abstraction: **Agents** and **Handoffs**.
+An agent encompasses instructions and tools and can hand off a conversation to
+another agent when appropriate. Handoffs are achieved by calling a handoff function,
+generally named `transfer_to_<agent_name>`. Transfers between agents are handled
+seamlessly in the background; do not mention or draw attention to these transfers
+in your conversation with the user.
+"""
+
+
+DEFAULT_ASSISTANT_PROMPT = """
+You are a helpful assistant attempting to submit the best possible answer.
+You have several tools available to help with finding the answer. You will
+see the result of tool calls right after sending the message. If you need
+to perform multiple actions, you can always send more messages with additional
+tool calls. Do some reasoning before your actions, describing what tool calls
+you are going to use and how they fit into your plan.
+
+When you have completed the task and have an answer, call the {submit}()
+tool to report it.
+"""
+
+
+class AgentPrompt(NamedTuple):
+    """Prompt for agent."""
+
+    instructions: str | None = None
+    """Agent-specific contextual instructions."""
+
+    handoff_prompt: str | None = DEFAULT_HANDOFF_PROMPT
+    """Prompt used when there are additional handoff agents active."""
+
+    assistant_prompt: str | None = DEFAULT_ASSISTANT_PROMPT
+    """Prompt for assistant (covers tool use, submit tool, CoT, etc.)."""
+
+
+AgentContinue: TypeAlias = Callable[[AgentState], Awaitable[bool | str]]
+"""Function called to determine whether the agent should continue.
+
+Returns `True` to continue (with no additional messages inserted),
+return `False` to stop. Returns `str` to continue with an additional
+custom user message inserted.
+"""
+
+
+class AgentAttempts(NamedTuple):
+    """Configure a react agent to make multiple attempts.
+
+    Submissions are evaluated using the task's main scorer, with value of 1.0
+    indicating a correct answer. Scorer values are converted to float (e.g.
+    "C" becomes 1.0) using the standard value_to_float() function. Provide an
+    alternate conversion scheme as required via `score_value`.
+    """
+
+    attempts: int = 1
+    """Maximum number of attempts."""
+
+    incorrect_message: str | Callable[[AgentState, list[Score]], Awaitable[str]] = (
+        "Your submission was incorrect. Please proceed and attempt to find the correct answer."
+    )
+    """User message reply for an incorrect submission from the model. Alternatively,
+    an async function which returns a message."""
+
+    score_value: ValueToFloat = value_to_float()
+    """Function used to extract float from scores (defaults to standard value_to_float())"""
+
+
+class AgentSubmit(NamedTuple):
+    """Configure the submit tool of a react agent."""
+
+    name: str = "submit"
+    """Name for submit tool."""
+
+    description: str = "Submit an answer for evaluation."
+    """Description of submit tool."""
inspect_ai/log/_log.py
CHANGED
@@ -209,14 +209,16 @@ class EvalSample(BaseModel):
     store: dict[str, Any] = Field(default_factory=dict)
     """State at end of sample execution."""
 
-    def store_as(self, model_cls: Type[SMT]) -> SMT:
+    def store_as(self, model_cls: Type[SMT], instance: str | None = None) -> SMT:
         """Pydantic model interface to the store.
 
         Args:
            model_cls: Pydantic model type (must derive from StoreModel)
+           instance: Optional instance name for store (enables multiple instances
+              of a given StoreModel type within a single sample)
 
         Returns:
-           StoreModel:
+           StoreModel: model_cls bound to sample store data.
         """
         # un-namespace names for creation
         data = {
@@ -226,6 +228,10 @@ class EvalSample(BaseModel):
         # since we are reading from the log provide a fully detached store
         data["store"] = Store()
 
+        # provide instance if specified
+        if instance is not None:
+            data["instance"] = instance
+
         # create the model
         return model_cls.model_validate(data)
 
@@ -566,6 +572,9 @@ class EvalSpec(BaseModel):
     task_file: str | None = Field(default=None)
     """Task source file."""
 
+    task_registry_name: str | None = Field(default=None)
+    """Task registry name."""
+
     task_attribs: dict[str, Any] = Field(default_factory=dict)
     """Attributes of the @task decorator."""
 
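The new instance argument to store_as() allows several named copies of one StoreModel per sample. A sketch of reading them back from a log (the log path and model class are illustrative):

from pydantic import Field

from inspect_ai.log import read_eval_log
from inspect_ai.util import StoreModel


class TeamState(StoreModel):
    moves: int = Field(default=0)


log = read_eval_log("./logs/example.eval")  # hypothetical log file
sample = log.samples[0]

# each instance name binds to its own namespaced slice of the store
red = sample.store_as(TeamState, instance="red")
blue = sample.store_as(TeamState, instance="blue")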
inspect_ai/log/_transcript.py
CHANGED
@@ -14,13 +14,7 @@ from typing import (
     Union,
 )
 
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field,
-    JsonValue,
-    field_serializer,
-)
+from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_serializer
 from shortuuid import uuid
 
 from inspect_ai._util.constants import SAMPLE_SUBTASK
@@ -184,8 +178,8 @@ class ToolEvent(BaseEvent):
     arguments: dict[str, JsonValue]
     """Arguments to function."""
 
-
-    """
+    internal: JsonValue | None = Field(default=None)
+    """Model provider specific payload - typically used to aid transformation back to model types."""
 
     view: ToolCallContent | None = Field(default=None)
     """Custom view of tool call input."""
@@ -208,6 +202,12 @@ class ToolEvent(BaseEvent):
     working_time: float | None = Field(default=None)
     """Working time for tool call (i.e. time not spent waiting on semaphores)."""
 
+    agent: str | None = Field(default=None)
+    """Name of agent if the tool call was an agent handoff."""
+
+    failed: bool | None = Field(default=None)
+    """Did the tool call fail with a hard error?"""
+
     def _set_result(
         self,
         result: ToolResult,
@@ -215,6 +215,8 @@ class ToolEvent(BaseEvent):
         error: ToolCallError | None,
         events: list["Event"],
         waiting_time: float,
+        agent: str | None,
+        failed: bool | None,
     ) -> None:
         self.result = result
         self.truncated = truncated
@@ -224,6 +226,8 @@ class ToolEvent(BaseEvent):
         completed = datetime.now()
         self.completed = completed
         self.working_time = (completed - self.timestamp).total_seconds() - waiting_time
+        self.agent = agent
+        self.failed = failed
 
     # mechanism for operator to cancel the tool call
 
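A sketch of reading the new ToolEvent fields back out of a log (the path is illustrative, and ToolEvent is assumed to be importable from inspect_ai.log):

from inspect_ai.log import ToolEvent, read_eval_log

log = read_eval_log("./logs/example.eval")  # hypothetical log file
for event in log.samples[0].events:
    if isinstance(event, ToolEvent):
        if event.agent is not None:
            print(f"handoff to agent '{event.agent}' via {event.function}")
        if event.failed:
            print(f"tool call {event.function} failed with a hard error")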
inspect_ai/model/__init__.py
CHANGED
@@ -18,7 +18,7 @@ from ._cache import (
     cache_prune,
     cache_size,
 )
-from ._call_tools import call_tools
+from ._call_tools import ExecuteToolsResult, call_tools, execute_tools
 from ._chat_message import (
     ChatMessage,
     ChatMessageAssistant,
@@ -27,6 +27,7 @@ from ._chat_message import (
     ChatMessageTool,
     ChatMessageUser,
 )
+from ._conversation import ModelConversation
 from ._generate_config import GenerateConfig, GenerateConfigArgs, ResponseSchema
 from ._model import (
     Model,
@@ -34,6 +35,7 @@ from ._model import (
     ModelName,
     get_model,
 )
+from ._model_call import ModelCall
 from ._model_output import (
     ChatCompletionChoice,
     Logprob,
@@ -64,7 +66,9 @@ __all__ = [
     "ChatMessageAssistant",
     "ChatMessageTool",
     "ChatCompletionChoice",
+    "ModelCall",
     "ModelOutput",
+    "ModelConversation",
     "Logprobs",
     "Logprob",
     "TopLogprob",
@@ -74,6 +78,8 @@ __all__ = [
     "ModelUsage",
     "StopReason",
     "call_tools",
+    "execute_tools",
+    "ExecuteToolsResult",
     "cache_clear",
     "cache_list_expired",
     "cache_path",