inspect-ai 0.3.55__py3-none-any.whl → 0.3.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +1 -0
- inspect_ai/_cli/common.py +1 -1
- inspect_ai/_cli/trace.py +33 -20
- inspect_ai/_display/core/active.py +1 -1
- inspect_ai/_display/core/display.py +1 -1
- inspect_ai/_display/core/footer.py +1 -1
- inspect_ai/_display/core/progress.py +0 -6
- inspect_ai/_display/core/rich.py +1 -1
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/app.py +15 -17
- inspect_ai/_display/textual/widgets/clock.py +3 -3
- inspect_ai/_display/textual/widgets/samples.py +6 -13
- inspect_ai/_eval/context.py +9 -1
- inspect_ai/_eval/score.py +4 -10
- inspect_ai/_eval/task/results.py +5 -4
- inspect_ai/_eval/task/run.py +6 -12
- inspect_ai/_eval/task/task.py +10 -0
- inspect_ai/_util/ansi.py +31 -0
- inspect_ai/_util/format.py +7 -0
- inspect_ai/_util/logger.py +12 -12
- inspect_ai/_util/throttle.py +10 -1
- inspect_ai/_util/trace.py +43 -47
- inspect_ai/_util/transcript.py +4 -0
- inspect_ai/_util/vscode.py +51 -0
- inspect_ai/_view/notify.py +2 -1
- inspect_ai/_view/www/App.css +22 -1
- inspect_ai/_view/www/dist/assets/index.css +2374 -2
- inspect_ai/_view/www/dist/assets/index.js +29622 -24424
- inspect_ai/_view/www/log-schema.json +138 -90
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/App.mjs +1 -0
- inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
- inspect_ai/_view/www/src/components/Tools.mjs +11 -3
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
- inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +26 -12
- inspect_ai/_view/www/yarn.lock +44 -0
- inspect_ai/approval/_apply.py +4 -0
- inspect_ai/approval/_human/panel.py +5 -8
- inspect_ai/dataset/_dataset.py +51 -10
- inspect_ai/dataset/_util.py +31 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +5 -2
- inspect_ai/model/_call_tools.py +4 -2
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_model.py +42 -1
- inspect_ai/model/_providers/anthropic.py +4 -0
- inspect_ai/model/_render.py +9 -2
- inspect_ai/scorer/_metric.py +12 -1
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_human_agent/agent.py +83 -0
- inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
- inspect_ai/solver/_human_agent/commands/clock.py +70 -0
- inspect_ai/solver/_human_agent/commands/command.py +59 -0
- inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
- inspect_ai/solver/_human_agent/commands/note.py +42 -0
- inspect_ai/solver/_human_agent/commands/score.py +80 -0
- inspect_ai/solver/_human_agent/commands/status.py +62 -0
- inspect_ai/solver/_human_agent/commands/submit.py +151 -0
- inspect_ai/solver/_human_agent/install.py +222 -0
- inspect_ai/solver/_human_agent/panel.py +252 -0
- inspect_ai/solver/_human_agent/service.py +45 -0
- inspect_ai/solver/_human_agent/state.py +55 -0
- inspect_ai/solver/_human_agent/view.py +24 -0
- inspect_ai/solver/_task_state.py +28 -2
- inspect_ai/tool/_tool.py +10 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +13 -10
- inspect_ai/util/__init__.py +8 -4
- inspect_ai/{_util/display.py → util/_display.py} +6 -0
- inspect_ai/util/_panel.py +31 -9
- inspect_ai/util/_sandbox/__init__.py +0 -3
- inspect_ai/util/_sandbox/context.py +5 -1
- inspect_ai/util/_sandbox/docker/compose.py +16 -10
- inspect_ai/util/_sandbox/docker/docker.py +9 -6
- inspect_ai/util/_sandbox/docker/internal.py +1 -1
- inspect_ai/util/_sandbox/docker/util.py +2 -2
- inspect_ai/util/_sandbox/environment.py +6 -5
- inspect_ai/util/_sandbox/local.py +1 -1
- inspect_ai/util/_sandbox/service.py +22 -7
- inspect_ai/util/_store.py +5 -6
- inspect_ai/util/_store_model.py +110 -0
- inspect_ai/util/_throttle.py +32 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/RECORD +95 -73
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,222 @@
|
|
1
|
+
import inspect
|
2
|
+
from textwrap import dedent
|
3
|
+
|
4
|
+
from inspect_ai.util import sandbox
|
5
|
+
|
6
|
+
from .._task_state import TaskState
|
7
|
+
from .commands.command import HumanAgentCommand
|
8
|
+
|
9
|
+
# Staging directory (relative path in the sandbox); created before the install
# script runs and removed afterwards.
INSTALL_DIR = "human_agent_install"

# Directory the install script copies the generated command script into.
HUMAN_AGENT_DIR = "/opt/human_agent"

# File names of the generated artifacts.
TASK_PY = "task.py"
INSTALL_SH = "install.sh"
BASHRC = ".bashrc"

# NOTE(review): the next three names are not referenced by the code visible
# in this module -- confirm whether other modules use them.
WELCOME_FILE = "welcome.txt"
WELCOME_LOGIN_FILE = "welcome_login.txt"
INSTRUCTIONS_FILE = "instructions.txt"

# Directory the generated .bashrc uses for `script` session recordings.
RECORD_SESSION_DIR = "/var/tmp/user-sessions"
|
18
|
+
|
19
|
+
|
20
|
+
async def install_human_agent(
    state: TaskState, commands: list[HumanAgentCommand], record_session: bool
) -> None:
    """Install the human agent command tooling into the current sandbox.

    The installation directory is created with a plain ``mkdir``; when that
    command fails the function returns immediately, treating the failure as
    "already installed". Otherwise the generated ``task.py``, ``.bashrc``
    snippet and install script are staged in ``INSTALL_DIR``, the install
    script is run with ``bash``, and the staging directory is removed.

    Args:
        state: Task state (not read by this function).
        commands: Commands used to generate ``task.py`` and the shell
            completions.
        record_session: Forwarded to `human_agent_bashrc`.

    Raises:
        RuntimeError: If any staging or install command fails.
    """
    # `mkdir` without `-p` fails when the directory is already there
    created = await sandbox().exec(["mkdir", HUMAN_AGENT_DIR])
    if not created.success:
        return

    # staging area for the generated files
    await checked_exec(["mkdir", "-p", INSTALL_DIR])

    # each artifact is generated right before it is written, in this order
    artifacts = (
        (TASK_PY, lambda: human_agent_commands(commands)),
        (BASHRC, lambda: human_agent_bashrc(commands, record_session)),
        (INSTALL_SH, human_agent_install_sh),
    )
    for filename, generate in artifacts:
        await checked_write_file(
            f"{INSTALL_DIR}/{filename}", generate(), executable=True
        )

    # run the installer from the staging area, then clean up
    await checked_exec(["bash", f"./{INSTALL_SH}"], cwd=INSTALL_DIR)
    await checked_exec(["rm", "-rf", INSTALL_DIR])
|
43
|
+
|
44
|
+
|
45
|
+
def human_agent_commands(commands: list[HumanAgentCommand]) -> str:
    """Generate the Python source of the ``task`` command-line script.

    The script consists of a fixed import preamble, the source of every
    command's ``cli`` method (renamed to a module-level function carrying the
    command's name), an ``argparse`` parser with one sub-parser per command,
    and a dispatcher that calls the selected command's function.

    Args:
        commands: Candidate commands; only those whose ``contexts`` include
            ``"cli"`` are emitted.

    Returns:
        Source code of the script as a single string.
    """
    # filter out hidden commands
    commands = [command for command in commands if "cli" in command.contexts]

    # standard imports (including any dependencies that call methods carry)
    imports = dedent("""
    import argparse
    import sys
    from argparse import Namespace
    from pathlib import Path

    sys.path.append("/var/tmp/sandbox-services/human_agent")
    from human_agent import call_human_agent

    def format_time(t):
        minutes, seconds = divmod(t, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:.0f}:{minutes:02.0f}:{seconds:02.0f}"
    """)

    # command handler source code (extracted from call methods)
    command_handlers = "\n\n".join(
        dedent(
            inspect.getsource(command.cli).replace("cli(self, ", f"{command.name}(", 1)
        )
        for command in commands
    )

    # parse commands
    # Descriptions are free text, so they are emitted with repr(): a quote,
    # backslash or newline in a description must not break the generated code.
    command_parsers: list[str] = []
    for command in commands:
        command_parsers.append(
            f"{command.name}_parser = subparsers.add_parser("
            f'"{command.name}", help={command.description!r})\n'
        )
        for arg in command.cli_args:
            if arg.name.startswith("--"):
                extras = 'action="store_true", default=False'
            else:
                nargs = 1 if arg.required else '"?"'
                extras = f"nargs={nargs}"
            command_parsers.append(
                f'{command.name}_parser.add_argument("{arg.name}", {extras}, '
                f"help={arg.description!r})"
            )

    parse = (
        dedent("""
    parser = argparse.ArgumentParser(description="Human agent tools.")
    subparsers = parser.add_subparsers(dest="command")
    """)
        + "\n"
        + "\n".join(command_parsers)
    )

    # dispatch commands
    command_dispatchers: list[str] = []
    for i, command in enumerate(commands):
        conditional = "if" if i == 0 else "elif"
        command_dispatchers.append(
            f'{conditional} command == "{command.name}": {command.name}(args)'
        )
    if command_dispatchers:
        command_dispatchers.append("else: parser.print_help()")
    else:
        # with no commands a bare `else:` would be a syntax error
        command_dispatchers.append("parser.print_help()")

    dispatch = dedent("""
    args = parser.parse_args()
    command = args.command
    delattr(args, 'command')
    """) + "\n".join(command_dispatchers)

    return "\n".join([imports, command_handlers, parse, dispatch]) + "\n"
|
117
|
+
|
118
|
+
|
119
|
+
def human_agent_bashrc(commands: list[HumanAgentCommand], record_session: bool) -> str:
    """Build the shell snippet that the install script appends to ``.bashrc``.

    The snippet has four sections, joined with newlines: a guard that returns
    unless the shell is interactive and attached to a terminal, the ``task``
    alias with its completion handler, an optional session-recording section,
    and a section that prints the task instructions once.

    Args:
        commands: Commands whose names are offered as completions; only those
            whose ``contexts`` include ``"cli"`` are listed.
        record_session: When true, include the section that re-executes the
            shell under ``script``; otherwise that section is empty.

    Returns:
        The snippet text.
    """
    # only run in interactive terminals
    terminal_check = dedent("""

    ### Inspect Human Agent Setup #########################################=

    # only run if shell is interactive
    case $- in
        *i*) ;;
        *) return ;;
    esac

    # only run if attached to a terminal
    if ! tty -s; then
        return
    fi
    """)

    # shell alias and completions
    cli_names = [f"{command.name}" for command in commands if "cli" in command.contexts]
    command_names = " ".join(cli_names)
    alias_and_completion = dedent(f"""
    # shell alias for human agent commands
    alias task='python3 {HUMAN_AGENT_DIR}/{TASK_PY}'

    # completion handler
    _task_completion() {{
        local cur
        cur="${{COMP_WORDS[COMP_CWORD]}}"
        if [ "$COMP_CWORD" -eq 1 ]; then
            local commands="{command_names}"

            # Generate completion matches
            COMPREPLY=($(compgen -W "${{commands}}" -- ${{cur}}))
        fi
    }}
    complete -F _task_completion task
    """)

    # session recording (empty unless requested)
    recording = ""
    if record_session:
        recording = dedent(f"""
        # record human agent session transcript
        if [ -z "$SCRIPT_RUNNING" ]; then
            export SCRIPT_RUNNING=1
            LOGDIR={RECORD_SESSION_DIR}
            mkdir -p "$LOGDIR"
            TIMESTAMP=$(date +%Y%m%d_%H%M%S)
            INPUTFILE="$LOGDIR/$(whoami)_$TIMESTAMP.input"
            OUTPUTFILE="$LOGDIR/$(whoami)_$TIMESTAMP.output"
            TIMINGFILE="$LOGDIR/$(whoami)_$TIMESTAMP.timing"
            exec script -q -f -m advanced -I "$INPUTFILE" -O "$OUTPUTFILE" -T "$TIMINGFILE" -c "bash --login -i"
        fi
        """)

    # display task instructions
    show_instructions = dedent("""
    if [ -z "$INSTRUCTIONS_SHOWN" ]; then
        export INSTRUCTIONS_SHOWN=1
        task instructions > instructions.txt
        cat instructions.txt
    fi
    """).lstrip()

    # return .bashrc
    sections = [terminal_check, alias_and_completion, recording, show_instructions]
    return "\n".join(sections)
|
188
|
+
|
189
|
+
|
190
|
+
def human_agent_install_sh() -> str:
    """Return the text of the installation shell script.

    The script creates ``HUMAN_AGENT_DIR``, copies the generated command
    script into it, and appends the generated ``.bashrc`` snippet to the
    current user's ``~/.bashrc``. It expects to run from the directory that
    holds the staged files.

    Returns:
        The script text, starting with the ``#!`` line.
    """
    # lstrip(): a shebang is only honoured on the very first line, so the
    # newline that follows the opening quotes must not be part of the script.
    return dedent(f"""
    #!/usr/bin/env bash

    # create installation directory
    HUMAN_AGENT="{HUMAN_AGENT_DIR}"
    mkdir -p "$HUMAN_AGENT"

    # copy command script
    cp {TASK_PY} "$HUMAN_AGENT"

    # append to .bashrc
    cat {BASHRC} >> ~/{BASHRC}
    """).lstrip()
|
204
|
+
|
205
|
+
|
206
|
+
async def checked_exec(
    cmd: list[str],
    input: str | bytes | None = None,
    cwd: str | None = None,
) -> str:
    """Run a command in the sandbox and return its standard output.

    Args:
        cmd: Command and arguments.
        input: Optional data passed through to the sandbox ``exec`` call.
        cwd: Optional working directory passed through to the sandbox
            ``exec`` call.

    Returns:
        The command's stdout.

    Raises:
        RuntimeError: If the result reports failure; the message contains the
            command line and the command's stderr.
    """
    outcome = await sandbox().exec(cmd, input=input, cwd=cwd)
    if outcome.success:
        return outcome.stdout

    command_line = " ".join(cmd)
    raise RuntimeError(f"Error executing command {command_line}: {outcome.stderr}")
|
215
|
+
|
216
|
+
|
217
|
+
async def checked_write_file(
    file: str, contents: str, executable: bool = False
) -> None:
    """Write a file in the sandbox, optionally marking it executable.

    The contents are piped to ``tee``; ``chmod +x`` follows only when
    ``executable`` is true.

    Args:
        file: Destination path in the sandbox.
        contents: Text to write.
        executable: Whether to add the executable bit afterwards.

    Raises:
        RuntimeError: If either command fails.
    """
    # `--` ends option parsing so the path is never read as a flag
    await checked_exec(["tee", "--", file], input=contents)
    if not executable:
        return
    await checked_exec(["chmod", "+x", file])
|
@@ -0,0 +1,252 @@
|
|
1
|
+
from typing import cast
|
2
|
+
|
3
|
+
from textual.app import ComposeResult
|
4
|
+
from textual.containers import (
|
5
|
+
Container,
|
6
|
+
Horizontal,
|
7
|
+
VerticalScroll,
|
8
|
+
)
|
9
|
+
from textual.reactive import reactive
|
10
|
+
from textual.widgets import (
|
11
|
+
Button,
|
12
|
+
ContentSwitcher,
|
13
|
+
Label,
|
14
|
+
Link,
|
15
|
+
LoadingIndicator,
|
16
|
+
Static,
|
17
|
+
)
|
18
|
+
|
19
|
+
from inspect_ai._util.format import format_progress_time
|
20
|
+
from inspect_ai._util.vscode import (
|
21
|
+
VSCodeCommand,
|
22
|
+
can_execute_vscode_commands,
|
23
|
+
execute_vscode_commands,
|
24
|
+
)
|
25
|
+
from inspect_ai.util import InputPanel, SandboxConnection, throttle
|
26
|
+
|
27
|
+
from .state import HumanAgentState
|
28
|
+
|
29
|
+
|
30
|
+
class HumanAgentPanel(InputPanel):
    """Panel that tells a human agent how to log in to the sandbox.

    The panel starts on a loading view. Once a connection is assigned it
    switches to the sandbox view, which holds a status bar, login
    instructions, VS Code login links (shown only when VS Code commands can
    be executed) and the plain login command.
    """

    DEFAULT_TITLE = "Human Agent"

    # widget ids
    SANDBOX_VIEW_ID = "human-agent-sandbox-view"
    SANDBOX_INSTRUCTIONS_ID = "sandbox-instructions"
    VSCODE_LINKS_ID = "vscode-links"
    LOGIN_VSCODE_TERMINAL_ID = "login-vscode-terminal"
    LOGIN_VSCODE_WINDOW_ID = "login-vscode-window"
    COMMAND_INSTRUCTIONS_ID = "command-instructions"
    SANDBOX_COMMAND_ID = "sandbox-command"

    # css classes
    INSTRUCTIONS_CLASS = "instructions"
    LINK_LABEL_CLASS = "link-label"

    DEFAULT_CSS = f"""
    #{SANDBOX_VIEW_ID} {{
        scrollbar-size-vertical: 1;
    }}
    HumanAgentPanel .{INSTRUCTIONS_CLASS} {{
        color: $text-muted;
        margin-bottom: 1;
    }}
    #{SANDBOX_COMMAND_ID} {{
        color: $secondary;
    }}
    HumanAgentPanel .{LINK_LABEL_CLASS} {{
        color: $text-muted;
    }}
    HumanAgentPanel VSCodeLink {{
        margin-left: 1;
        margin-right: 2;
    }}
    HumanAgentPanel #{VSCODE_LINKS_ID} {{
        height: 1;
        margin-bottom: 1;
    }}
    """

    connection: reactive[SandboxConnection | None] = reactive(None)

    # implement HumanAgentView
    def connect(self, connection: SandboxConnection) -> None:
        """Store the sandbox connection in the reactive `connection` attribute."""
        self.connection = connection

    # NOTE(review): presumably limits updates to one per second -- confirm
    # against the `throttle` helper.
    @throttle(1)
    def update_state(self, state: HumanAgentState) -> None:
        """Copy the running flag and elapsed time from `state` to the status bar."""
        bar = self.query_one(StatusBar)
        bar.running = state.running
        bar.time = state.time

    def compose(self) -> ComposeResult:
        """Yield the loading view and the (initially hidden) sandbox view."""
        with ContentSwitcher(initial=LoadingView.ID):
            yield LoadingView()
            with VerticalScroll(id=self.SANDBOX_VIEW_ID):
                yield StatusBar()
                yield Static(
                    id=self.SANDBOX_INSTRUCTIONS_ID, classes=self.INSTRUCTIONS_CLASS
                )
                with Horizontal(id=self.VSCODE_LINKS_ID):
                    yield Label("Login:", classes=self.LINK_LABEL_CLASS)
                    yield VSCodeLink(
                        "VS Code Window",
                        id=self.LOGIN_VSCODE_WINDOW_ID,
                    )
                    yield Label("Login:", classes=self.LINK_LABEL_CLASS)
                    yield VSCodeLink(
                        "VS Code Terminal",
                        id=self.LOGIN_VSCODE_TERMINAL_ID,
                    )
                yield Static(
                    id=self.COMMAND_INSTRUCTIONS_ID, classes=self.INSTRUCTIONS_CLASS
                )
                yield Static(id=self.SANDBOX_COMMAND_ID)

    def watch_connection(self, connection: SandboxConnection | None) -> None:
        """Populate the sandbox view when a connection is present.

        Does nothing while `connection` is falsy.
        """
        if not connection:
            return

        # switch to sandbox view
        self.query_one(ContentSwitcher).current = self.SANDBOX_VIEW_ID

        # note whether we are in vscode
        in_vscode = can_execute_vscode_commands()

        # instruction text variants (the vscode one is used when in vscode)
        command_text = "Login to the system with the following command (hold down Alt or Option to select text for copying):"
        vscode_text = "Use the links below to login to the system within VS Code:"

        # update instructions
        sandbox_instructions = cast(
            Static, self.query_one(f"#{self.SANDBOX_INSTRUCTIONS_ID}")
        )
        login_text = vscode_text if in_vscode else command_text
        sandbox_instructions.update(
            f"""You are completing a task on a Linux system (task instructions will be presented when you login). {login_text}"""
        )

        # login: vscode terminal (the links row is only shown in vscode)
        links_row = self.query_one(f"#{self.VSCODE_LINKS_ID}")
        links_row.display = in_vscode
        terminal_link = cast(
            VSCodeLink, self.query_one(f"#{self.LOGIN_VSCODE_TERMINAL_ID}")
        )
        open_terminal = VSCodeCommand(
            command="workbench.action.terminal.new", args=[{"location": 2}]
        )
        send_login = VSCodeCommand(
            command="workbench.action.terminal.sendSequence",
            args=[{"text": f"{connection.command}\n"}],
        )
        terminal_link.commands = [open_terminal, send_login]

        # login: vscode window (only when the connection provides a command)
        window_link = cast(
            VSCodeLink, self.query_one(f"#{self.LOGIN_VSCODE_WINDOW_ID}")
        )
        vscode_command = connection.vscode_command
        if vscode_command is not None:
            window_link.commands = [
                VSCodeCommand(command=vscode_command[0], args=vscode_command[1:])
            ]

        # command (always available); its lead-in text is only shown in vscode
        command_instructions = cast(
            Static, self.query_one(f"#{self.COMMAND_INSTRUCTIONS_ID}")
        )
        command_instructions.display = in_vscode
        command_instructions.update(
            command_text.replace("Login", "Alternatively, login", 1)
        )
        command_label = cast(Static, self.query_one(f"#{self.SANDBOX_COMMAND_ID}"))
        command_label.update(connection.command)
|
163
|
+
|
164
|
+
|
165
|
+
class StatusBar(Horizontal):
    """One-line bar showing the task status and the elapsed time.

    The `running` and `time` reactives are mirrored into the two value
    widgets by their watch methods.
    """

    # widget ids
    STATUS_ID = "task-status"
    TIME_ID = "task-time"

    # css classes
    LABEL_CLASS = "status-label"
    VALUE_CLASS = "status-value"

    DEFAULT_CSS = f"""
    StatusBar {{
        width: 1fr;
        height: 1;
        background: $surface;
        margin-bottom: 1;
        layout: grid;
        grid-size: 4 1;
        grid-columns: auto auto auto auto;
        grid-gutter: 1;
    }}
    .{LABEL_CLASS} {{
        color: $primary;
    }}
    .{VALUE_CLASS} {{
        color: $foreground;
    }}
    StatusBar Link {{
        dock: right;
        margin-right: 1;
    }}
    """

    running: reactive[bool] = reactive(True)
    time: reactive[float] = reactive(0)

    def __init__(self) -> None:
        super().__init__()

    def compose(self) -> ComposeResult:
        """Yield the label/value pairs for status and time."""
        yield Label("Status:", classes=self.LABEL_CLASS)
        yield Static("Running", id=self.STATUS_ID, classes=self.VALUE_CLASS)
        yield Label(" Time:", classes=self.LABEL_CLASS)
        yield Static("0:00:00", id=self.TIME_ID, classes=self.VALUE_CLASS)

    def watch_running(self, running: bool) -> None:
        """Show "Running" or "Stopped" in the status widget."""
        status_value = cast(Static, self.query_one(f"#{self.STATUS_ID}"))
        status_value.update("Running" if running else "Stopped")

    def watch_time(self, time: float) -> None:
        """Show `time`, formatted by `format_progress_time`, in the time widget."""
        formatted = format_progress_time(time)
        time_value = cast(Static, self.query_one(f"#{self.TIME_ID}"))
        time_value.update(formatted)
|
215
|
+
|
216
|
+
|
217
|
+
class LoadingView(Container):
    """Placeholder container holding a loading indicator."""

    ID = "human-agent-loading-view"

    def __init__(self) -> None:
        # the fixed id is what the content switcher uses as its initial view
        super().__init__(id=self.ID)

    def compose(self) -> ComposeResult:
        """Yield the loading indicator plus a focusable button."""
        yield LoadingIndicator()
        # add focusable widget so the tab can activate
        yield Button()
|
226
|
+
|
227
|
+
|
228
|
+
class VSCodeLink(Link):
    """Link that executes a list of VS Code commands when clicked.

    `commands` starts empty and is assigned by the owner of the link.
    """

    def __init__(
        self,
        text: str,
        *,
        url: str | None = None,
        tooltip: str | None = None,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
    ) -> None:
        """Create the link; every argument is passed through to `Link`."""
        link_options = dict(
            url=url,
            tooltip=tooltip,
            name=name,
            id=id,
            classes=classes,
            disabled=disabled,
        )
        super().__init__(text, **link_options)
        self.commands: list[VSCodeCommand] = []

    def on_click(self) -> None:
        """Execute the commands currently attached to this link."""
        execute_vscode_commands(self.commands)
|
@@ -0,0 +1,45 @@
|
|
1
|
+
from inspect_ai.model import ModelOutput
|
2
|
+
from inspect_ai.util._sandbox import sandbox
|
3
|
+
from inspect_ai.util._sandbox.service import sandbox_service
|
4
|
+
|
5
|
+
from .._task_state import TaskState
|
6
|
+
from .commands.command import HumanAgentCommand
|
7
|
+
from .state import HumanAgentState
|
8
|
+
from .view import HumanAgentView
|
9
|
+
|
10
|
+
|
11
|
+
async def run_human_agent_service(
    state: TaskState, commands: list[HumanAgentCommand], view: HumanAgentView | None
) -> TaskState:
    """Run the ``human_agent`` sandbox service until an answer is recorded.

    Args:
        state: Task state; its message texts become the instructions, and its
            ``output`` is set from the answer once the service finishes.
        commands: Commands; those whose ``contexts`` include ``"service"``
            contribute a service method under their name.
        view: Optional view that is updated each time the completion
            callback runs.

    Returns:
        The same ``state`` object.
    """
    # initialise agent state (instructions are the message texts joined by
    # blank lines)
    message_texts = [message.text for message in state.messages]
    agent_state = HumanAgentState(instructions="\n\n".join(message_texts).strip())

    # extract service methods from commands
    methods = {}
    for command in commands:
        if "service" in command.contexts:
            methods[command.name] = command.service(agent_state)

    # completion callback; it also serves to refresh the view with the
    # current state each time it is called
    def task_is_completed() -> bool:
        if view:
            view.update_state(agent_state)
        return agent_state.answer is not None

    # run the service
    await sandbox_service(
        name="human_agent",
        methods=methods,
        until=task_is_completed,
        sandbox=sandbox(),
    )

    # set the answer if we have one
    answer = agent_state.answer
    if answer is not None:
        state.output = ModelOutput.from_content("human_agent", answer)

    return state
|
@@ -0,0 +1,55 @@
|
|
1
|
+
from time import time as current_time
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
from inspect_ai.scorer._metric import Score
|
6
|
+
from inspect_ai.util._store_model import StoreModel
|
7
|
+
|
8
|
+
|
9
|
+
# One intermediate scoring entry: a time value plus the scores recorded with it.
# NOTE(review): `time` is presumably the elapsed task time at scoring -- confirm
# against the code that creates these entries.
class IntermediateScoring(BaseModel):
    time: float
    scores: list[Score]
|
12
|
+
|
13
|
+
|
14
|
+
# State of a human agent session: the task instructions, a start/stop clock
# (exposed through the `running` and `time` properties), intermediate
# scorings, the final answer and session logs.
class HumanAgentState(StoreModel):
    instructions: str
    """Task instructions."""

    @property
    def running(self) -> bool:
        """Whether the task clock is currently running."""
        return self.running_state

    @running.setter
    def running(self, running: bool) -> None:
        """Start or stop the task clock."""
        was_running = self.running_state

        if running and not was_running:
            # starting: remember when this running stretch began
            self.started_running = current_time()
        elif was_running and not running:
            # stopping: fold the current stretch into the accumulated total
            # (read `time` while the state still says running)
            self.accumulated_time = self.time

        # update running
        self.running_state = running

    @property
    def time(self) -> float:
        """Total time spent on the task."""
        elapsed = 0
        if self.running:
            elapsed = current_time() - self.started_running
        return self.accumulated_time + elapsed

    scorings: list[IntermediateScoring] = Field(default_factory=list)
    """Intermediate scorings yielded by `task score`"""

    answer: str | None = Field(default=None)
    """Final answer provided in `task submit`"""

    logs: dict[str, str] = Field(default_factory=dict)
    """Session logs generated by `script` """

    # internal state variables used by running and time properties
    running_state: bool = Field(default=True)
    started_running: float = Field(default_factory=current_time)
    accumulated_time: float = Field(default=0.0)
|
@@ -0,0 +1,24 @@
|
|
1
|
+
from typing import Protocol
|
2
|
+
|
3
|
+
from inspect_ai.util import SandboxConnection
|
4
|
+
|
5
|
+
from .state import HumanAgentState
|
6
|
+
|
7
|
+
|
8
|
+
class HumanAgentView(Protocol):
    """Interface of a view that presents a human agent session."""

    def connect(self, connection: SandboxConnection) -> None:
        """Receive the sandbox connection details."""
        ...

    def update_state(self, state: HumanAgentState) -> None:
        """Receive the current human agent state."""
        ...
|
11
|
+
|
12
|
+
|
13
|
+
class ConsoleView(HumanAgentView):
    """Fallback view for when we aren't running fullscreen UI."""

    def connect(self, connection: SandboxConnection) -> None:
        """Print the login instructions followed by the login command."""
        intro = (
            "You are completing a task on a Linux system (task instructions will be presented "
            "when you login). Login to the system with the following command:\n"
        )
        print(intro)
        print(f"{connection.command}\n")

    def update_state(self, state: HumanAgentState) -> None:
        """Ignore state updates; the console view shows no live status."""
        return None
|
inspect_ai/solver/_task_state.py
CHANGED
@@ -3,11 +3,11 @@ from contextvars import ContextVar
|
|
3
3
|
from copy import deepcopy
|
4
4
|
from dataclasses import dataclass
|
5
5
|
from random import Random
|
6
|
-
from typing import Any, Union, cast, overload
|
6
|
+
from typing import Any, Type, Union, cast, overload
|
7
7
|
|
8
8
|
from pydantic_core import to_jsonable_python
|
9
9
|
|
10
|
-
from inspect_ai.dataset._dataset import Sample
|
10
|
+
from inspect_ai.dataset._dataset import MT, Sample, metadata_as
|
11
11
|
from inspect_ai.model import (
|
12
12
|
ChatMessage,
|
13
13
|
ChatMessageUser,
|
@@ -19,6 +19,7 @@ from inspect_ai.model._model import sample_total_tokens
|
|
19
19
|
from inspect_ai.tool import Tool, ToolChoice
|
20
20
|
from inspect_ai.tool._tool_def import ToolDef
|
21
21
|
from inspect_ai.util._store import Store, store_jsonable
|
22
|
+
from inspect_ai.util._store_model import SMT
|
22
23
|
|
23
24
|
|
24
25
|
@dataclass
|
@@ -349,6 +350,31 @@ class TaskState:
|
|
349
350
|
for tool in tools:
|
350
351
|
self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool())
|
351
352
|
|
353
|
+
def metadata_as(self, metadata_cls: Type[MT]) -> MT:
    """Pydantic model interface to metadata.

    Args:
      metadata_cls: Pydantic model type

    Returns:
      Result of the module-level `metadata_as` helper applied to this
      state's metadata and `metadata_cls`.

    Raises:
      ValueError: If the metadata is missing or empty.
    """
    if self.metadata:
        return metadata_as(self.metadata, metadata_cls)

    raise ValueError("Sample does not have metadata")
|
366
|
+
|
367
|
+
def store_as(self, model_cls: Type[SMT]) -> SMT:
    """Pydantic model interface to the store.

    Args:
      model_cls: Pydantic model type (must derive from StoreModel)

    Returns:
      StoreModel: Instance of model_cls constructed with this state's store.
    """
    store_view = model_cls(store=self.store)
    return store_view
|
377
|
+
|
352
378
|
|
353
379
|
def sample_state() -> TaskState | None:
    """Return the value held by `_sample_state`, or None when it has none.

    NOTE(review): `_sample_state` is defined outside this view; presumably a
    ContextVar holding the active sample's TaskState -- confirm.
    """
    current = _sample_state.get(None)
    return current
|
inspect_ai/tool/_tool.py
CHANGED
@@ -11,6 +11,7 @@ from typing import (
|
|
11
11
|
runtime_checkable,
|
12
12
|
)
|
13
13
|
|
14
|
+
from inspect_ai._util.content import ContentImage, ContentText
|
14
15
|
from inspect_ai._util.registry import (
|
15
16
|
RegistryInfo,
|
16
17
|
registry_add,
|
@@ -18,13 +19,20 @@ from inspect_ai._util.registry import (
|
|
18
19
|
registry_tag,
|
19
20
|
)
|
20
21
|
|
21
|
-
from . import Content
|
22
22
|
from ._tool_call import ToolCallViewer
|
23
23
|
|
24
24
|
logger = getLogger(__name__)
|
25
25
|
|
26
26
|
|
27
|
-
ToolResult =
|
27
|
+
# Type alias: a scalar (str, int, float, bool), a single text or image
# content item, or a list of text/image content items.
ToolResult = (
    str | int | float | bool
    | ContentText | ContentImage
    | list[ContentText | ContentImage]
)
|
28
36
|
|
29
37
|
|
30
38
|
class ToolError(Exception):
|