inspect-ai 0.3.54__py3-none-any.whl → 0.3.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +1 -0
- inspect_ai/_cli/common.py +1 -1
- inspect_ai/_cli/trace.py +33 -20
- inspect_ai/_display/core/active.py +1 -1
- inspect_ai/_display/core/display.py +1 -1
- inspect_ai/_display/core/footer.py +1 -1
- inspect_ai/_display/core/progress.py +0 -6
- inspect_ai/_display/core/rich.py +1 -1
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/app.py +15 -17
- inspect_ai/_display/textual/widgets/clock.py +3 -3
- inspect_ai/_display/textual/widgets/samples.py +6 -13
- inspect_ai/_eval/context.py +9 -1
- inspect_ai/_eval/score.py +4 -10
- inspect_ai/_eval/task/log.py +2 -1
- inspect_ai/_eval/task/results.py +5 -4
- inspect_ai/_eval/task/run.py +6 -12
- inspect_ai/_eval/task/task.py +10 -0
- inspect_ai/_util/ansi.py +31 -0
- inspect_ai/_util/format.py +7 -0
- inspect_ai/_util/logger.py +12 -12
- inspect_ai/_util/throttle.py +10 -1
- inspect_ai/_util/trace.py +43 -47
- inspect_ai/_util/transcript.py +4 -0
- inspect_ai/_util/vscode.py +51 -0
- inspect_ai/_view/notify.py +2 -1
- inspect_ai/_view/www/App.css +22 -1
- inspect_ai/_view/www/dist/assets/index.css +2374 -2
- inspect_ai/_view/www/dist/assets/index.js +29622 -24424
- inspect_ai/_view/www/log-schema.json +138 -90
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/App.mjs +1 -0
- inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
- inspect_ai/_view/www/src/components/Tools.mjs +11 -3
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
- inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +26 -12
- inspect_ai/_view/www/yarn.lock +44 -0
- inspect_ai/approval/_apply.py +4 -0
- inspect_ai/approval/_human/panel.py +5 -8
- inspect_ai/dataset/_dataset.py +51 -10
- inspect_ai/dataset/_util.py +31 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +5 -2
- inspect_ai/model/_cache.py +1 -1
- inspect_ai/model/_call_tools.py +4 -2
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_model.py +42 -1
- inspect_ai/model/_providers/anthropic.py +4 -0
- inspect_ai/model/_providers/openai.py +11 -1
- inspect_ai/model/_render.py +9 -2
- inspect_ai/scorer/_metric.py +12 -1
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_human_agent/agent.py +83 -0
- inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
- inspect_ai/solver/_human_agent/commands/clock.py +70 -0
- inspect_ai/solver/_human_agent/commands/command.py +59 -0
- inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
- inspect_ai/solver/_human_agent/commands/note.py +42 -0
- inspect_ai/solver/_human_agent/commands/score.py +80 -0
- inspect_ai/solver/_human_agent/commands/status.py +62 -0
- inspect_ai/solver/_human_agent/commands/submit.py +151 -0
- inspect_ai/solver/_human_agent/install.py +222 -0
- inspect_ai/solver/_human_agent/panel.py +252 -0
- inspect_ai/solver/_human_agent/service.py +45 -0
- inspect_ai/solver/_human_agent/state.py +55 -0
- inspect_ai/solver/_human_agent/view.py +24 -0
- inspect_ai/solver/_task_state.py +28 -2
- inspect_ai/tool/_tool.py +10 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +13 -10
- inspect_ai/util/__init__.py +8 -4
- inspect_ai/{_util/display.py → util/_display.py} +6 -0
- inspect_ai/util/_panel.py +31 -9
- inspect_ai/util/_sandbox/__init__.py +0 -3
- inspect_ai/util/_sandbox/context.py +5 -1
- inspect_ai/util/_sandbox/docker/compose.py +16 -10
- inspect_ai/util/_sandbox/docker/docker.py +9 -6
- inspect_ai/util/_sandbox/docker/internal.py +1 -1
- inspect_ai/util/_sandbox/docker/util.py +2 -2
- inspect_ai/util/_sandbox/environment.py +6 -5
- inspect_ai/util/_sandbox/local.py +1 -1
- inspect_ai/util/_sandbox/service.py +22 -7
- inspect_ai/util/_store.py +5 -6
- inspect_ai/util/_store_model.py +110 -0
- inspect_ai/util/_throttle.py +32 -0
- {inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/RECORD +98 -76
- {inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,151 @@
|
|
1
|
+
from argparse import Namespace
|
2
|
+
from logging import getLogger
|
3
|
+
from pathlib import PurePosixPath
|
4
|
+
from re import Pattern, compile, match
|
5
|
+
from typing import Awaitable, Callable, Literal
|
6
|
+
|
7
|
+
from pydantic import JsonValue
|
8
|
+
|
9
|
+
from inspect_ai._util.ansi import render_text
|
10
|
+
from inspect_ai.util._sandbox import sandbox
|
11
|
+
|
12
|
+
from ..install import RECORD_SESSION_DIR
|
13
|
+
from ..state import HumanAgentState
|
14
|
+
from .command import HumanAgentCommand, call_human_agent
|
15
|
+
|
16
|
+
logger = getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
class SubmitCommand(HumanAgentCommand):
    """Human agent ``submit`` command: confirm, then end the task with an answer.

    The command has a CLI half (``cli``), which runs inside the sandbox, and a
    service half (``service``), which records the submission on the shared
    ``HumanAgentState``.
    """

    def __init__(self, record_session: bool):
        """Create the command.

        Args:
            record_session: When true, session logs are read from
                ``RECORD_SESSION_DIR`` and stored on the state at submit time.
        """
        super().__init__()
        self._record_session = record_session

    @property
    def name(self) -> str:
        """Command name (also used as the service method name)."""
        return "submit"

    @property
    def description(self) -> str:
        """One-line help text for the command."""
        return "Submit your final answer for the task."

    @property
    def group(self) -> Literal[1, 2, 3]:
        """Command group this command belongs to."""
        return 1

    @property
    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
        """CLI arguments: a single positional ``answer``."""
        return [
            HumanAgentCommand.CLIArg(
                name="answer",
                description="Answer to submit for scoring (optional, not required for all tasks)",
            )
        ]

    def cli(self, args: Namespace) -> None:
        # NOTE: the source text of this method is extracted with
        # inspect.getsource() and pasted into the generated task.py script,
        # so it may only use builtins, `Namespace` and `call_human_agent`.

        # read cli args
        call_args = vars(args)

        # first validate (print and exit if we get a str back)
        error = call_human_agent("validate", **call_args)
        if error:
            print(error)
            return

        # verify that the user wants to proceed
        answer = call_args.get("answer", None)
        answer_text = f" '{answer}'" if answer else ""
        while True:
            response = (
                input(
                    f"\nDo you definitely want to end the task and submit{answer_text}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
                )
                .lower()
                .strip()
            )
            if response in ["yes", "y"]:
                break
            elif response in ["no", "n"]:
                return
            else:
                print("Please enter yes or no.")

        # thank the user!
        print(
            "\nThank you for working on this task!\n\n"
            + "Your task will now be scored and you will be disconnected from this container.\n"
        )

        # submit the task
        call_human_agent("submit", **call_args)

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async ``submit`` handler bound to ``state``."""

        async def submit(
            answer: str | None, session_logs: dict[str, str] | None = None
        ) -> None:
            # `session_logs` is accepted for interface compatibility but is
            # not used; logs are read directly from the sandbox instead.
            if self._record_session:
                state.logs = await self._read_session_logs()
            state.running = False
            # The service loop treats `state.answer is not None` as "task
            # completed". The answer is optional, so a submission without one
            # must still store a non-None value or the task would never end.
            state.answer = answer if answer is not None else ""

        return submit

    async def _read_session_logs(self) -> dict[str, str]:
        """Read recorded session logs from the sandbox, never raising.

        Returns:
            Mapping of log file name to file contents. Files that cannot be
            read are skipped; an empty dict is returned when the directory
            cannot be listed.
        """
        # retrieve session logs (don't fail)
        sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
        try:
            result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
        except Exception as ex:
            # best effort: a failure to list logs must not block submission
            logger.warning(f"Error listing human agent session logs: {ex}")
            return {}
        if not result.success:
            logger.warning(f"Error listing human agent session logs: {result.stderr}")
            return {}

        # read logs
        session_logs: dict[str, str] = {}
        for session_log in result.stdout.strip().splitlines():
            try:
                session_logs[session_log] = await sandbox().read_file(
                    (sessions_dir / session_log).as_posix()
                )
            except Exception as ex:
                logger.warning(f"Error reading human agent session log: {ex}")

        return session_logs
|
112
|
+
|
113
|
+
|
114
|
+
class ValidateCommand(HumanAgentCommand):
    """Service-only command that checks a submission before it is accepted."""

    def __init__(self, answer: bool | str) -> None:
        """Create the command.

        Args:
            answer: A bool is stored as-is; a string is compiled as a regular
                expression that submitted answers are matched against.
        """
        if isinstance(answer, str):
            self._answer = compile(answer)
        else:
            self._answer = answer

    @property
    def name(self) -> str:
        """Command name (also used as the service method name)."""
        return "validate"

    @property
    def description(self) -> str:
        """One-line help text for the command."""
        return "Validate a task submission."

    @property
    def contexts(self) -> list[Literal["cli", "service"]]:
        """Contexts the command is available in: service only."""
        return ["service"]

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async ``validate`` handler bound to ``state``.

        The handler returns ``None`` when the submission is acceptable and a
        rendered failure message otherwise.
        """

        async def validate(answer: str | None) -> str | None:
            def failed(reason: str) -> str:
                return render_text(f"[bold]FAILED:[/bold] {reason}")

            # submissions are only accepted while the task is running
            if not state.running:
                return failed("Task is stopped (use 'task start' to start)")

            # no answer requirement configured: nothing more to check
            requirement = self._answer
            if not requirement:
                return None

            # an answer is required: it must be non-empty after trimming
            candidate = answer.strip() if isinstance(answer, str) else answer
            if not candidate:
                return failed("An explicit answer is required for scoring this task.")

            # when a pattern was configured the answer must match it
            if isinstance(requirement, Pattern) and match(requirement, candidate) is None:
                return failed(
                    "Your answer was not in the required format (please review the task instructions)"
                )

            # made it through verification
            return None

        return validate
|
@@ -0,0 +1,222 @@
|
|
1
|
+
import inspect
|
2
|
+
from textwrap import dedent
|
3
|
+
|
4
|
+
from inspect_ai.util import sandbox
|
5
|
+
|
6
|
+
from .._task_state import TaskState
|
7
|
+
from .commands.command import HumanAgentCommand
|
8
|
+
|
9
|
+
# staging directory for generated install files (created before install, removed afterwards)
INSTALL_DIR = "human_agent_install"
# sandbox directory that receives task.py; its creation doubles as the "already installed" check
HUMAN_AGENT_DIR = "/opt/human_agent"
# file name of the generated command script (target of the `task` shell alias)
TASK_PY = "task.py"
# file name of the generated installation script
INSTALL_SH = "install.sh"
# file name of the generated bashrc fragment (appended to ~/.bashrc by the install script)
BASHRC = ".bashrc"
# NOTE(review): the next three names are not referenced by the code visible in
# this module -- presumably consumed by other human agent modules; confirm.
WELCOME_FILE = "welcome.txt"
WELCOME_LOGIN_FILE = "welcome_login.txt"
INSTRUCTIONS_FILE = "instructions.txt"
# sandbox directory where recorded terminal session files are written
RECORD_SESSION_DIR = "/var/tmp/user-sessions"
|
18
|
+
|
19
|
+
|
20
|
+
async def install_human_agent(
    state: TaskState, commands: list[HumanAgentCommand], record_session: bool
) -> None:
    """Install the human agent tooling into the current sandbox.

    Creating ``HUMAN_AGENT_DIR`` acts as the "already installed" probe: when
    the ``mkdir`` fails the function returns without doing anything else.

    Args:
        state: Task state (not used by this function).
        commands: Commands used to generate ``task.py`` and ``.bashrc``.
        record_session: Whether the generated ``.bashrc`` records sessions.

    Raises:
        RuntimeError: If any installation step after the probe fails.
    """
    # a failed mkdir is taken to mean a previous install already ran
    probe = await sandbox().exec(["mkdir", HUMAN_AGENT_DIR])
    if not probe.success:
        return

    # setup installation directory
    await checked_exec(["mkdir", "-p", INSTALL_DIR])

    # each artifact is rendered right before it is written: task.py, .bashrc,
    # then the installation script
    artifacts = (
        (TASK_PY, lambda: human_agent_commands(commands)),
        (BASHRC, lambda: human_agent_bashrc(commands, record_session)),
        (INSTALL_SH, human_agent_install_sh),
    )
    for filename, render in artifacts:
        await checked_write_file(
            f"{INSTALL_DIR}/{filename}", render(), executable=True
        )

    # run the installation script, then discard the staging directory
    await checked_exec(["bash", f"./{INSTALL_SH}"], cwd=INSTALL_DIR)
    await checked_exec(["rm", "-rf", INSTALL_DIR])
|
43
|
+
|
44
|
+
|
45
|
+
def human_agent_commands(commands: list[HumanAgentCommand]) -> str:
    """Generate the source of the ``task.py`` command-line script.

    The script is assembled from four parts: a fixed import preamble, one
    handler function per command (the source of each command's ``cli``
    method, renamed to the command name), an argparse parser definition and
    a dispatch chain.

    Args:
        commands: All commands; only those with ``"cli"`` in their
            ``contexts`` are included.

    Returns:
        Python source code for the script.
    """
    # filter out hidden commands
    commands = [command for command in commands if "cli" in command.contexts]

    # standard imports (including any dependencies that call methods carry)
    imports = dedent("""
    import argparse
    import sys
    from argparse import Namespace
    from pathlib import Path

    sys.path.append("/var/tmp/sandbox-services/human_agent")
    from human_agent import call_human_agent

    def format_time(t):
        minutes, seconds = divmod(t, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:.0f}:{minutes:02.0f}:{seconds:02.0f}"
    """)

    # command handler source code (extracted from call methods); the textual
    # replace turns the method signature into a plain function named after
    # the command
    command_handlers = "\n\n".join(
        dedent(
            inspect.getsource(command.cli).replace("cli(self, ", f"{command.name}(", 1)
        )
        for command in commands
    )

    # parse commands. Help texts are embedded with repr() so that quotes,
    # backslashes or newlines in a description cannot break the generated
    # source.
    command_parsers: list[str] = []
    for command in commands:
        command_parsers.append(
            f"{command.name}_parser = subparsers.add_parser("
            f'"{command.name}", help={command.description!r})\n'
        )
        for arg in command.cli_args:
            if arg.name.startswith("--"):
                # flags are boolean switches
                extras = 'action="store_true", default=False'
            else:
                # positionals are required (nargs=1) or optional (nargs="?")
                extras = f"""nargs={1 if arg.required else '"?"'}"""
            command_parsers.append(
                f'{command.name}_parser.add_argument("{arg.name}", {extras}, help={arg.description!r})'
            )

    parse = (
        dedent("""
    parser = argparse.ArgumentParser(description="Human agent tools.")
    subparsers = parser.add_subparsers(dest="command")
    """)
        + "\n"
        + "\n".join(command_parsers)
    )

    # dispatch commands
    command_dispatchers: list[str] = []
    for i, command in enumerate(commands):
        conditional = "if" if i == 0 else "elif"
        command_dispatchers.append(
            f'{conditional} command == "{command.name}": {command.name}(args)'
        )
    if command_dispatchers:
        command_dispatchers.append("else: parser.print_help()")
    else:
        # with no commands there is no `if` for an `else` to attach to
        command_dispatchers.append("parser.print_help()")

    dispatch = dedent("""
    args = parser.parse_args()
    command = args.command
    delattr(args, 'command')
    """) + "\n".join(command_dispatchers)

    return "\n".join([imports, command_handlers, parse, dispatch]) + "\n"
|
117
|
+
|
118
|
+
|
119
|
+
def human_agent_bashrc(commands: list[HumanAgentCommand], record_session: bool) -> str:
    """Generate the bashrc fragment that sets up a human agent shell.

    The fragment has four sections, joined in this order: an interactive
    terminal guard, the ``task`` alias with tab completion, optional session
    recording, and a one-time display of the task instructions.

    Args:
        commands: Commands whose names are offered as completions (only those
            with ``"cli"`` in their ``contexts``).
        record_session: When true, include the session recording section.

    Returns:
        Shell source text for the fragment.
    """
    # only run in interactive terminals
    terminal_check = dedent("""

    ### Inspect Human Agent Setup #########################################=

    # only run if shell is interactive
    case $- in
        *i*) ;;
        *) return ;;
    esac

    # only run if attached to a terminal
    if ! tty -s; then
        return
    fi
    """)

    # shell alias and completions
    cli_names = (command.name for command in commands if "cli" in command.contexts)
    command_names = " ".join(cli_names)
    alias_and_completion = dedent(f"""
    # shell alias for human agent commands
    alias task='python3 {HUMAN_AGENT_DIR}/{TASK_PY}'

    # completion handler
    _task_completion() {{
        local cur
        cur="${{COMP_WORDS[COMP_CWORD]}}"
        if [ "$COMP_CWORD" -eq 1 ]; then
            local commands="{command_names}"

            # Generate completion matches
            COMPREPLY=($(compgen -W "${{commands}}" -- ${{cur}}))
        fi
    }}
    complete -F _task_completion task
    """)

    # session recording (empty section unless requested)
    recording = ""
    if record_session:
        recording = dedent(f"""
        # record human agent session transcript
        if [ -z "$SCRIPT_RUNNING" ]; then
            export SCRIPT_RUNNING=1
            LOGDIR={RECORD_SESSION_DIR}
            mkdir -p "$LOGDIR"
            TIMESTAMP=$(date +%Y%m%d_%H%M%S)
            INPUTFILE="$LOGDIR/$(whoami)_$TIMESTAMP.input"
            OUTPUTFILE="$LOGDIR/$(whoami)_$TIMESTAMP.output"
            TIMINGFILE="$LOGDIR/$(whoami)_$TIMESTAMP.timing"
            exec script -q -f -m advanced -I "$INPUTFILE" -O "$OUTPUTFILE" -T "$TIMINGFILE" -c "bash --login -i"
        fi
        """)

    # display task instructions
    show_instructions = dedent("""
    if [ -z "$INSTRUCTIONS_SHOWN" ]; then
        export INSTRUCTIONS_SHOWN=1
        task instructions > instructions.txt
        cat instructions.txt
    fi
    """).lstrip()

    # return .bashrc
    sections = [terminal_check, alias_and_completion, recording, show_instructions]
    return "\n".join(sections)
|
188
|
+
|
189
|
+
|
190
|
+
def human_agent_install_sh() -> str:
    """Generate the shell script that installs the staged human agent files.

    The script creates ``HUMAN_AGENT_DIR``, copies ``task.py`` into it and
    appends the staged bashrc fragment to ``~/.bashrc``. It expects to run
    with the staging directory as its working directory.

    Returns:
        Shell source text, starting with the shebang line.
    """
    script = dedent(f"""
    #!/usr/bin/env bash

    # create installation directory
    HUMAN_AGENT="{HUMAN_AGENT_DIR}"
    mkdir -p "$HUMAN_AGENT"

    # copy command script
    cp {TASK_PY} "$HUMAN_AGENT"

    # append to .bashrc
    cat {BASHRC} >> ~/{BASHRC}
    """)
    # a shebang only takes effect on the very first line, so drop the blank
    # line that the triple-quoted literal starts with
    return script.lstrip("\n")
|
204
|
+
|
205
|
+
|
206
|
+
async def checked_exec(
    cmd: list[str],
    input: str | bytes | None = None,
    cwd: str | None = None,
) -> str:
    """Run a command in the sandbox and return its stdout, raising on failure.

    Args:
        cmd: Command and arguments.
        input: Optional data passed to the command as its input.
        cwd: Optional working directory for the command.

    Returns:
        The command's standard output.

    Raises:
        RuntimeError: If the command result reports failure; the message
            includes the command line and its stderr.
    """
    outcome = await sandbox().exec(cmd, input=input, cwd=cwd)
    if outcome.success:
        return outcome.stdout

    command_line = " ".join(cmd)
    raise RuntimeError(f"Error executing command {command_line}: {outcome.stderr}")
|
215
|
+
|
216
|
+
|
217
|
+
async def checked_write_file(
    file: str, contents: str, executable: bool = False
) -> None:
    """Write ``contents`` to ``file`` in the sandbox, raising on failure.

    Args:
        file: Destination path inside the sandbox.
        contents: Text to write.
        executable: When true, also mark the file executable.

    Raises:
        RuntimeError: If writing the file or changing its mode fails.
    """
    # the contents are piped to `tee`; `--` ends option parsing before the path
    await checked_exec(["tee", "--", file], input=contents)
    if not executable:
        return
    await checked_exec(["chmod", "+x", file])
|
@@ -0,0 +1,252 @@
|
|
1
|
+
from typing import cast
|
2
|
+
|
3
|
+
from textual.app import ComposeResult
|
4
|
+
from textual.containers import (
|
5
|
+
Container,
|
6
|
+
Horizontal,
|
7
|
+
VerticalScroll,
|
8
|
+
)
|
9
|
+
from textual.reactive import reactive
|
10
|
+
from textual.widgets import (
|
11
|
+
Button,
|
12
|
+
ContentSwitcher,
|
13
|
+
Label,
|
14
|
+
Link,
|
15
|
+
LoadingIndicator,
|
16
|
+
Static,
|
17
|
+
)
|
18
|
+
|
19
|
+
from inspect_ai._util.format import format_progress_time
|
20
|
+
from inspect_ai._util.vscode import (
|
21
|
+
VSCodeCommand,
|
22
|
+
can_execute_vscode_commands,
|
23
|
+
execute_vscode_commands,
|
24
|
+
)
|
25
|
+
from inspect_ai.util import InputPanel, SandboxConnection, throttle
|
26
|
+
|
27
|
+
from .state import HumanAgentState
|
28
|
+
|
29
|
+
|
30
|
+
class HumanAgentPanel(InputPanel):
    """Input panel that tells a human agent how to log in to the sandbox.

    A loading view is shown until a sandbox connection is provided. After
    that the panel shows a status bar, login instructions, VS Code login
    links (when VS Code commands can be executed) and the raw login command.
    """

    DEFAULT_TITLE = "Human Agent"

    # widget ids
    SANDBOX_VIEW_ID = "human-agent-sandbox-view"
    SANDBOX_INSTRUCTIONS_ID = "sandbox-instructions"
    VSCODE_LINKS_ID = "vscode-links"
    LOGIN_VSCODE_TERMINAL_ID = "login-vscode-terminal"
    LOGIN_VSCODE_WINDOW_ID = "login-vscode-window"
    LOGIN_VSCODE_WINDOW_LABEL_ID = "login-vscode-window-label"
    COMMAND_INSTRUCTIONS_ID = "command-instructions"
    SANDBOX_COMMAND_ID = "sandbox-command"

    # css classes
    INSTRUCTIONS_CLASS = "instructions"
    LINK_LABEL_CLASS = "link-label"

    DEFAULT_CSS = f"""
    #{SANDBOX_VIEW_ID} {{
        scrollbar-size-vertical: 1;
    }}
    HumanAgentPanel .{INSTRUCTIONS_CLASS} {{
        color: $text-muted;
        margin-bottom: 1;
    }}
    #{SANDBOX_COMMAND_ID} {{
        color: $secondary;
    }}
    HumanAgentPanel .{LINK_LABEL_CLASS} {{
        color: $text-muted;
    }}
    HumanAgentPanel VSCodeLink {{
        margin-left: 1;
        margin-right: 2;
    }}
    HumanAgentPanel #{VSCODE_LINKS_ID} {{
        height: 1;
        margin-bottom: 1;
    }}
    """

    # current sandbox connection (None until connect() is called)
    connection: reactive[SandboxConnection | None] = reactive(None)

    # implement HumanAgentView
    def connect(self, connection: SandboxConnection) -> None:
        """Record the sandbox connection (handled by ``watch_connection``)."""
        self.connection = connection

    # presumably limits updates to about one per second -- confirm against
    # the `throttle` helper in inspect_ai.util
    @throttle(1)
    def update_state(self, state: HumanAgentState) -> None:
        """Copy the running flag and elapsed time onto the status bar."""
        status_bar = self.query_one(StatusBar)
        status_bar.running = state.running
        status_bar.time = state.time

    def compose(self) -> ComposeResult:
        """Build the loading view and the (initially hidden) sandbox view."""
        with ContentSwitcher(initial=LoadingView.ID):
            yield LoadingView()
            with VerticalScroll(id=self.SANDBOX_VIEW_ID):
                yield StatusBar()
                yield Static(
                    id=self.SANDBOX_INSTRUCTIONS_ID, classes=self.INSTRUCTIONS_CLASS
                )
                with Horizontal(id=self.VSCODE_LINKS_ID):
                    yield Label(
                        "Login:",
                        id=self.LOGIN_VSCODE_WINDOW_LABEL_ID,
                        classes=self.LINK_LABEL_CLASS,
                    )
                    yield VSCodeLink(
                        "VS Code Window",
                        id=self.LOGIN_VSCODE_WINDOW_ID,
                    )
                    yield Label("Login:", classes=self.LINK_LABEL_CLASS)
                    yield VSCodeLink(
                        "VS Code Terminal",
                        id=self.LOGIN_VSCODE_TERMINAL_ID,
                    )
                yield Static(
                    id=self.COMMAND_INSTRUCTIONS_ID, classes=self.INSTRUCTIONS_CLASS
                )
                yield Static(id=self.SANDBOX_COMMAND_ID)

    def watch_connection(self, connection: SandboxConnection | None) -> None:
        """Populate the sandbox view when a connection becomes available."""
        if connection:
            # switch to sandbox view
            self.query_one(ContentSwitcher).current = self.SANDBOX_VIEW_ID

            # note whether we are in vscode
            vscode = can_execute_vscode_commands()

            # suffix for instructions based on whether we are in vscode
            instructions_command = "Login to the system with the following command (hold down Alt or Option to select text for copying):"
            instructions_vscode = (
                "Use the links below to login to the system within VS Code:"
            )

            # update instructions
            instructions_lbl = cast(
                Static, self.query_one(f"#{self.SANDBOX_INSTRUCTIONS_ID}")
            )
            instructions = f"""You are completing a task on a Linux system (task instructions will be presented when you login). {instructions_vscode if vscode else instructions_command}"""
            instructions_lbl.update(instructions)

            # login: vscode terminal
            vscode_links = self.query_one(f"#{self.VSCODE_LINKS_ID}")
            vscode_links.display = vscode
            terminal_btn = cast(
                VSCodeLink, self.query_one(f"#{self.LOGIN_VSCODE_TERMINAL_ID}")
            )
            terminal_btn.commands = [
                VSCodeCommand(
                    command="workbench.action.terminal.new", args=[{"location": 2}]
                ),
                VSCodeCommand(
                    command="workbench.action.terminal.sendSequence",
                    args=[{"text": f"{connection.command}\n"}],
                ),
            ]

            # login: vscode window. The link and its label are only shown
            # when the connection provides a command for it; otherwise the
            # link would be clickable but have nothing to execute. Commands
            # are reset so a previous connection's command is never reused.
            window_lbl = self.query_one(f"#{self.LOGIN_VSCODE_WINDOW_LABEL_ID}")
            window_btn = cast(
                VSCodeLink, self.query_one(f"#{self.LOGIN_VSCODE_WINDOW_ID}")
            )
            if connection.vscode_command is not None:
                window_btn.commands = [
                    VSCodeCommand(
                        command=connection.vscode_command[0],
                        args=connection.vscode_command[1:],
                    )
                ]
                window_lbl.display = True
                window_btn.display = True
            else:
                window_btn.commands = []
                window_lbl.display = False
                window_btn.display = False

            # command (always available); the "Alternatively" lead-in is only
            # displayed when the vscode links are shown above it
            command_instructions_lbl = cast(
                Static, self.query_one(f"#{self.COMMAND_INSTRUCTIONS_ID}")
            )
            command_instructions_lbl.display = vscode
            command_instructions_lbl.update(
                instructions_command.replace("Login", "Alternatively, login", 1)
            )
            command_lbl = cast(Static, self.query_one(f"#{self.SANDBOX_COMMAND_ID}"))
            command_lbl.update(connection.command)
|
163
|
+
|
164
|
+
|
165
|
+
class StatusBar(Horizontal):
    """Single-row bar showing whether the task is running and its time."""

    # widget ids
    STATUS_ID = "task-status"
    TIME_ID = "task-time"

    # css classes
    LABEL_CLASS = "status-label"
    VALUE_CLASS = "status-value"

    DEFAULT_CSS = f"""
    StatusBar {{
        width: 1fr;
        height: 1;
        background: $surface;
        margin-bottom: 1;
        layout: grid;
        grid-size: 4 1;
        grid-columns: auto auto auto auto;
        grid-gutter: 1;
    }}
    .{LABEL_CLASS} {{
        color: $primary;
    }}
    .{VALUE_CLASS} {{
        color: $foreground;
    }}
    StatusBar Link {{
        dock: right;
        margin-right: 1;
    }}
    """

    # displayed values; assigning to either one refreshes the matching widget
    running: reactive[bool] = reactive(True)
    time: reactive[float] = reactive(0)

    def __init__(self) -> None:
        super().__init__()

    def compose(self) -> ComposeResult:
        """Yield the label/value pairs for status and time."""
        yield Label("Status:", classes=self.LABEL_CLASS)
        yield Static("Running", id=self.STATUS_ID, classes=self.VALUE_CLASS)
        yield Label(" Time:", classes=self.LABEL_CLASS)
        yield Static("0:00:00", id=self.TIME_ID, classes=self.VALUE_CLASS)

    def watch_running(self, running: bool) -> None:
        """Show "Running" or "Stopped" in the status value."""
        status_value = cast(Static, self.query_one(f"#{self.STATUS_ID}"))
        if running:
            status_value.update("Running")
        else:
            status_value.update("Stopped")

    def watch_time(self, time: float) -> None:
        """Show the formatted time in the time value."""
        formatted = format_progress_time(time)
        time_value = cast(Static, self.query_one(f"#{self.TIME_ID}"))
        time_value.update(formatted)
|
215
|
+
|
216
|
+
|
217
|
+
class LoadingView(Container):
    """Placeholder view shown until the sandbox connection is available."""

    ID = "human-agent-loading-view"

    def __init__(self) -> None:
        # the fixed id lets the content switcher select this view
        super().__init__(id=self.ID)

    def compose(self) -> ComposeResult:
        """Yield a loading indicator plus a focusable button."""
        indicator = LoadingIndicator()
        # add focusable widget so the tab can activate
        focus_target = Button()
        yield indicator
        yield focus_target
|
226
|
+
|
227
|
+
|
228
|
+
class VSCodeLink(Link):
    """Link that executes a list of VS Code commands when clicked."""

    def __init__(
        self,
        text: str,
        *,
        url: str | None = None,
        tooltip: str | None = None,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
    ) -> None:
        """Create the link; all arguments are forwarded to ``Link`` unchanged."""
        link_options = {
            "url": url,
            "tooltip": tooltip,
            "name": name,
            "id": id,
            "classes": classes,
            "disabled": disabled,
        }
        super().__init__(text, **link_options)
        # commands to execute on click (assigned by the owner; empty by default)
        self.commands: list[VSCodeCommand] = []

    def on_click(self) -> None:
        """Execute the currently assigned commands."""
        pending = self.commands
        execute_vscode_commands(pending)
|
@@ -0,0 +1,45 @@
|
|
1
|
+
from inspect_ai.model import ModelOutput
|
2
|
+
from inspect_ai.util._sandbox import sandbox
|
3
|
+
from inspect_ai.util._sandbox.service import sandbox_service
|
4
|
+
|
5
|
+
from .._task_state import TaskState
|
6
|
+
from .commands.command import HumanAgentCommand
|
7
|
+
from .state import HumanAgentState
|
8
|
+
from .view import HumanAgentView
|
9
|
+
|
10
|
+
|
11
|
+
async def run_human_agent_service(
    state: TaskState, commands: list[HumanAgentCommand], view: HumanAgentView | None
) -> TaskState:
    """Run the ``human_agent`` sandbox service until an answer is recorded.

    Args:
        state: Task state; its message texts become the agent instructions
            and its output is set from the submitted answer.
        commands: Commands; those with ``"service"`` in their ``contexts``
            contribute a service method under their name.
        view: Optional view that receives the agent state on every
            completion check.

    Returns:
        The same ``state`` object, with ``output`` set when an answer exists.
    """
    # initialise agent state (instructions are the message texts joined by
    # blank lines)
    message_texts = (message.text for message in state.messages)
    agent_state = HumanAgentState(instructions="\n\n".join(message_texts).strip())

    # extract service methods from commands
    methods = {}
    for command in commands:
        if "service" in command.contexts:
            methods[command.name] = command.service(agent_state)

    # callback to check if task is completed (also used to periodically
    # update the view with the current state)
    def task_is_completed() -> bool:
        if view:
            view.update_state(agent_state)
        return agent_state.answer is not None

    # run the service
    await sandbox_service(
        name="human_agent",
        methods=methods,
        until=task_is_completed,
        sandbox=sandbox(),
    )

    # set the answer if we have one
    final_answer = agent_state.answer
    if final_answer is not None:
        state.output = ModelOutput.from_content("human_agent", final_answer)

    # return state
    return state
|