inspect-ai 0.3.55__py3-none-any.whl → 0.3.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. inspect_ai/__init__.py +1 -0
  2. inspect_ai/_cli/common.py +1 -1
  3. inspect_ai/_cli/trace.py +33 -20
  4. inspect_ai/_display/core/active.py +1 -1
  5. inspect_ai/_display/core/display.py +1 -1
  6. inspect_ai/_display/core/footer.py +1 -1
  7. inspect_ai/_display/core/progress.py +0 -6
  8. inspect_ai/_display/core/rich.py +1 -1
  9. inspect_ai/_display/rich/display.py +2 -2
  10. inspect_ai/_display/textual/app.py +15 -17
  11. inspect_ai/_display/textual/widgets/clock.py +3 -3
  12. inspect_ai/_display/textual/widgets/samples.py +6 -13
  13. inspect_ai/_eval/context.py +9 -1
  14. inspect_ai/_eval/score.py +4 -10
  15. inspect_ai/_eval/task/results.py +5 -4
  16. inspect_ai/_eval/task/run.py +6 -12
  17. inspect_ai/_eval/task/task.py +10 -0
  18. inspect_ai/_util/ansi.py +31 -0
  19. inspect_ai/_util/format.py +7 -0
  20. inspect_ai/_util/logger.py +12 -12
  21. inspect_ai/_util/throttle.py +10 -1
  22. inspect_ai/_util/trace.py +43 -47
  23. inspect_ai/_util/transcript.py +4 -0
  24. inspect_ai/_util/vscode.py +51 -0
  25. inspect_ai/_view/notify.py +2 -1
  26. inspect_ai/_view/www/App.css +22 -1
  27. inspect_ai/_view/www/dist/assets/index.css +2374 -2
  28. inspect_ai/_view/www/dist/assets/index.js +29622 -24424
  29. inspect_ai/_view/www/log-schema.json +138 -90
  30. inspect_ai/_view/www/package.json +1 -0
  31. inspect_ai/_view/www/src/App.mjs +1 -0
  32. inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
  33. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
  34. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
  35. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
  36. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
  37. inspect_ai/_view/www/src/components/Tools.mjs +11 -3
  38. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
  39. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
  40. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
  41. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
  42. inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
  43. inspect_ai/_view/www/src/types/log.d.ts +26 -12
  44. inspect_ai/_view/www/yarn.lock +44 -0
  45. inspect_ai/approval/_apply.py +4 -0
  46. inspect_ai/approval/_human/panel.py +5 -8
  47. inspect_ai/dataset/_dataset.py +51 -10
  48. inspect_ai/dataset/_util.py +31 -3
  49. inspect_ai/log/__init__.py +2 -0
  50. inspect_ai/log/_log.py +5 -2
  51. inspect_ai/model/_call_tools.py +4 -2
  52. inspect_ai/model/_chat_message.py +3 -0
  53. inspect_ai/model/_model.py +42 -1
  54. inspect_ai/model/_providers/anthropic.py +4 -0
  55. inspect_ai/model/_render.py +9 -2
  56. inspect_ai/scorer/_metric.py +12 -1
  57. inspect_ai/solver/__init__.py +2 -0
  58. inspect_ai/solver/_human_agent/agent.py +83 -0
  59. inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
  60. inspect_ai/solver/_human_agent/commands/clock.py +70 -0
  61. inspect_ai/solver/_human_agent/commands/command.py +59 -0
  62. inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
  63. inspect_ai/solver/_human_agent/commands/note.py +42 -0
  64. inspect_ai/solver/_human_agent/commands/score.py +80 -0
  65. inspect_ai/solver/_human_agent/commands/status.py +62 -0
  66. inspect_ai/solver/_human_agent/commands/submit.py +151 -0
  67. inspect_ai/solver/_human_agent/install.py +222 -0
  68. inspect_ai/solver/_human_agent/panel.py +252 -0
  69. inspect_ai/solver/_human_agent/service.py +45 -0
  70. inspect_ai/solver/_human_agent/state.py +55 -0
  71. inspect_ai/solver/_human_agent/view.py +24 -0
  72. inspect_ai/solver/_task_state.py +28 -2
  73. inspect_ai/tool/_tool.py +10 -2
  74. inspect_ai/tool/_tools/_web_browser/_web_browser.py +13 -10
  75. inspect_ai/util/__init__.py +8 -4
  76. inspect_ai/{_util/display.py → util/_display.py} +6 -0
  77. inspect_ai/util/_panel.py +31 -9
  78. inspect_ai/util/_sandbox/__init__.py +0 -3
  79. inspect_ai/util/_sandbox/context.py +5 -1
  80. inspect_ai/util/_sandbox/docker/compose.py +16 -10
  81. inspect_ai/util/_sandbox/docker/docker.py +9 -6
  82. inspect_ai/util/_sandbox/docker/internal.py +1 -1
  83. inspect_ai/util/_sandbox/docker/util.py +2 -2
  84. inspect_ai/util/_sandbox/environment.py +6 -5
  85. inspect_ai/util/_sandbox/local.py +1 -1
  86. inspect_ai/util/_sandbox/service.py +22 -7
  87. inspect_ai/util/_store.py +5 -6
  88. inspect_ai/util/_store_model.py +110 -0
  89. inspect_ai/util/_throttle.py +32 -0
  90. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/METADATA +1 -1
  91. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/RECORD +95 -73
  92. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/LICENSE +0 -0
  93. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/WHEEL +0 -0
  94. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/entry_points.txt +0 -0
  95. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,222 @@
1
+ import inspect
2
+ from textwrap import dedent
3
+
4
+ from inspect_ai.util import sandbox
5
+
6
+ from .._task_state import TaskState
7
+ from .commands.command import HumanAgentCommand
8
+
9
+ INSTALL_DIR = "human_agent_install"
10
+ HUMAN_AGENT_DIR = "/opt/human_agent"
11
+ TASK_PY = "task.py"
12
+ INSTALL_SH = "install.sh"
13
+ BASHRC = ".bashrc"
14
+ WELCOME_FILE = "welcome.txt"
15
+ WELCOME_LOGIN_FILE = "welcome_login.txt"
16
+ INSTRUCTIONS_FILE = "instructions.txt"
17
+ RECORD_SESSION_DIR = "/var/tmp/user-sessions"
18
+
19
+
20
async def install_human_agent(
    state: TaskState, commands: list[HumanAgentCommand], record_session: bool
) -> None:
    """Install the human agent command tooling into the current sandbox.

    A plain `mkdir` of HUMAN_AGENT_DIR doubles as the "already installed"
    probe: if that command does not succeed, nothing else is done.

    Args:
        state: Task state (not read by this function).
        commands: Commands used to generate task.py and the .bashrc snippet.
        record_session: Forwarded to the .bashrc generator.

    Raises:
        RuntimeError: If one of the checked installation steps fails.
    """
    # see if we have already installed
    probe = await sandbox().exec(["mkdir", HUMAN_AGENT_DIR])
    if not probe.success:
        return

    # setup installation directory
    await checked_exec(["mkdir", "-p", INSTALL_DIR])

    # each file is generated right before it is written (task.py, .bashrc,
    # then install.sh), all of them flagged executable
    staged_files = (
        (TASK_PY, lambda: human_agent_commands(commands)),
        (BASHRC, lambda: human_agent_bashrc(commands, record_session)),
        (INSTALL_SH, lambda: human_agent_install_sh()),
    )
    for filename, generate in staged_files:
        await checked_write_file(
            f"{INSTALL_DIR}/{filename}", generate(), executable=True
        )

    # run installation script, then remove the staging directory
    await checked_exec(["bash", f"./{INSTALL_SH}"], cwd=INSTALL_DIR)
    await checked_exec(["rm", "-rf", INSTALL_DIR])
43
+
44
+
45
def human_agent_commands(commands: list[HumanAgentCommand]) -> str:
    """Generate the Python source of the `task` command line script.

    The script is assembled from four sections: standard imports and
    helpers, one handler function per command (the source of each command's
    `cli` method, renamed to the command name), an argparse parser with one
    sub-parser per command, and an if/elif dispatch on the parsed command.

    Args:
        commands: Candidate commands; only those whose `contexts` include
            "cli" are emitted.

    Returns:
        Source code of the generated script.
    """
    # filter out hidden commands
    commands = [command for command in commands if "cli" in command.contexts]

    # standard imports (including any dependencies that call methods carry)
    imports = dedent("""
    import argparse
    import sys
    from argparse import Namespace
    from pathlib import Path

    sys.path.append("/var/tmp/sandbox-services/human_agent")
    from human_agent import call_human_agent

    def format_time(t):
        minutes, seconds = divmod(t, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:.0f}:{minutes:02.0f}:{seconds:02.0f}"
    """)

    # command handler source code (extracted from call methods)
    command_handlers = "\n\n".join(
        dedent(
            inspect.getsource(command.cli).replace("cli(self, ", f"{command.name}(", 1)
        )
        for command in commands
    )

    # parse commands
    # NOTE: help text is emitted with !r so that quotes, backslashes or
    # newlines inside a description cannot break the generated source.
    command_parsers: list[str] = []
    for command in commands:
        command_parsers.append(
            f"{command.name}_parser = subparsers.add_parser("
            f'"{command.name}", help={command.description!r})\n'
        )
        for arg in command.cli_args:
            if arg.name.startswith("--"):
                extras = 'action="store_true", default=False'
            elif arg.required:
                extras = "nargs=1"
            else:
                extras = 'nargs="?"'
            command_parsers.append(
                f'{command.name}_parser.add_argument("{arg.name}", {extras}, '
                f"help={arg.description!r})"
            )

    parse = (
        dedent("""
    parser = argparse.ArgumentParser(description="Human agent tools.")
    subparsers = parser.add_subparsers(dest="command")
    """)
        + "\n"
        + "\n".join(command_parsers)
    )

    # dispatch commands
    command_dispatchers: list[str] = []
    for i, command in enumerate(commands):
        conditional = "if" if i == 0 else "elif"
        command_dispatchers.append(
            f'{conditional} command == "{command.name}": {command.name}(args)'
        )
    if command_dispatchers:
        command_dispatchers.append("else: parser.print_help()")
    else:
        # no commands: a bare "else:" would be a syntax error in the script
        command_dispatchers.append("parser.print_help()")

    dispatch = dedent("""
    args = parser.parse_args()
    command = args.command
    delattr(args, 'command')
    """) + "\n".join(command_dispatchers)

    return "\n".join([imports, command_handlers, parse, dispatch]) + "\n"
117
+
118
+
119
def human_agent_bashrc(commands: list[HumanAgentCommand], record_session: bool) -> str:
    """Generate the shell snippet that is appended to the user's .bashrc.

    The snippet contains, in order: a guard that returns unless the shell is
    interactive and attached to a terminal, the `task` alias with its
    completion handler, an optional session recording section, and a section
    that prints the task instructions once.

    Args:
        commands: Commands; the names of those whose `contexts` include "cli"
            are offered as completions for `task`.
        record_session: When true, include the section that re-executes the
            shell under `script` to record the session.

    Returns:
        Text of the .bashrc snippet.
    """
    # only run in interactive terminals
    terminal_check = dedent("""

    ### Inspect Human Agent Setup #########################################=

    # only run if shell is interactive
    case $- in
        *i*) ;;
        *) return ;;
    esac

    # only run if attached to a terminal
    if ! tty -s; then
        return
    fi
    """)

    # shell alias and completions
    command_names = " ".join(
        command.name for command in commands if "cli" in command.contexts
    )
    # "${cur}" is quoted so the word being completed is passed to compgen
    # as a single argument (no word splitting or glob expansion)
    alias_and_completion = dedent(f"""
    # shell alias for human agent commands
    alias task='python3 {HUMAN_AGENT_DIR}/{TASK_PY}'

    # completion handler
    _task_completion() {{
        local cur
        cur="${{COMP_WORDS[COMP_CWORD]}}"
        if [ "$COMP_CWORD" -eq 1 ]; then
            local commands="{command_names}"

            # Generate completion matches
            COMPREPLY=($(compgen -W "${{commands}}" -- "${{cur}}"))
        fi
    }}
    complete -F _task_completion task
    """)

    # session recording (SCRIPT_RUNNING guards against re-entering `script`
    # from the login shell that it spawns)
    if record_session:
        recording = dedent(f"""
        # record human agent session transcript
        if [ -z "$SCRIPT_RUNNING" ]; then
            export SCRIPT_RUNNING=1
            LOGDIR={RECORD_SESSION_DIR}
            mkdir -p "$LOGDIR"
            TIMESTAMP=$(date +%Y%m%d_%H%M%S)
            INPUTFILE="$LOGDIR/$(whoami)_$TIMESTAMP.input"
            OUTPUTFILE="$LOGDIR/$(whoami)_$TIMESTAMP.output"
            TIMINGFILE="$LOGDIR/$(whoami)_$TIMESTAMP.timing"
            exec script -q -f -m advanced -I "$INPUTFILE" -O "$OUTPUTFILE" -T "$TIMINGFILE" -c "bash --login -i"
        fi
        """)
    else:
        recording = ""

    # display task instructions (only once per environment)
    instructions = dedent("""
    if [ -z "$INSTRUCTIONS_SHOWN" ]; then
        export INSTRUCTIONS_SHOWN=1
        task instructions > instructions.txt
        cat instructions.txt
    fi
    """).lstrip()

    # return .bashrc
    return "\n".join([terminal_check, alias_and_completion, recording, instructions])
188
+
189
+
190
def human_agent_install_sh() -> str:
    """Generate the installation script run from the staging directory.

    The script creates HUMAN_AGENT_DIR, copies the generated command script
    into it and appends the generated .bashrc snippet to `~/.bashrc`.

    Returns:
        Text of the bash installation script.
    """
    # The trailing backslash keeps the shebang on the very first line.
    # `set -e` makes the script stop (and exit non-zero) at the first failing
    # step; without it the exit status would only reflect the final `cat`.
    return dedent(f"""\
    #!/usr/bin/env bash
    set -e

    # create installation directory
    HUMAN_AGENT="{HUMAN_AGENT_DIR}"
    mkdir -p "$HUMAN_AGENT"

    # copy command script
    cp {TASK_PY} "$HUMAN_AGENT"

    # append to .bashrc
    cat {BASHRC} >> ~/{BASHRC}
    """)
204
+
205
+
206
async def checked_exec(
    cmd: list[str],
    input: str | bytes | None = None,
    cwd: str | None = None,
) -> str:
    """Run a command in the sandbox and return its stdout.

    Args:
        cmd: Command and arguments.
        input: Optional input passed through to the sandbox `exec` call.
        cwd: Optional working directory passed through to `exec`.

    Returns:
        Standard output of the command.

    Raises:
        RuntimeError: If the execution result is not marked successful; the
            message carries the command line and its stderr.
    """
    outcome = await sandbox().exec(cmd, input=input, cwd=cwd)
    if outcome.success:
        return outcome.stdout
    raise RuntimeError(f"Error executing command {' '.join(cmd)}: {outcome.stderr}")
215
+
216
+
217
async def checked_write_file(
    file: str, contents: str, executable: bool = False
) -> None:
    """Write a file in the sandbox by feeding `contents` to `tee`.

    Args:
        file: Destination path.
        contents: Text written to the file.
        executable: When true, follow up with `chmod +x` on the file.

    Raises:
        RuntimeError: If either command fails (raised by `checked_exec`).
    """
    await checked_exec(["tee", "--", file], input=contents)
    if not executable:
        return
    await checked_exec(["chmod", "+x", file])
@@ -0,0 +1,252 @@
1
+ from typing import cast
2
+
3
+ from textual.app import ComposeResult
4
+ from textual.containers import (
5
+ Container,
6
+ Horizontal,
7
+ VerticalScroll,
8
+ )
9
+ from textual.reactive import reactive
10
+ from textual.widgets import (
11
+ Button,
12
+ ContentSwitcher,
13
+ Label,
14
+ Link,
15
+ LoadingIndicator,
16
+ Static,
17
+ )
18
+
19
+ from inspect_ai._util.format import format_progress_time
20
+ from inspect_ai._util.vscode import (
21
+ VSCodeCommand,
22
+ can_execute_vscode_commands,
23
+ execute_vscode_commands,
24
+ )
25
+ from inspect_ai.util import InputPanel, SandboxConnection, throttle
26
+
27
+ from .state import HumanAgentState
28
+
29
+
30
class HumanAgentPanel(InputPanel):
    """Input panel for the human agent.

    A loading view is shown until a sandbox connection is assigned; after
    that the panel shows a status bar, login instructions, VS Code login
    links and the raw login command.
    """

    DEFAULT_TITLE = "Human Agent"

    # widget ids
    SANDBOX_VIEW_ID = "human-agent-sandbox-view"
    SANDBOX_INSTRUCTIONS_ID = "sandbox-instructions"
    VSCODE_LINKS_ID = "vscode-links"
    LOGIN_VSCODE_TERMINAL_ID = "login-vscode-terminal"
    LOGIN_VSCODE_WINDOW_ID = "login-vscode-window"
    COMMAND_INSTRUCTIONS_ID = "command-instructions"
    SANDBOX_COMMAND_ID = "sandbox-command"

    # css classes
    INSTRUCTIONS_CLASS = "instructions"
    LINK_LABEL_CLASS = "link-label"

    DEFAULT_CSS = f"""
    #{SANDBOX_VIEW_ID} {{
        scrollbar-size-vertical: 1;
    }}
    HumanAgentPanel .{INSTRUCTIONS_CLASS} {{
        color: $text-muted;
        margin-bottom: 1;
    }}
    #{SANDBOX_COMMAND_ID} {{
        color: $secondary;
    }}
    HumanAgentPanel .{LINK_LABEL_CLASS} {{
        color: $text-muted;
    }}
    HumanAgentPanel VSCodeLink {{
        margin-left: 1;
        margin-right: 2;
    }}
    HumanAgentPanel #{VSCODE_LINKS_ID} {{
        height: 1;
        margin-bottom: 1;
    }}
    """

    # assigning a connection triggers watch_connection
    connection: reactive[SandboxConnection | None] = reactive(None)

    # implement HumanAgentView
    def connect(self, connection: SandboxConnection) -> None:
        """Record the sandbox connection to display."""
        self.connection = connection

    # NOTE(review): throttle(1) presumably limits how often this runs
    # (once per second?) -- confirm against inspect_ai.util.throttle
    @throttle(1)
    def update_state(self, state: HumanAgentState) -> None:
        """Copy the running flag and elapsed time onto the status bar."""
        bar = self.query_one(StatusBar)
        bar.running = state.running
        bar.time = state.time

    def compose(self) -> ComposeResult:
        """Yield the loading view and the (initially hidden) sandbox view."""
        instructions_class = self.INSTRUCTIONS_CLASS
        link_label_class = self.LINK_LABEL_CLASS

        with ContentSwitcher(initial=LoadingView.ID):
            yield LoadingView()
            with VerticalScroll(id=self.SANDBOX_VIEW_ID):
                yield StatusBar()
                yield Static(
                    id=self.SANDBOX_INSTRUCTIONS_ID, classes=instructions_class
                )
                with Horizontal(id=self.VSCODE_LINKS_ID):
                    yield Label("Login:", classes=link_label_class)
                    yield VSCodeLink("VS Code Window", id=self.LOGIN_VSCODE_WINDOW_ID)
                    yield Label("Login:", classes=link_label_class)
                    yield VSCodeLink(
                        "VS Code Terminal", id=self.LOGIN_VSCODE_TERMINAL_ID
                    )
                yield Static(
                    id=self.COMMAND_INSTRUCTIONS_ID, classes=instructions_class
                )
                yield Static(id=self.SANDBOX_COMMAND_ID)

    def watch_connection(self, connection: SandboxConnection | None) -> None:
        """Populate the sandbox view once a connection is available."""
        if not connection:
            return

        # switch to sandbox view
        self.query_one(ContentSwitcher).current = self.SANDBOX_VIEW_ID

        # note whether we are in vscode
        in_vscode = can_execute_vscode_commands()

        # suffix for instructions based on whether we are in vscode
        instructions_command = "Login to the system with the following command (hold down Alt or Option to select text for copying):"
        instructions_vscode = (
            "Use the links below to login to the system within VS Code:"
        )

        # update instructions
        intro = cast(Static, self.query_one(f"#{self.SANDBOX_INSTRUCTIONS_ID}"))
        intro_text = f"""You are completing a task on a Linux system (task instructions will be presented when you login). {instructions_vscode if in_vscode else instructions_command}"""
        intro.update(intro_text)

        # login: vscode terminal (link row is only displayed inside vscode)
        link_row = self.query_one(f"#{self.VSCODE_LINKS_ID}")
        link_row.display = in_vscode
        terminal_link = cast(
            VSCodeLink, self.query_one(f"#{self.LOGIN_VSCODE_TERMINAL_ID}")
        )
        open_terminal = VSCodeCommand(
            command="workbench.action.terminal.new", args=[{"location": 2}]
        )
        send_login = VSCodeCommand(
            command="workbench.action.terminal.sendSequence",
            args=[{"text": f"{connection.command}\n"}],
        )
        terminal_link.commands = [open_terminal, send_login]

        # login: vscode window (commands stay empty without a vscode_command)
        window_link = cast(
            VSCodeLink, self.query_one(f"#{self.LOGIN_VSCODE_WINDOW_ID}")
        )
        if connection.vscode_command is not None:
            window_link.commands = [
                VSCodeCommand(
                    command=connection.vscode_command[0],
                    args=connection.vscode_command[1:],
                )
            ]

        # command (always available); the extra "Alternatively" line is only
        # displayed when the vscode links are displayed as well
        alternative = cast(Static, self.query_one(f"#{self.COMMAND_INSTRUCTIONS_ID}"))
        alternative.display = in_vscode
        alternative.update(
            instructions_command.replace("Login", "Alternatively, login", 1)
        )
        command_text = cast(Static, self.query_one(f"#{self.SANDBOX_COMMAND_ID}"))
        command_text.update(connection.command)
163
+
164
+
165
class StatusBar(Horizontal):
    """One-line bar showing the task status and the elapsed time."""

    STATUS_ID = "task-status"
    TIME_ID = "task-time"

    LABEL_CLASS = "status-label"
    VALUE_CLASS = "status-value"

    DEFAULT_CSS = f"""
    StatusBar {{
        width: 1fr;
        height: 1;
        background: $surface;
        margin-bottom: 1;
        layout: grid;
        grid-size: 4 1;
        grid-columns: auto auto auto auto;
        grid-gutter: 1;
    }}
    .{LABEL_CLASS} {{
        color: $primary;
    }}
    .{VALUE_CLASS} {{
        color: $foreground;
    }}
    StatusBar Link {{
        dock: right;
        margin-right: 1;
    }}
    """

    # changes are rendered by watch_running / watch_time
    running: reactive[bool] = reactive(True)
    time: reactive[float] = reactive(0)

    def __init__(self) -> None:
        super().__init__()

    def compose(self) -> ComposeResult:
        """Yield label/value pairs for status and time."""
        label_class, value_class = self.LABEL_CLASS, self.VALUE_CLASS
        yield Label("Status:", classes=label_class)
        yield Static("Running", id=self.STATUS_ID, classes=value_class)
        yield Label(" Time:", classes=label_class)
        yield Static("0:00:00", id=self.TIME_ID, classes=value_class)

    def watch_running(self, running: bool) -> None:
        """Show "Running" or "Stopped" in the status value."""
        status_text = "Running" if running else "Stopped"
        status_value = cast(Static, self.query_one(f"#{self.STATUS_ID}"))
        status_value.update(status_text)

    def watch_time(self, time: float) -> None:
        """Show the formatted time in the time value."""
        time_value = cast(Static, self.query_one(f"#{self.TIME_ID}"))
        time_value.update(format_progress_time(time))
215
+
216
+
217
class LoadingView(Container):
    """Placeholder view made of a loading indicator and a button."""

    ID = "human-agent-loading-view"

    def __init__(self) -> None:
        super().__init__(id=self.ID)

    def compose(self) -> ComposeResult:
        """Yield the loading indicator followed by a focusable button."""
        yield LoadingIndicator()
        # add focusable widget so the tab can activate
        yield Button()
226
+
227
+
228
class VSCodeLink(Link):
    """Link that executes a list of VS Code commands when clicked."""

    def __init__(
        self,
        text: str,
        *,
        url: str | None = None,
        tooltip: str | None = None,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
    ) -> None:
        """Forward all arguments to `Link` and start with no commands."""
        link_options = dict(
            url=url,
            tooltip=tooltip,
            name=name,
            id=id,
            classes=classes,
            disabled=disabled,
        )
        super().__init__(text, **link_options)

        # commands to execute on click (assigned by the owner of the link)
        self.commands: list[VSCodeCommand] = []

    def on_click(self) -> None:
        """Execute the configured commands."""
        execute_vscode_commands(self.commands)
@@ -0,0 +1,45 @@
1
+ from inspect_ai.model import ModelOutput
2
+ from inspect_ai.util._sandbox import sandbox
3
+ from inspect_ai.util._sandbox.service import sandbox_service
4
+
5
+ from .._task_state import TaskState
6
+ from .commands.command import HumanAgentCommand
7
+ from .state import HumanAgentState
8
+ from .view import HumanAgentView
9
+
10
+
11
async def run_human_agent_service(
    state: TaskState, commands: list[HumanAgentCommand], view: HumanAgentView | None
) -> TaskState:
    """Run the "human_agent" sandbox service until an answer is submitted.

    Args:
        state: Task state; the text of its messages becomes the instructions
            and its output is set from the submitted answer.
        commands: Commands; those whose `contexts` include "service"
            contribute a service method under their name.
        view: Optional view that is handed the agent state each time the
            completion callback is invoked.

    Returns:
        The task state that was passed in.
    """
    # initialise agent state
    message_texts = (message.text for message in state.messages)
    agent_state = HumanAgentState(instructions="\n\n".join(message_texts).strip())

    # extract service methods from commands
    methods = {}
    for command in commands:
        if "service" in command.contexts:
            methods[command.name] = command.service(agent_state)

    # completion callback; also used to push the current state to the view
    def task_is_completed() -> bool:
        if view:
            view.update_state(agent_state)
        return agent_state.answer is not None

    # run the service
    await sandbox_service(
        name="human_agent",
        methods=methods,
        until=task_is_completed,
        sandbox=sandbox(),
    )

    # set the answer if we have one
    answer = agent_state.answer
    if answer is not None:
        state.output = ModelOutput.from_content("human_agent", answer)

    return state
@@ -0,0 +1,55 @@
1
+ from time import time as current_time
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from inspect_ai.scorer._metric import Score
6
+ from inspect_ai.util._store_model import StoreModel
7
+
8
+
9
class IntermediateScoring(BaseModel):
    """A list of scores together with the time value they belong to."""

    # NOTE(review): presumably the task time at which scoring happened -- confirm
    time: float

    # scores recorded for this entry
    scores: list[Score]
12
+
13
+
14
class HumanAgentState(StoreModel):
    """State of a human agent task (instructions, clock, scorings, answer)."""

    instructions: str
    """Task instructions."""

    @property
    def running(self) -> bool:
        """Is the task currently running?"""
        return self.running_state

    @running.setter
    def running(self, running: bool) -> None:
        """Set current running state."""
        was_running = self.running_state

        if running and not was_running:
            # flipping to running mode: remember when we started
            self.started_running = current_time()
        elif was_running and not running:
            # exiting running mode: fold the elapsed time into the total
            # (self.time still includes the current run at this point)
            self.accumulated_time = self.time

        # update running
        self.running_state = running

    @property
    def time(self) -> float:
        """Total time spent on task."""
        if self.running:
            current_run = current_time() - self.started_running
        else:
            current_run = 0
        return self.accumulated_time + current_run

    scorings: list[IntermediateScoring] = Field(default_factory=list)
    """Intermediate scorings yielded by `task score`"""

    answer: str | None = Field(default=None)
    """Final answer provided in `task submit`"""

    logs: dict[str, str] = Field(default_factory=dict)
    """Session logs generated by `script` """

    # internal state variables used by running and time properties
    running_state: bool = Field(default=True)
    started_running: float = Field(default_factory=current_time)
    accumulated_time: float = Field(default=0.0)
@@ -0,0 +1,24 @@
1
+ from typing import Protocol
2
+
3
+ from inspect_ai.util import SandboxConnection
4
+
5
+ from .state import HumanAgentState
6
+
7
+
8
class HumanAgentView(Protocol):
    """Interface of the views that present a human agent task."""

    def connect(self, connection: SandboxConnection) -> None:
        """Receive the sandbox connection to present."""
        ...

    def update_state(self, state: HumanAgentState) -> None:
        """Receive the current agent state."""
        ...
11
+
12
+
13
class ConsoleView(HumanAgentView):
    """Fallback view for when we aren't running fullscreen UI."""

    def connect(self, connection: SandboxConnection) -> None:
        """Print login instructions followed by the login command."""
        intro = (
            "You are completing a task on a Linux system (task instructions will be presented "
            "when you login). Login to the system with the following command:\n"
        )
        print(intro)
        print(f"{connection.command}\n")

    def update_state(self, state: HumanAgentState) -> None:
        """Do nothing; the console view does not render state updates."""
@@ -3,11 +3,11 @@ from contextvars import ContextVar
3
3
  from copy import deepcopy
4
4
  from dataclasses import dataclass
5
5
  from random import Random
6
- from typing import Any, Union, cast, overload
6
+ from typing import Any, Type, Union, cast, overload
7
7
 
8
8
  from pydantic_core import to_jsonable_python
9
9
 
10
- from inspect_ai.dataset._dataset import Sample
10
+ from inspect_ai.dataset._dataset import MT, Sample, metadata_as
11
11
  from inspect_ai.model import (
12
12
  ChatMessage,
13
13
  ChatMessageUser,
@@ -19,6 +19,7 @@ from inspect_ai.model._model import sample_total_tokens
19
19
  from inspect_ai.tool import Tool, ToolChoice
20
20
  from inspect_ai.tool._tool_def import ToolDef
21
21
  from inspect_ai.util._store import Store, store_jsonable
22
+ from inspect_ai.util._store_model import SMT
22
23
 
23
24
 
24
25
  @dataclass
@@ -349,6 +350,31 @@ class TaskState:
349
350
  for tool in tools:
350
351
  self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool())
351
352
 
353
def metadata_as(self, metadata_cls: Type[MT]) -> MT:
    """Pydantic model interface to metadata.

    Args:
        metadata_cls: Pydantic model type

    Returns:
        BaseModel: Result of the module-level `metadata_as` helper applied
            to this state's metadata and `metadata_cls`.

    Raises:
        ValueError: If the state has no (or empty) metadata.
    """
    metadata = self.metadata
    if metadata:
        return metadata_as(metadata, metadata_cls)

    raise ValueError("Sample does not have metadata")
366
+
367
def store_as(self, model_cls: Type[SMT]) -> SMT:
    """Pydantic model interface to the store.

    Args:
        model_cls: Pydantic model type (must derive from StoreModel)

    Returns:
        StoreModel: Instance of model_cls constructed with `store` set to
            this state's store.
    """
    bound_model = model_cls(store=self.store)
    return bound_model
377
+
352
378
 
353
379
  def sample_state() -> TaskState | None:
354
380
  return _sample_state.get(None)
inspect_ai/tool/_tool.py CHANGED
@@ -11,6 +11,7 @@ from typing import (
11
11
  runtime_checkable,
12
12
  )
13
13
 
14
+ from inspect_ai._util.content import ContentImage, ContentText
14
15
  from inspect_ai._util.registry import (
15
16
  RegistryInfo,
16
17
  registry_add,
@@ -18,13 +19,20 @@ from inspect_ai._util.registry import (
18
19
  registry_tag,
19
20
  )
20
21
 
21
- from . import Content
22
22
  from ._tool_call import ToolCallViewer
23
23
 
24
24
  logger = getLogger(__name__)
25
25
 
26
26
 
27
- ToolResult = str | int | float | bool | list[Content]
27
+ ToolResult = (
28
+ str
29
+ | int
30
+ | float
31
+ | bool
32
+ | ContentText
33
+ | ContentImage
34
+ | list[ContentText | ContentImage]
35
+ )
28
36
 
29
37
 
30
38
  class ToolError(Exception):