inspect-eval-utils 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_eval_utils-0.4.0/.gitignore +9 -0
- inspect_eval_utils-0.4.0/LICENSE +21 -0
- inspect_eval_utils-0.4.0/PKG-INFO +521 -0
- inspect_eval_utils-0.4.0/README.md +496 -0
- inspect_eval_utils-0.4.0/pyproject.toml +92 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/__init__.py +1 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_cli.py +86 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_detect.py +156 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_templates/default/pyproject.toml +16 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/__init__.py +6 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/_registry.py +3 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/assets/instructions.md +6 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/py.typed +0 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/Dockerfile +14 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/compose.yaml +12 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/task.py +51 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/version.py +1 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/common/__init__.py +31 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/common/sandbox_files.py +153 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/common/task_secrets.py +154 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/py.typed +0 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/report/__init__.py +25 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/report/assets/InstrumentSans.ttf +0 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/report/assets/OFL.txt +93 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/report/cost.py +23 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/report/events.py +62 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/report/html.py +86 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/report/plot.py +219 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/report/writer.py +68 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/scaffolder.py +509 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/setting/__init__.py +23 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/setting/_context.py +50 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/setting/_types.py +104 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/setting/_utils.py +64 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/tool_cli/__init__.py +19 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/tool_cli/_mechanism.py +715 -0
- inspect_eval_utils-0.4.0/src/inspect_eval_utils/tool_cli/_setting.py +55 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 METR
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,521 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: inspect-eval-utils
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Shared utilities for METR Inspect AI eval repos: task scaffolder + common runtime helpers.
|
|
5
|
+
Project-URL: Repository, https://github.com/METR/inspect-eval-utils
|
|
6
|
+
Project-URL: Issues, https://github.com/METR/inspect-eval-utils/issues
|
|
7
|
+
Author-email: METR <rasmus.faber-espensen@metr.org>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Requires-Python: >=3.13
|
|
16
|
+
Requires-Dist: boto3>=1.40
|
|
17
|
+
Requires-Dist: inspect-ai>=0.3.200
|
|
18
|
+
Requires-Dist: jinja2>=3.0
|
|
19
|
+
Requires-Dist: libcst>=1.5
|
|
20
|
+
Requires-Dist: tomlkit>=0.13
|
|
21
|
+
Provides-Extra: report
|
|
22
|
+
Requires-Dist: matplotlib>=3.8; extra == 'report'
|
|
23
|
+
Requires-Dist: universal-pathlib>=0.2; extra == 'report'
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# inspect-eval-utils
|
|
27
|
+
|
|
28
|
+
Shared utilities for METR Inspect AI eval repos -- used by both task authors
|
|
29
|
+
and agent scaffolding:
|
|
30
|
+
|
|
31
|
+
- `inspect_eval_utils.setting`: the `Setting` protocol that lets tasks declare
|
|
32
|
+
what they need from agent scaffolding (workspaces, tools, callbacks,
|
|
33
|
+
environment features). Imported by *both* tasks and the scaffolding that
|
|
34
|
+
consumes them.
|
|
35
|
+
- `new_task` CLI: scaffold a new Inspect AI task into any compatible repo.
|
|
36
|
+
- `inspect_eval_utils.common`: runtime helpers for tasks (`get_sandbox_files`,
|
|
37
|
+
`expand_template`, `load_text_file`, etc.).
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
### Install (recommended)
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
uv tool install inspect-eval-utils
|
|
45
|
+
new_task my_eval
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### For one-off use without installing
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
uvx --from inspect-eval-utils new_task my_eval
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
## Shared task secrets
|
|
56
|
+
|
|
57
|
+
Updated evals can read shared task secrets directly while still working in old
|
|
58
|
+
Hawk and local workflows. Use `get_task_secret()` for values that might come
|
|
59
|
+
from either an environment variable or AWS Secrets Manager:
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from inspect_eval_utils.common import get_task_secret
|
|
63
|
+
|
|
64
|
+
hf_token = get_task_secret("HF_TOKEN")
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Lookup order is:
|
|
68
|
+
|
|
69
|
+
1. Return the environment variable named `HF_TOKEN` if it is set.
|
|
70
|
+
2. Otherwise fetch the AWS Secrets Manager secret
|
|
71
|
+
`${INSPECT_TASK_SECRETS_DEFAULT_ARN_PREFIX}HF_TOKEN`.
|
|
72
|
+
|
|
73
|
+
The secret name suffix is verbatim, so `HF_TOKEN` maps to
|
|
74
|
+
`inspect-tasks/HF_TOKEN`, not a lowercased variant. In normal Hawk runs and
|
|
75
|
+
`hawk local`, the default prefix is provided through
|
|
76
|
+
`INSPECT_TASK_SECRETS_DEFAULT_ARN_PREFIX`. The prefix must include the trailing
|
|
77
|
+
slash, for example
|
|
78
|
+
`arn:aws:secretsmanager:us-west-2:123456789012:secret:inspect-tasks/`. When
|
|
79
|
+
running `inspect eval` directly, set that variable yourself or pass `arn=` to
|
|
80
|
+
`get_task_secret()`.
|
|
81
|
+
|
|
82
|
+
## Setting protocol
|
|
83
|
+
|
|
84
|
+
`Setting` is the contract a task publishes to agent scaffolding. It answers
|
|
85
|
+
one question: *what does the agent need to operate on this task?* The
|
|
86
|
+
scaffolding reads a Setting and wires up the agent accordingly. Neither side
|
|
87
|
+
needs to know the other's internals.
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from inspect_eval_utils.setting import Setting, Workspace, Features
|
|
91
|
+
|
|
92
|
+
Setting(
|
|
93
|
+
workspaces=(Workspace(name="default", description="Your working environment"),),
|
|
94
|
+
tools=(check_flag(),),
|
|
95
|
+
features=Features(internet=True),
|
|
96
|
+
)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Concepts
|
|
100
|
+
|
|
101
|
+
#### What a Workspace is (and isn't)
|
|
102
|
+
|
|
103
|
+
A Workspace is like an SSH login handed to the agent. It names a sandbox to
|
|
104
|
+
which the agent should have direct shell and file access. For each workspace,
|
|
105
|
+
the scaffolding should create a new instance of each of its normal environment
|
|
106
|
+
interaction tools (e.g. a `bash` and a `python` tool) that is bound to that
|
|
107
|
+
sandbox.
|
|
108
|
+
|
|
109
|
+
**Not every sandbox is a Workspace.** A CTF task might have three containers --
|
|
110
|
+
an attacker box, a target web server, and a database. Only the attacker box is a
|
|
111
|
+
Workspace. The target and database are infrastructure; the agent reaches them
|
|
112
|
+
over the network or through task tools. By leaving them out of `workspaces`, the
|
|
113
|
+
task hides them from the agent by design.
|
|
114
|
+
|
|
115
|
+
> If a human would SSH into it, it's a Workspace. If the agent attacks it over
|
|
116
|
+
> the network, it's not.
|
|
117
|
+
|
|
118
|
+
#### Setting is exhaustive
|
|
119
|
+
|
|
120
|
+
When a Setting is present, it is authoritative. **Empty `workspaces` means no
|
|
121
|
+
bash/python tools.** A task that wants shell access *and* custom tools must
|
|
122
|
+
declare both:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
# Wrong -- custom tool but no shell access
|
|
126
|
+
Setting(tools=(my_tool(),))
|
|
127
|
+
|
|
128
|
+
# Right -- explicit about both
|
|
129
|
+
Setting(workspaces=(Workspace(),), tools=(my_tool(),))
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
A pure-API task (call an endpoint, evaluate the result) genuinely has no
|
|
133
|
+
workspace. Scaffolding that silently adds shell tools would undermine that
|
|
134
|
+
constraint.
|
|
135
|
+
|
|
136
|
+
#### Three layers of tools
|
|
137
|
+
|
|
138
|
+
The agent's tool surface has three distinct origins:
|
|
139
|
+
|
|
140
|
+
| Layer | Source | Examples |
|
|
141
|
+
|---|---|---|
|
|
142
|
+
| Task tools | `Setting.tools` | `check_flag`, `submit_image` |
|
|
143
|
+
| Workspace tools | Scaffolding, per workspace | `bash`, `python` |
|
|
144
|
+
| Framework tools | Scaffolding's own concerns | `set_timeout`, `submit` |
|
|
145
|
+
|
|
146
|
+
The task owns the first layer. The scaffolding owns the other two.
|
|
147
|
+
|
|
148
|
+
#### Per-turn callbacks: `on_turn` and `monitor`
|
|
149
|
+
|
|
150
|
+
Some tasks need to do work between agent turns: check progress, advance a
|
|
151
|
+
simulated clock, deliver a queued message, log score evolution. `Setting`
|
|
152
|
+
exposes two callbacks for this, which differ in *who decides when they fire*
|
|
153
|
+
and *whether they can steer the agent*.
|
|
154
|
+
|
|
155
|
+
**`on_turn`** runs at the start of every agent-loop iteration, before the model
|
|
156
|
+
generates. The task author owns the cadence: it's guaranteed to fire once per
|
|
157
|
+
turn. It can steer the loop by what it returns:
|
|
158
|
+
|
|
159
|
+
- `False` -- stop the loop (task is over: solved, irretrievably failed, time up)
|
|
160
|
+
- `str` -- inject this string as a user message before the next generation
|
|
161
|
+
(for example: "you have a new email", "10 simulated minutes passed")
|
|
162
|
+
- `None` / `True` -- proceed normally
|
|
163
|
+
|
|
164
|
+
This is useful when the task model needs to *react to the turn happening*: a
|
|
165
|
+
mailbox task that surfaces new messages, a clock-driven simulation that ticks
|
|
166
|
+
on each step, an end-condition check that the task wants to evaluate before
|
|
167
|
+
spending another model call.
|
|
168
|
+
|
|
169
|
+
**`monitor`** is observational. It returns `None` and cannot steer the loop.
|
|
170
|
+
The scaffolding decides when to call it -- typically at turn boundaries for
|
|
171
|
+
LLM agents, or on a wall-clock schedule for human/Claude-Code style agents
|
|
172
|
+
where there are no clear turns. Use it for things that should run regardless
|
|
173
|
+
of agent type and where missing a tick (or getting an extra one) is fine:
|
|
174
|
+
periodic score logging, transcript annotations, sandbox health checks.
|
|
175
|
+
|
|
176
|
+
Rule of thumb: if the task needs to *control* what happens next, use
|
|
177
|
+
`on_turn`. If it just needs to *watch*, use `monitor`.
|
|
178
|
+
|
|
179
|
+
#### Features vs. tools
|
|
180
|
+
|
|
181
|
+
`Features` are boolean flags about the *environment* -- `vision`, `internet`.
|
|
182
|
+
They tell scaffolding "this task involves images" or "this environment has
|
|
183
|
+
network access." The scaffolding responds by providing generic tools
|
|
184
|
+
(`view_image`, web search) if the model supports them. If the scaffolding
|
|
185
|
+
doesn't support a feature, the task still runs -- scores reflect the outcome.
|
|
186
|
+
|
|
187
|
+
`Setting.tools` is the other side of the split: tools that belong to the
|
|
188
|
+
*task*. Think of an agent on a task as a carpenter on a job site: hammer,
|
|
189
|
+
saw, and screwdriver belong to the carpenter; walls, doors, and windows
|
|
190
|
+
belong to the house. The carpenter operates on all six, but ownership is
|
|
191
|
+
clean. Scaffolding tools (`bash`, `view_image`, web search) are the
|
|
192
|
+
carpenter's kit, lit up by `Features` when the task says what kind of job
|
|
193
|
+
this is. `Setting.tools` (`check_flag`, `make_move`, `submit_design`) ship
|
|
194
|
+
with the task itself -- outside it, they're meaningless.
|
|
195
|
+
|
|
196
|
+
The test:
|
|
197
|
+
|
|
198
|
+
> Would this tool still make sense on a different task? If yes -- gate it on
|
|
199
|
+
> a Feature. If no -- it belongs in `Setting.tools`.
|
|
200
|
+
|
|
201
|
+
### For task authors
|
|
202
|
+
|
|
203
|
+
#### Declaring a task environment
|
|
204
|
+
|
|
205
|
+
Construct a `Setting` and pass it to `use_setting()` in your task's setup:
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
from inspect_eval_utils.setting import Setting, Workspace, Features, use_setting
|
|
209
|
+
|
|
210
|
+
Task(
|
|
211
|
+
setup=use_setting(Setting(
|
|
212
|
+
workspaces=(Workspace(name="default", user="agent"),),
|
|
213
|
+
tools=(check_flag(),),
|
|
214
|
+
on_turn=my_callback,
|
|
215
|
+
features=Features(vision=True),
|
|
216
|
+
)),
|
|
217
|
+
solver=my_agent(),
|
|
218
|
+
)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
`use_setting` also accepts a factory for per-sample Settings:
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
use_setting(lambda sample: Setting(
|
|
225
|
+
workspaces=(Workspace(name="default", user=sample.metadata["user"]),),
|
|
226
|
+
))
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
#### Examples
|
|
230
|
+
|
|
231
|
+
**Simple coding task.** One workspace, no extras.
|
|
232
|
+
```python
|
|
233
|
+
Setting(workspaces=(Workspace(name="dev"),))
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
**CTF task.** Attacker workspace, a scoring tool, no internet. Target machine is
|
|
237
|
+
NOT listed -- it's infrastructure.
|
|
238
|
+
```python
|
|
239
|
+
Setting(
|
|
240
|
+
workspaces=(Workspace(name="attacker", description="Your attack machine", user="hacker"),),
|
|
241
|
+
tools=(check_flag(),),
|
|
242
|
+
)
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
**Creative task with vision.** Workspace for building, vision enabled so
|
|
246
|
+
scaffolding provides image viewing.
|
|
247
|
+
```python
|
|
248
|
+
Setting(
|
|
249
|
+
workspaces=(Workspace(name="default", user="agent"),),
|
|
250
|
+
features=Features(vision=True),
|
|
251
|
+
)
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
**Pure-API task.** No workspace, just a custom tool.
|
|
255
|
+
```python
|
|
256
|
+
Setting(tools=(call_api(),))
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
**Dynamic tools via ToolSource.** When the available tools depend on task state
|
|
260
|
+
(e.g. a game where legal moves change each turn), use a `ToolSource`:
|
|
261
|
+
```python
|
|
262
|
+
class GameToolSource(ToolSource):
|
|
263
|
+
async def tools(self) -> list[Tool]:
|
|
264
|
+
return [move for move in legal_moves()]
|
|
265
|
+
|
|
266
|
+
Setting(tools=(GameToolSource(),))
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
Scaffolding calls `tools()` before each generation, so the set stays current.
|
|
270
|
+
|
|
271
|
+
When `inspect_eval_utils.tool_cli.setting_tool_cli_running()` exposes these tools
|
|
272
|
+
inside a sandbox, `ToolSource` is resolved dynamically at CLI invocation time.
|
|
273
|
+
Inside the sandbox, invoke them with the `tools` command:
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
tools list
|
|
277
|
+
tools describe <tool-name>
|
|
278
|
+
tools call <tool-name> [args...]
|
|
279
|
+
tools <tool-name> [args...]
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
`tools <tool-name>` is shorthand for `tools call <tool-name>`. If a tool name
|
|
283
|
+
conflicts with a built-in command such as `list`, `describe`, or `call`, use
|
|
284
|
+
`tools call <tool-name>`.
|
|
285
|
+
|
|
286
|
+
For schemas that are awkward to express as shell flags, pass a JSON object:
|
|
287
|
+
|
|
288
|
+
```bash
|
|
289
|
+
tools call <tool-name> --json-args '{"arg": "value"}'
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
The CLI keeps a short-lived cache for list/help/completion metadata, but tool calls
|
|
293
|
+
refresh the current `ToolSource` before execution.
|
|
294
|
+
|
|
295
|
+
#### Common mistakes
|
|
296
|
+
|
|
297
|
+
- **Listing infrastructure sandboxes as Workspaces.** Only list sandboxes the
|
|
298
|
+
agent needs direct shell/file access to. Targets, databases, and services
|
|
299
|
+
should be omitted.
|
|
300
|
+
- **Assuming empty `workspaces` means "use defaults."** It means no workspaces.
|
|
301
|
+
The agent gets no bash/python.
|
|
302
|
+
- **Putting generic capabilities in `Setting.tools`.** Tools like `view_image`
|
|
303
|
+
are scaffolding concerns gated on Features, not task tools.
|
|
304
|
+
|
|
305
|
+
### For scaffolding developers
|
|
306
|
+
|
|
307
|
+
#### Reading the Setting
|
|
308
|
+
|
|
309
|
+
The Setting lives in a `ContextVar`, set per-sample by `use_setting()`. When
|
|
310
|
+
`setting()` returns `None`, the task predates this protocol -- scaffolding must
|
|
311
|
+
remain functional without it.
|
|
312
|
+
|
|
313
|
+
```python
|
|
314
|
+
from inspect_eval_utils.setting import setting
|
|
315
|
+
|
|
316
|
+
s = setting() # returns Setting | None
|
|
317
|
+
if s is not None:
|
|
318
|
+
# Use Setting-aware tool creation
|
|
319
|
+
tools.extend(s.tools)
|
|
320
|
+
else:
|
|
321
|
+
pass  # Fall back to existing behavior
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
#### Creating tools from workspaces
|
|
325
|
+
|
|
326
|
+
Each Workspace declares a sandbox name and user. The scaffolding creates
|
|
327
|
+
whatever tools it wants for each workspace:
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
for ws in s.workspaces:
|
|
331
|
+
tools.append(bash(sandbox=ws.name, user=ws.user, timeout=timeout))
|
|
332
|
+
tools.append(python(sandbox=ws.name, user=ws.user, timeout=timeout))
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
#### Handling on_turn callbacks
|
|
336
|
+
|
|
337
|
+
Call `handle_on_turn()` at the top of each agent loop iteration, before
|
|
338
|
+
generating:
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
from inspect_eval_utils.setting import handle_on_turn
|
|
342
|
+
|
|
343
|
+
result = await handle_on_turn() # returns OnTurnResult
|
|
344
|
+
# result.action: "break" | "notify" | "proceed"
|
|
345
|
+
# result.message: str | None (only for "notify")
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
- `"break"` -- stop the agent loop
|
|
349
|
+
- `"notify"` -- inject `result.message` as a user message, then continue
|
|
350
|
+
- `"proceed"` -- continue normally (also returned when there is no Setting or no `on_turn` callback)
|
|
351
|
+
|
|
352
|
+
#### Reading Features
|
|
353
|
+
|
|
354
|
+
```python
|
|
355
|
+
if s.features.vision:
|
|
356
|
+
tools.append(my_view_image_tool())
|
|
357
|
+
if s.features.internet:
|
|
358
|
+
tools.append(my_web_search_tool())
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
Features are advisory. If the scaffolding doesn't support a feature, skip it
|
|
362
|
+
gracefully -- don't error.
|
|
363
|
+
|
|
364
|
+
## Scaffolding a new task
|
|
365
|
+
|
|
366
|
+
From inside a target repo (e.g. `inspect-eval-examples`):
|
|
367
|
+
|
|
368
|
+
```bash
|
|
369
|
+
new_task my_eval
|
|
370
|
+
uv sync --group tasks
|
|
371
|
+
uv run inspect eval my_eval --model mockllm/replay --limit 1
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
### What gets created
|
|
375
|
+
|
|
376
|
+
After running `new_task my_eval`, you'll see a new package under `tasks/`:
|
|
377
|
+
|
|
378
|
+
```
|
|
379
|
+
tasks/my_eval/
|
|
380
|
+
├── pyproject.toml
|
|
381
|
+
├── README.md
|
|
382
|
+
└── src/
|
|
383
|
+
└── metr_tasks/ # or harder_tasks/, etc., based on the target's namespace
|
|
384
|
+
└── my_eval/
|
|
385
|
+
├── __init__.py
|
|
386
|
+
├── _registry.py
|
|
387
|
+
├── task.py
|
|
388
|
+
├── version.py
|
|
389
|
+
├── py.typed
|
|
390
|
+
├── sandbox/
|
|
391
|
+
│ ├── compose.yaml
|
|
392
|
+
│ └── Dockerfile
|
|
393
|
+
└── assets/
|
|
394
|
+
└── instructions.md
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
The scaffolder also edits the target's root `pyproject.toml` to wire the new
|
|
398
|
+
task into the workspace: it appends the package to `dependency-groups.tasks`
|
|
399
|
+
and adds an entry under `tool.uv.sources` (`<package> = { workspace = true }`).
|
|
400
|
+
It does NOT modify `[tool.uv.workspace].members` — that's typically a glob like
|
|
401
|
+
`["tasks/*"]` which automatically picks up the new directory. The root `pyproject.toml` edit is the most
|
|
402
|
+
common surprise — the scaffolder modifies a file outside `tasks/my_eval/`, so
|
|
403
|
+
review the diff before committing.
|
|
404
|
+
|
|
405
|
+
### How substitution works
|
|
406
|
+
|
|
407
|
+
The scaffolder rewrites two things in the same pass:
|
|
408
|
+
|
|
409
|
+
1. **Task name**: every reference to `template` in the source (file names,
|
|
410
|
+
function names, imports, project name, etc.) is renamed to your new task
|
|
411
|
+
name.
|
|
412
|
+
2. **Namespace**: imports like `from metr_tasks.template.task import template`
|
|
413
|
+
are rewritten to use your repo's actual Python namespace (e.g.
|
|
414
|
+
`from harder_tasks.my_eval.task import my_eval`). This is what makes the
|
|
415
|
+
same canonical template work for any METR repo.
|
|
416
|
+
|
|
417
|
+
### Template selection
|
|
418
|
+
|
|
419
|
+
The scaffolder uses, in order:
|
|
420
|
+
|
|
421
|
+
1. `--template <path>` if specified.
|
|
422
|
+
2. `<target>/tasks/template/` if it exists.
|
|
423
|
+
3. The bundled canonical template (a known-good `metr_tasks` template).
|
|
424
|
+
|
|
425
|
+
### Per-repo target configuration
|
|
426
|
+
|
|
427
|
+
The scaffolder needs to know your target repo's Python namespace and project
|
|
428
|
+
prefix. It picks them up via the following decision tree:
|
|
429
|
+
|
|
430
|
+
- **Auto-detected (no config needed)**: if the target repo already has at
|
|
431
|
+
least one task under `tasks/`, the scaffolder reads its namespace and
|
|
432
|
+
project prefix from there.
|
|
433
|
+
- **Required config**: if the target repo has no existing tasks under `tasks/`
|
|
434
|
+
for the scaffolder to inspect, you must declare the namespace explicitly.
|
|
435
|
+
Add the following to the root `pyproject.toml` (use whatever namespace your
|
|
436
|
+
repo uses; it's `metr_tasks` for `inspect-eval-examples`, `harder_tasks` for
|
|
437
|
+
`harder-tasks`, etc.). Without this, the scaffolder errors out on a fresh
|
|
438
|
+
repo even if you'd be using `metr_tasks`:
|
|
439
|
+
|
|
440
|
+
```toml
|
|
441
|
+
[tool.task-scaffolder]
|
|
442
|
+
namespace = "your_namespace"
|
|
443
|
+
# project-prefix optional, defaults to namespace.replace("_", "-") + "-"
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
- **CLI override**: `--namespace` and `--project-prefix` flags always win,
|
|
447
|
+
useful for one-offs.
|
|
448
|
+
|
|
449
|
+
### Examples
|
|
450
|
+
|
|
451
|
+
#### Example 1 — canonical `metr_tasks` repo (e.g. `inspect-eval-examples`)
|
|
452
|
+
|
|
453
|
+
```bash
|
|
454
|
+
cd ~/src/metr/inspect-eval-examples
|
|
455
|
+
new_task my_eval
|
|
456
|
+
uv sync --group tasks
|
|
457
|
+
uv run inspect eval my_eval --model mockllm/replay --limit 1
|
|
458
|
+
```
|
|
459
|
+
|
|
460
|
+
What you get: `tasks/my_eval/` with the `metr_tasks.my_eval` namespace.
|
|
461
|
+
|
|
462
|
+
#### Example 2 — cross-namespace repo (e.g. `harder-tasks`)
|
|
463
|
+
|
|
464
|
+
First, ensure the target's root `pyproject.toml` has:
|
|
465
|
+
|
|
466
|
+
```toml
|
|
467
|
+
[tool.task-scaffolder]
|
|
468
|
+
namespace = "harder_tasks"
|
|
469
|
+
```
|
|
470
|
+
|
|
471
|
+
(Skip this if the repo already has tasks from which the scaffolder can auto-detect the namespace.)
|
|
472
|
+
|
|
473
|
+
Then:
|
|
474
|
+
|
|
475
|
+
```bash
|
|
476
|
+
cd ~/src/metr/harder-tasks
|
|
477
|
+
new_task my_eval
|
|
478
|
+
uv sync --group tasks
|
|
479
|
+
uv run inspect eval my_eval --model mockllm/replay --limit 1
|
|
480
|
+
```
|
|
481
|
+
|
|
482
|
+
What you get: `tasks/my_eval/` with the `harder_tasks.my_eval` namespace,
|
|
483
|
+
automatically rewritten from the canonical `metr_tasks` template.
|
|
484
|
+
|
|
485
|
+
### Troubleshooting
|
|
486
|
+
|
|
487
|
+
- **"target has no pyproject.toml"** — the resolved target directory doesn't
|
|
488
|
+
contain a `pyproject.toml`. You're either not in the repo root, or
|
|
489
|
+
`--target <path>` pointed somewhere wrong. `cd` to the repo root, or pass
|
|
490
|
+
the correct `--target`.
|
|
491
|
+
- **"task name 'template' matches the template name; choose a different
|
|
492
|
+
name"** — pick something else. `template` is reserved.
|
|
493
|
+
- **"<path> already exists (use --force to overwrite)"** — pass `--force` if
|
|
494
|
+
you want to overwrite the existing task directory.
|
|
495
|
+
|
|
496
|
+
## Common helpers
|
|
497
|
+
|
|
498
|
+
```python
|
|
499
|
+
from inspect_eval_utils.common import (
|
|
500
|
+
get_sandbox_files,
|
|
501
|
+
expand_template,
|
|
502
|
+
load_text_file,
|
|
503
|
+
)
|
|
504
|
+
```
|
|
505
|
+
|
|
506
|
+
These were ported from `harder-tasks` and are now shared across METR
|
|
507
|
+
Inspect AI eval repos.
|
|
508
|
+
|
|
509
|
+
## Development
|
|
510
|
+
|
|
511
|
+
```bash
|
|
512
|
+
uv sync
|
|
513
|
+
uv run pytest # fast tests
|
|
514
|
+
uv run pytest --runslow # + slow end-to-end
|
|
515
|
+
uv run ruff check .
|
|
516
|
+
uv run basedpyright
|
|
517
|
+
```
|
|
518
|
+
|
|
519
|
+
## License
|
|
520
|
+
|
|
521
|
+
MIT. See [LICENSE](LICENSE).
|