inspect-eval-utils 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/PKG-INFO +47 -1
  2. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/README.md +46 -0
  3. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/pyproject.toml +1 -0
  4. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_cli.py +3 -1
  5. inspect_eval_utils-1.3.0/src/inspect_eval_utils/report/plot.py +234 -0
  6. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/scaffolder.py +142 -0
  7. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/tool_cli/__init__.py +6 -0
  8. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/tool_cli/_mechanism.py +226 -50
  9. inspect_eval_utils-1.2.0/src/inspect_eval_utils/report/plot.py +0 -219
  10. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/.gitignore +0 -0
  11. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/LICENSE +0 -0
  12. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/__init__.py +0 -0
  13. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_detect.py +0 -0
  14. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/pyproject.toml +0 -0
  15. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/__init__.py +0 -0
  16. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/_registry.py +0 -0
  17. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/assets/instructions.md +0 -0
  18. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/py.typed +0 -0
  19. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/Dockerfile +0 -0
  20. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/compose.yaml +0 -0
  21. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/task.py +0 -0
  22. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/version.py +0 -0
  23. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/artifacts.py +0 -0
  24. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/common/__init__.py +0 -0
  25. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/common/sandbox_files.py +0 -0
  26. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/common/task_secrets.py +0 -0
  27. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/py.typed +0 -0
  28. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/__init__.py +0 -0
  29. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/assets/InstrumentSans.ttf +0 -0
  30. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/assets/OFL.txt +0 -0
  31. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/cost.py +0 -0
  32. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/events.py +0 -0
  33. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/html.py +0 -0
  34. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/__init__.py +0 -0
  35. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/_context.py +0 -0
  36. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/_types.py +0 -0
  37. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/_utils.py +0 -0
  38. {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/tool_cli/_setting.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: inspect-eval-utils
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: Shared utilities for METR Inspect AI eval repos: task scaffolder + common runtime helpers.
5
5
  Project-URL: Repository, https://github.com/METR/inspect-eval-utils
6
6
  Project-URL: Issues, https://github.com/METR/inspect-eval-utils/issues
@@ -297,6 +297,31 @@ tools call <tool-name> --json-args '{"arg": "value"}'
297
297
  The CLI keeps a short cache for list/help/completion metadata, but tool calls
298
298
  refresh the current `ToolSource` before execution.
299
299
 
300
+ #### Running the tool CLI from a task setup solver
301
+
302
+ Use `start_tool_cli` to expose `Setting`/task tools as a `tools` command for the
303
+ agent in one line. It installs the CLI, starts the RPC service in the background,
304
+ and returns once it's ready (raising if startup fails):
305
+
306
+ ```python
307
+ from inspect_eval_utils.tool_cli import start_tool_cli
308
+ from inspect_ai.util import sandbox
309
+
310
+ @solver
311
+ def setup() -> Solver:
312
+ async def solve(state: TaskState, generate: Generate) -> TaskState:
313
+ await start_tool_cli(MY_TOOLS, sandbox("default"), user="agent")
314
+ return state
315
+ return solve
316
+ ```
317
+
318
+ The command is resolved two ways: *interactive* shells (e.g. `human_cli`) pick it
319
+ up via a `.bashrc` alias + tab-completion; *non-interactive* shells (the model
320
+ agent's `bash()` tool, `sandbox.exec`) find it on `PATH` at
321
+ `/usr/local/bin/<command_name>`. Pass `on_path=False` to skip the PATH wrapper, or
322
+ `bin_dir=...` to relocate it. `run_tool_cli_service` and `setting_tool_cli_running`
323
+ install the PATH wrapper too (default-on).
324
+
300
325
  #### Common mistakes
301
326
 
302
327
  - **Listing infrastructure sandboxes as Workspaces.** Only list sandboxes the
@@ -407,6 +432,27 @@ It does NOT modify `[tool.uv.workspace].members` — that's typically a glob lik
407
432
  common surprise — the scaffolder modifies a file outside `tasks/my_eval/`, so
408
433
  review the diff before committing.
409
434
 
435
+ ### Generated eval-set config
436
+
437
+ The scaffolder also writes a minimal Hawk eval-set skeleton to
438
+ `eval_sets/<name>.eval-set.yaml` (creating `eval_sets/` if needed). This is the
439
+ config you run a batch grid with:
440
+
441
+ ```bash
442
+ hawk eval-set eval_sets/my_eval.eval-set.yaml
443
+ ```
444
+
445
+ The task `package` URL is derived from the target repo's git `origin` remote and
446
+ current branch, e.g.
447
+ `git+ssh://git@github.com/METR/<repo>@<branch>#subdirectory=tasks/my_eval`. When
448
+ the metadata can't be determined, a TODO marker is left in its place:
449
+
450
+ - no `origin` remote → the whole `package` value is a `TODO:` string,
451
+ - detached HEAD (no branch) → the ref becomes `TODO-set-ref`.
452
+
453
+ The skeleton is intentionally minimal (one model, one solver). An existing
454
+ `eval_sets/<name>.eval-set.yaml` is only overwritten with `--force`.
455
+
410
456
  ### How substitution works
411
457
 
412
458
  The scaffolder rewrites two things in the same pass:
@@ -272,6 +272,31 @@ tools call <tool-name> --json-args '{"arg": "value"}'
272
272
  The CLI keeps a short cache for list/help/completion metadata, but tool calls
273
273
  refresh the current `ToolSource` before execution.
274
274
 
275
+ #### Running the tool CLI from a task setup solver
276
+
277
+ Use `start_tool_cli` to expose `Setting`/task tools as a `tools` command for the
278
+ agent in one line. It installs the CLI, starts the RPC service in the background,
279
+ and returns once it's ready (raising if startup fails):
280
+
281
+ ```python
282
+ from inspect_eval_utils.tool_cli import start_tool_cli
283
+ from inspect_ai.util import sandbox
284
+
285
+ @solver
286
+ def setup() -> Solver:
287
+ async def solve(state: TaskState, generate: Generate) -> TaskState:
288
+ await start_tool_cli(MY_TOOLS, sandbox("default"), user="agent")
289
+ return state
290
+ return solve
291
+ ```
292
+
293
+ The command is resolved two ways: *interactive* shells (e.g. `human_cli`) pick it
294
+ up via a `.bashrc` alias + tab-completion; *non-interactive* shells (the model
295
+ agent's `bash()` tool, `sandbox.exec`) find it on `PATH` at
296
+ `/usr/local/bin/<command_name>`. Pass `on_path=False` to skip the PATH wrapper, or
297
+ `bin_dir=...` to relocate it. `run_tool_cli_service` and `setting_tool_cli_running`
298
+ install the PATH wrapper too (default-on).
299
+
275
300
  #### Common mistakes
276
301
 
277
302
  - **Listing infrastructure sandboxes as Workspaces.** Only list sandboxes the
@@ -382,6 +407,27 @@ It does NOT modify `[tool.uv.workspace].members` — that's typically a glob lik
382
407
  common surprise — the scaffolder modifies a file outside `tasks/my_eval/`, so
383
408
  review the diff before committing.
384
409
 
410
+ ### Generated eval-set config
411
+
412
+ The scaffolder also writes a minimal Hawk eval-set skeleton to
413
+ `eval_sets/<name>.eval-set.yaml` (creating `eval_sets/` if needed). This is the
414
+ config you run a batch grid with:
415
+
416
+ ```bash
417
+ hawk eval-set eval_sets/my_eval.eval-set.yaml
418
+ ```
419
+
420
+ The task `package` URL is derived from the target repo's git `origin` remote and
421
+ current branch, e.g.
422
+ `git+ssh://git@github.com/METR/<repo>@<branch>#subdirectory=tasks/my_eval`. When
423
+ the metadata can't be determined, a TODO marker is left in its place:
424
+
425
+ - no `origin` remote → the whole `package` value is a `TODO:` string,
426
+ - detached HEAD (no branch) → the ref becomes `TODO-set-ref`.
427
+
428
+ The skeleton is intentionally minimal (one model, one solver). An existing
429
+ `eval_sets/<name>.eval-set.yaml` is only overwritten with `--force`.
430
+
385
431
  ### How substitution works
386
432
 
387
433
  The scaffolder rewrites two things in the same pass:
@@ -62,6 +62,7 @@ dev = [
62
62
  "pytest-timeout>=2.3",
63
63
  "ruff>=0.11",
64
64
  "basedpyright>=1.37",
65
+ "pyyaml>=6.0",
65
66
  ]
66
67
 
67
68
  [tool.ruff]
@@ -49,7 +49,7 @@ def main(argv: list[str] | None = None) -> None:
49
49
  parser.add_argument(
50
50
  "--force",
51
51
  action="store_true",
52
- help="Overwrite an existing tasks/<name>/",
52
+ help="Overwrite an existing tasks/<name>/ and eval_sets/<name>.eval-set.yaml",
53
53
  )
54
54
  args = parser.parse_args(argv)
55
55
 
@@ -85,6 +85,8 @@ def main(argv: list[str] | None = None) -> None:
85
85
  print(f" cd {target_dir}")
86
86
  print(" uv sync --group tasks")
87
87
  print(f" uv run inspect eval {snake} --model mockllm/replay --limit 1")
88
+ print(f"Also generated eval_sets/{snake}.eval-set.yaml (Hawk batch config).")
89
+ print(f" Batch run: hawk eval-set eval_sets/{snake}.eval-set.yaml")
88
90
 
89
91
 
90
92
  if __name__ == "__main__":
@@ -0,0 +1,234 @@
1
+ """Render the score-vs-cost matplotlib plot as PNG bytes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import logging
7
+ import math
8
+ import threading
9
+ from collections.abc import Sequence
10
+ from importlib.resources import files
11
+
12
+ from inspect_eval_utils.report.cost import cumulative_cost
13
+ from inspect_eval_utils.report.events import ReportEvent
14
+
15
+ # Matplotlib logs "generated new fontManager" at INFO the first time its font
16
+ # cache is built. Quiet it so eval scoring transcripts stay clean.
17
+ logging.getLogger("matplotlib.font_manager").setLevel(logging.WARNING)
18
+
19
+ # Color palette derived from the METR May 2026 brand guide.
20
+ _LEAD_GREEN_500 = "#589885"
21
+ _GREEN_700 = "#2A6912"
22
+ _GRAY_300 = "#D9DCE2"
23
+ _GRAY_700 = "#3D424D"
24
+ _GRAY_800 = "#282C33"
25
+ _GRAY_900 = "#1B1D22"
26
+
27
+ _BUNDLED_FONT_FAMILY = ["Instrument Sans", "DejaVu Sans"]
28
+
29
+ # Guards the one-time mutation of matplotlib's global font registry so
30
+ # concurrent build_plot callers don't race the check-then-addfont below.
31
+ _FONT_LOCK = threading.Lock()
32
+ # Set once registration has succeeded; lets the common case skip the lock.
33
+ _font_registered = False
34
+
35
+
36
def _register_bundled_font() -> None:
    """Make the vendored Instrument Sans TTF available to matplotlib (best-effort).

    Returns immediately when a previous call already completed. Otherwise the
    registration state is re-checked while holding ``_FONT_LOCK`` so that only
    one caller mutates matplotlib's global font manager. If the font asset
    cannot be added, the function returns without setting the flag, which
    lets a later call try again.
    """
    global _font_registered
    if _font_registered:
        # Fast path: nothing to do and no lock to take.
        return

    from matplotlib import font_manager

    with _FONT_LOCK:
        # Another caller may have completed registration while we waited.
        if _font_registered:
            return
        manager = font_manager.fontManager
        already_known = any(entry.name == "Instrument Sans" for entry in manager.ttflist)
        if not already_known:
            try:
                asset = files("inspect_eval_utils.report") / "assets" / "InstrumentSans.ttf"
                manager.addfont(str(asset))
            except Exception:  # noqa: BLE001
                # Missing or unreadable asset: keep the flag unset so a later
                # call retries; rendering falls back to the next font family.
                return
        _font_registered = True
63
+
64
+
65
def build_plot(
    events: Sequence[ReportEvent],
    *,
    model: str,
    title: str,
    y_label: str,
    line_label: str = "Best score",
    current_score_label: str | None = None,
    x_label_money: str = "Cumulative model cost ($)",
    x_label_tokens: str = "Cumulative tokens (cost unavailable)",
    marker_event_kind: str | None,
) -> bytes:
    """Draw the score-vs-cost chart and return it encoded as PNG bytes.

    Events without ``usage`` are skipped. For every remaining event the x
    position comes from ``cumulative_cost(ev.usage, model)``, which also
    reports whether a cost was available. The money x-axis label is used only
    when at least one event had usage and every such event reported an
    available cost; otherwise the token label is used.

    Args:
        events: Report events, in the order they should be plotted.
        model: Model name forwarded to ``cumulative_cost``.
        title: Plot title.
        y_label: Y-axis label.
        line_label: Legend label for the best-so-far line.
        current_score_label: When not ``None``, a second dashed line through
            the raw ``score_update`` values is drawn with this legend label.
            The line is broken (NaN) at every marker event.
        x_label_money: X-axis label when cost is available.
        x_label_tokens: X-axis label for the fallback case.
        marker_event_kind: Event type whose occurrences delimit spans; every
            second span (counting from the first marker) is shaded. ``None``
            disables markers.

    Returns:
        The rendered figure as PNG bytes.
    """
    from matplotlib.backends.backend_agg import FigureCanvasAgg
    from matplotlib.figure import Figure
    from matplotlib.font_manager import FontProperties

    _register_bundled_font()
    families = _BUNDLED_FONT_FAMILY

    # Both lines start at the origin.
    saw_usage = False
    priced = True
    best_xs: list[float] = [0.0]
    best_ys: list[float] = [0.0]
    raw_xs: list[float] = [0.0]
    raw_ys: list[float] = [0.0]
    span_starts: list[float] = []

    running_best = 0.0
    for event in events:
        if event.usage is None:
            continue
        saw_usage = True
        x_pos, is_priced = cumulative_cost(event.usage, model)
        priced = priced and is_priced
        if event.event_type == "score_update":
            running_best = max(running_best, event.score)
            best_xs.append(x_pos)
            best_ys.append(running_best)
            raw_xs.append(x_pos)
            raw_ys.append(event.score)
        elif marker_event_kind is not None and event.event_type == marker_event_kind:
            span_starts.append(x_pos)
            # A NaN y-value splits the raw-score line here, so each span is
            # drawn as its own segment rather than joined by a vertical drop.
            raw_xs.append(x_pos)
            raw_ys.append(float("nan"))

    label_font = FontProperties(family=families, size=14)
    title_font = FontProperties(family=families, size=15, weight="medium")
    legend_font = FontProperties(family=families, size=11)

    # Standalone Figure + explicit Agg canvas (no pyplot, no rcParams): all
    # styling below is applied to the individual artists of this figure.
    fig = Figure(figsize=(10, 6))
    FigureCanvasAgg(fig)  # attaches itself as fig.canvas
    ax = fig.subplots()

    # The raw-score line is added first so it precedes the best line in the legend.
    if current_score_label is not None:
        ax.plot(
            raw_xs,
            raw_ys,
            "--",
            color=_LEAD_GREEN_500,
            linewidth=1.5,
            label=current_score_label,
            zorder=1,
        )
    ax.plot(
        best_xs,
        best_ys,
        "-",
        color=_GREEN_700,
        linewidth=2,
        label=line_label,
        zorder=2,
    )

    if span_starts:
        # One band per span; the last span ends at the largest x seen.
        ordered_starts = sorted(span_starts)
        candidates = [
            *best_xs,
            *(value for value in raw_xs if not math.isnan(value)),
            *span_starts,
        ]
        last_edge = max(candidates, default=0.0)
        edges = [*ordered_starts, last_edge]
        for index, (left, right) in enumerate(zip(edges, edges[1:])):
            if index % 2 == 1:
                ax.axvspan(left, right, color=_GRAY_300, alpha=0.25, zorder=0)

    if saw_usage and priced:
        x_label = x_label_money
    else:
        x_label = x_label_tokens
    ax.set_xlabel(x_label, color=_GRAY_800, fontproperties=label_font)
    ax.set_ylabel(y_label, color=_GRAY_800, rotation=90, fontproperties=label_font)
    ax.set_ylim(0, 1.05)
    ax.set_xlim(left=0)

    for hidden_side in ("top", "right"):
        ax.spines[hidden_side].set_visible(False)
    for visible_side in ("bottom", "left"):
        spine = ax.spines[visible_side]
        spine.set_color(_GRAY_700)
        spine.set_linewidth(0.8)
    ax.tick_params(
        colors=_GRAY_700,
        labelsize=12,
        width=0.5,
        length=0,
        labelfontfamily=families,
    )

    ax.grid(
        True,
        color=_GRAY_300,
        linewidth=0.8,
        linestyle=(0, (4, 2)),
        zorder=0,
    )
    ax.set_axisbelow(True)

    ax.set_title(title, color=_GRAY_900, fontproperties=title_font, pad=12)
    legend = ax.legend(
        loc="upper left",
        frameon=True,
        fancybox=False,
        edgecolor=_GRAY_300,
        framealpha=1.0,
        borderpad=0.6,
        prop=legend_font,
    )
    legend_frame = legend.get_frame()
    legend_frame.set_linewidth(0.5)
    legend_frame.set_facecolor("white")

    png_buffer = io.BytesIO()
    fig.savefig(
        png_buffer,
        format="png",
        dpi=300,
        bbox_inches="tight",
        facecolor="white",
    )
    return png_buffer.getvalue()
@@ -308,6 +308,132 @@ def render_readme(*, snake: str, description: str) -> str:
308
308
  return README_TEMPLATE.format(snake=snake, description=description)
309
309
 
310
310
 
311
+ EVAL_SET_TEMPLATE = """\
312
+ name: {name}
313
+ tasks:
314
+ - package: "{package_url}"
315
+ name: {namespace}
316
+ items:
317
+ - name: {name}
318
+ args: []
319
+
320
+ epochs: 4
321
+ token_limit: 40000000
322
+
323
+ models:
324
+ - package: anthropic
325
+ name: anthropic
326
+ items:
327
+ - name: claude-opus-4-5-20251101
328
+ args:
329
+ config:
330
+ max_tokens: 32000
331
+ reasoning_tokens: 16000
332
+ max_connections: 60
333
+
334
+ solvers:
335
+ - package: "git+https://github.com/METR/inspect-agents@metr_agents/v0.3.5#subdirectory=packages/agents"
336
+ name: metr_agents
337
+ items:
338
+ - name: react
339
+ args:
340
+ tools:
341
+ required:
342
+ - inspect_ai/bash
343
+ - metr_agents/set_timeout
344
+ optional:
345
+ - inspect_ai/python
346
+ truncation: disabled
347
+ compaction: CompactionSummary
348
+ compaction_threshold: 0.75
349
+ """
350
+
351
+
352
def render_eval_set(*, name: str, namespace: str, package_url: str) -> str:
    """Fill ``EVAL_SET_TEMPLATE`` with the given values and return the text.

    Args:
        name: Substituted for every ``{name}`` placeholder.
        namespace: Substituted for the ``{namespace}`` placeholder.
        package_url: Substituted for the ``{package_url}`` placeholder.

    Returns:
        The rendered eval-set skeleton.
    """
    substitutions = {
        "name": name,
        "namespace": namespace,
        "package_url": package_url,
    }
    return EVAL_SET_TEMPLATE.format(**substitutions)
355
+
356
+
357
def _read_origin_url(git_dir: Path) -> str | None:
    """Return the ``url`` of ``[remote "origin"]`` from ``<git_dir>/config``.

    Hand-parsed rather than via configparser: git indents entries with tabs,
    which configparser misreads as multi-line value continuations.

    The section name (``remote``) and the key (``url``) are compared
    case-insensitively, matching git's own config rules; the subsection
    (``"origin"``) stays case-sensitive. The file is decoded as UTF-8
    explicitly so the result does not depend on the platform locale.

    Args:
        git_dir: Path to the repository's ``.git`` directory.

    Returns:
        The first ``url`` value found in the origin section, or ``None`` when
        the config file is missing, unreadable, not valid UTF-8, or has no
        such entry.
    """
    config_path = git_dir / "config"
    if not config_path.is_file():
        return None
    try:
        lines = config_path.read_text(encoding="utf-8").splitlines()
    except (OSError, UnicodeDecodeError):
        return None

    section_prefix = "[remote"
    in_origin = False
    for line in lines:
        stripped = line.strip()
        if stripped.startswith("[") and stripped.endswith("]"):
            # Drop spaces so `[remote "origin"]` and `[remote  "origin"]` compare equal.
            compact = stripped.replace(" ", "")
            head = compact[: len(section_prefix)]
            tail = compact[len(section_prefix) :]
            in_origin = head.lower() == section_prefix and tail == '"origin"]'
            continue
        if not in_origin:
            continue
        key, separator, value = stripped.partition("=")
        if separator and key.strip().lower() == "url":
            return value.strip()
    return None
381
+
382
+
383
def _read_current_branch(git_dir: Path) -> str | None:
    """Return the checked-out branch name recorded in ``<git_dir>/HEAD``.

    The file is decoded as UTF-8 explicitly so branch names with non-ASCII
    characters do not depend on the platform locale.

    Args:
        git_dir: Path to the repository's ``.git`` directory.

    Returns:
        The text after ``ref: refs/heads/``, or ``None`` when HEAD is missing,
        unreadable, does not point at a branch (detached HEAD), or names an
        empty branch.
    """
    head_path = git_dir / "HEAD"
    if not head_path.is_file():
        return None
    try:
        content = head_path.read_text(encoding="utf-8").strip()
    except (OSError, UnicodeDecodeError):
        return None
    prefix = "ref: refs/heads/"
    if not content.startswith(prefix):
        return None
    # An empty remainder is not a usable ref; report it like "no branch".
    return content[len(prefix) :] or None
396
+
397
+
398
def _parse_remote_url(url: str) -> tuple[str, str] | None:
    """Split a git remote URL into ``(host, "org/repo")``.

    Recognized forms are ``git@host:path``, ``ssh://git@host/path`` and
    ``https://host/path``. Surrounding whitespace, trailing slashes and a
    trailing ``.git`` are removed first. For https remotes any
    ``user[:password]@`` prefix is dropped so embedded credentials never end
    up in the returned host.

    Args:
        url: Remote URL as stored in the git config.

    Returns:
        ``(host, path)`` for a recognized URL, otherwise ``None``.
    """
    # Strip trailing slashes before the suffix check so "repo.git/" is handled too.
    url = url.strip().rstrip("/")
    if url.endswith(".git"):
        url = url[:-4]
    for pattern in (
        r"^git@([^:]+):(.+)$",
        r"^ssh://git@([^/]+)/(.+)$",
        # Optional userinfo is matched but not captured.
        r"^https://(?:[^/@]*@)?([^/@]+)/(.+)$",
    ):
        m = re.match(pattern, url)
        if m:
            return m.group(1), m.group(2)
    return None
412
+
413
+
414
def derive_package_url(target_dir: Path, task_name: str) -> str:
    """Compose the eval-set task ``package`` value from ``target_dir``'s git metadata.

    The origin URL and current branch are read from ``target_dir / ".git"``.

    Args:
        target_dir: Root of the target repository.
        task_name: Task directory name, used in ``#subdirectory=tasks/<task_name>``.

    Returns:
        ``git+ssh://git@<host>/<path>@<ref>#subdirectory=tasks/<task_name>``.
        Pieces that cannot be determined become visible TODO markers:
          - no origin URL, or one that cannot be parsed -> the whole value is
            a ``TODO:`` string
          - no current branch -> the ref is ``TODO-set-ref``
    """
    git_dir = target_dir / ".git"

    origin = _read_origin_url(git_dir)
    host_and_path = None
    if origin:
        host_and_path = _parse_remote_url(origin)

    if host_and_path is None:
        placeholder = (
            "TODO: set git+ssh package URL, e.g. "
            "git+ssh://git@github.com/<org>/<repo>@<branch>"
            f"#subdirectory=tasks/{task_name}"
        )
        return placeholder

    host, path = host_and_path
    ref = _read_current_branch(git_dir)
    if not ref:
        ref = "TODO-set-ref"
    return f"git+ssh://git@{host}/{path}@{ref}#subdirectory=tasks/{task_name}"
435
+
436
+
311
437
  def edit_root_pyproject(src: str, *, target_pkg_name: str, new_task_dir_name: str) -> str:
312
438
  """Add the new task to dependency-groups.tasks and tool.uv.sources, and
313
439
  ensure [tool.uv.workspace].members covers tasks/<new_task_dir_name>.
@@ -461,6 +587,12 @@ def scaffold_into(
461
587
  new_task_dir_name=target.new_task_name,
462
588
  )
463
589
 
590
+ # Validate the eval-set destination up front too, so a conflict aborts
591
+ # before any file writes (mirrors the dest_root / root-pyproject checks).
592
+ eval_set_path = target_dir / "eval_sets" / f"{target.new_task_name}.eval-set.yaml"
593
+ if eval_set_path.exists() and not force:
594
+ sys.exit(f"{eval_set_path} already exists (use --force to overwrite)")
595
+
464
596
  if dest_root.exists():
465
597
  if not force:
466
598
  sys.exit(f"{dest_root} already exists (use --force to overwrite)")
@@ -518,5 +650,15 @@ def scaffold_into(
518
650
  # Write the (already-validated) edited root pyproject.toml.
519
651
  root_pyproject.write_text(new_root_pyproject)
520
652
 
653
+ # Generated eval-set skeleton at the repo root (not inside tasks/<name>/).
654
+ eval_set_path.parent.mkdir(parents=True, exist_ok=True)
655
+ eval_set_path.write_text(
656
+ render_eval_set(
657
+ name=target.new_task_name,
658
+ namespace=target.namespace,
659
+ package_url=derive_package_url(target_dir, target.new_task_name),
660
+ )
661
+ )
662
+
521
663
  # Audit.
522
664
  audit_generated_tree(dest_root, source=source)
@@ -7,13 +7,19 @@ in the sandbox shell.
7
7
  """
8
8
 
9
9
  from inspect_eval_utils.tool_cli._mechanism import (
10
+ generate_tool_cli_script,
10
11
  install_tool_cli,
11
12
  run_tool_cli_service,
13
+ start_tool_cli,
14
+ tool_cli_service_methods,
12
15
  )
13
16
  from inspect_eval_utils.tool_cli._setting import setting_tool_cli_running
14
17
 
15
18
  __all__ = [
19
+ "generate_tool_cli_script",
16
20
  "install_tool_cli",
17
21
  "run_tool_cli_service",
18
22
  "setting_tool_cli_running",
23
+ "start_tool_cli",
24
+ "tool_cli_service_methods",
19
25
  ]
@@ -5,6 +5,7 @@ with an RPC bridge back to the host for actual tool execution.
5
5
  """
6
6
 
7
7
  import json
8
+ import logging
8
9
  import re
9
10
  import shlex
10
11
  import time
@@ -16,10 +17,19 @@ import anyio
16
17
  from inspect_ai.model import ChatMessage, ChatMessageAssistant, ChatMessageTool, execute_tools
17
18
  from inspect_ai.tool import Tool, ToolCall, ToolDef, ToolSource
18
19
  from inspect_ai.tool._tool_def import tool_defs
19
- from inspect_ai.util import SandboxEnvironment, sandbox_service
20
+ from inspect_ai.util import (
21
+ SandboxEnvironment,
22
+ background,
23
+ sandbox_service,
24
+ )
25
+ from inspect_ai.util import (
26
+ sandbox as _get_sandbox,
27
+ )
20
28
  from inspect_ai.util._sandbox.service import SandboxServiceMethod
21
29
  from pydantic import JsonValue
22
30
 
31
+ logger = logging.getLogger(__name__)
32
+
23
33
 
24
34
  class _ToolCliResolver:
25
35
  def __init__(
@@ -62,6 +72,8 @@ async def install_tool_cli(
62
72
  service_name: str = "tool_cli",
63
73
  install_dir: str = "/opt/tool_cli",
64
74
  user: str | None = None,
75
+ on_path: bool = True,
76
+ bin_dir: str = "/usr/local/bin",
65
77
  ) -> dict[str, SandboxServiceMethod]:
66
78
  """Generate a CLI script, install it into a sandbox, and return service methods.
67
79
 
@@ -75,6 +87,9 @@ async def install_tool_cli(
75
87
  service_name: Name for the sandbox service (used for RPC).
76
88
  install_dir: Directory in the sandbox to install the CLI script.
77
89
  user: Sandbox user to install as.
90
+ on_path: Install a wrapper for the command in ``bin_dir`` so it resolves on
91
+ PATH for non-interactive shells (e.g. the agent's bash() tool).
92
+ bin_dir: Directory on PATH to install the wrapper into.
78
93
 
79
94
  Returns:
80
95
  A dict of service methods to pass to ``sandbox_service()``.
@@ -89,6 +104,8 @@ async def install_tool_cli(
89
104
  command_name=command_name,
90
105
  install_dir=install_dir,
91
106
  user=user,
107
+ on_path=on_path,
108
+ bin_dir=bin_dir,
92
109
  )
93
110
 
94
111
  return methods
@@ -103,6 +120,8 @@ async def run_tool_cli_service(
103
120
  service_name: str = "tool_cli",
104
121
  install_dir: str = "/opt/tool_cli",
105
122
  user: str | None = None,
123
+ on_path: bool = True,
124
+ bin_dir: str = "/usr/local/bin",
106
125
  polling_interval: float | None = None,
107
126
  started: anyio.Event | None = None,
108
127
  ) -> None:
@@ -118,6 +137,9 @@ async def run_tool_cli_service(
118
137
  service_name: Name for the sandbox service (used for RPC).
119
138
  install_dir: Directory in the sandbox to install the CLI script.
120
139
  user: Sandbox user to install as.
140
+ on_path: Install a wrapper for the command in ``bin_dir`` so it resolves on
141
+ PATH for non-interactive shells (e.g. the agent's bash() tool).
142
+ bin_dir: Directory on PATH to install the wrapper into.
121
143
  polling_interval: Polling interval for RPC request checking.
122
144
  started: Event set once the sandbox service is ready.
123
145
  """
@@ -128,6 +150,8 @@ async def run_tool_cli_service(
128
150
  service_name=service_name,
129
151
  install_dir=install_dir,
130
152
  user=user,
153
+ on_path=on_path,
154
+ bin_dir=bin_dir,
131
155
  )
132
156
  await sandbox_service(
133
157
  service_name,
@@ -140,6 +164,98 @@ async def run_tool_cli_service(
140
164
  )
141
165
 
142
166
 
167
async def start_tool_cli(
    tools: Sequence[Tool | ToolDef | ToolSource],
    sandbox: SandboxEnvironment | None = None,
    *,
    command_name: str = "tools",
    service_name: str = "tool_cli",
    install_dir: str = "/opt/tool_cli",
    user: str | None = None,
    on_path: bool = True,
    bin_dir: str = "/usr/local/bin",
    polling_interval: float | None = None,
) -> None:
    """Install the tool CLI, then launch its sandbox service via ``background()``.

    Intended for task setup solvers. ``install_tool_cli`` is awaited directly,
    so any installation error is raised to the caller. The RPC service is then
    handed to ``background()`` and this coroutine waits on the service's
    ``started`` event before returning. If the service raises before that
    event is set, the exception is captured and re-raised here as a
    ``RuntimeError`` rather than leaving the caller waiting forever.

    Args:
        tools: Tools to expose as CLI commands.
        sandbox: Sandbox to install into; ``sandbox("default")`` when ``None``.
        command_name: Name of the CLI command (and of the PATH wrapper).
        service_name: Sandbox-service name used for RPC.
        install_dir: Directory in the sandbox to install the CLI script.
        user: Sandbox user passed to the install step and the service.
        on_path: Expose ``command_name`` on PATH (default True).
        bin_dir: Directory on PATH for the wrapper.
        polling_interval: RPC request polling interval.

    Raises:
        RuntimeError: The service failed before signalling readiness; the
            original exception is attached as ``__cause__``.

    Example:
        ```python
        @solver
        def setup() -> Solver:
            async def solve(state: TaskState, generate: Generate) -> TaskState:
                await start_tool_cli(MY_TOOLS, sandbox("default"), user="agent")
                return state
            return solve
        ```
    """
    target = _get_sandbox("default") if sandbox is None else sandbox

    # Installation runs on the caller's task so its errors surface directly.
    service_methods = await install_tool_cli(
        tools,
        target,
        command_name=command_name,
        service_name=service_name,
        install_dir=install_dir,
        user=user,
        on_path=on_path,
        bin_dir=bin_dir,
    )

    ready = anyio.Event()
    failure: BaseException | None = None

    async def _serve() -> None:
        nonlocal failure
        try:
            await sandbox_service(
                service_name,
                service_methods,
                lambda: False,  # never reports completion; runs until cancelled
                target,
                user=user,
                polling_interval=polling_interval,
                started=ready,
            )
        except anyio.get_cancelled_exc_class():
            # Cancellation must always propagate.
            raise
        except BaseException as exc:  # noqa: BLE001 - re-raised on the caller's task
            if ready.is_set():
                # The service was already up: leave the failure to background().
                raise
            # Startup failure: remember it and wake the waiter below.
            failure = exc
            ready.set()

    background(_serve)
    await ready.wait()
    if failure is not None:
        raise RuntimeError(f"tool_cli service {service_name!r} failed to start") from failure
257
+
258
+
143
259
  def generate_tool_cli_script(service_name: str = "tool_cli") -> str:
144
260
  """Generate a Python CLI script that calls tools via sandbox service RPC.
145
261
 
@@ -211,7 +327,7 @@ def _add_dynamic_arg(parser, name, param, required):
211
327
  parser.add_argument(flag, dest=dest, nargs="?", const=True, default=None, type=_parse_bool, help=description)
212
328
  return
213
329
  if type_str in ("array", "object"):
214
- parser.add_argument(_flag_name(name), dest=dest, type=str, required=required, default=None if not required else None, help=description)
330
+ parser.add_argument(_flag_name(name), dest=dest, type=str, required=required, default=None, help=description)
215
331
  return
216
332
  type_map = {{"string": str, "integer": int, "number": float}}
217
333
  py_type = type_map.get(type_str or "string", str)
@@ -277,15 +393,20 @@ def _required_bool_names(tool):
277
393
 
278
394
 
279
395
  def _call_rpc(method, *args, **kwargs):
396
+ # The RPC client is keyword-only after `method`; pass args by parameter name.
280
397
  try:
281
398
  if method == "list_tools":
282
399
  return call_{service_name}('list_tools')
283
400
  if method == "describe_tool":
284
- return call_{service_name}('describe_tool', *args, **kwargs)
401
+ return call_{service_name}('describe_tool', tool_name=args[0])
285
402
  if method == "describe_tool_for_call":
286
- return call_{service_name}('describe_tool_for_call', *args, **kwargs)
403
+ return call_{service_name}('describe_tool_for_call', tool_name=args[0])
287
404
  if method == "call_tool":
288
- return call_{service_name}('call_tool', *args, **kwargs)
405
+ if len(args) > 2:
406
+ return call_{service_name}(
407
+ 'call_tool', tool_name=args[0], arguments=args[1], snapshot_token=args[2]
408
+ )
409
+ return call_{service_name}('call_tool', tool_name=args[0], arguments=args[1])
289
410
  return call_{service_name}(method, *args, **kwargs)
290
411
  except Exception as exc:
291
412
  print(str(exc), file=sys.stderr)
@@ -472,6 +593,30 @@ def _check_duplicate_tool_names(tool_defs_list: Sequence[ToolDef]) -> None:
472
593
  raise ValueError(f"Duplicate tool names: {names}")
473
594
 
474
595
 
596
class _SnapshotStore:
    """Bounded token->snapshot store; evicts oldest entries past ``max_size``.

    Guards against unbounded growth when a CLI ``call`` is abandoned between
    ``describe_tool_for_call`` (which stores a snapshot) and ``call_tool``
    (which pops it).

    Args:
        max_size: Maximum number of snapshots retained. ``0`` retains nothing
            (every ``put`` is evicted immediately).

    Raises:
        ValueError: If ``max_size`` is negative.
    """

    def __init__(self, max_size: int = 128) -> None:
        # A negative bound can never be satisfied: the eviction loop in put()
        # would drain the dict and then fail on an empty iterator. Reject it
        # up front with a clear error instead.
        if max_size < 0:
            raise ValueError(f"max_size must be non-negative, got {max_size}")
        self._max = max_size
        self._data: dict[str, list[ToolDef]] = {}

    def put(self, token: str, value: list[ToolDef]) -> None:
        """Store ``value`` under ``token``, evicting the oldest entries if full."""
        # Remove any existing entry first so a re-stored token moves to the
        # newest position; plain assignment would keep its original slot and
        # make it the first candidate for eviction.
        self._data.pop(token, None)
        self._data[token] = value
        while len(self._data) > self._max:
            # dicts preserve insertion order, so the first key is the oldest
            del self._data[next(iter(self._data))]

    def pop(self, token: str) -> list[ToolDef] | None:
        """Remove and return the snapshot for ``token``, or ``None`` if absent."""
        return self._data.pop(token, None)

    def __len__(self) -> int:
        """Return the number of snapshots currently held."""
        return len(self._data)
618
+
619
+
475
620
  def tool_cli_service_methods(
476
621
  tools: Sequence[Tool | ToolDef | ToolSource],
477
622
  *,
@@ -487,7 +632,7 @@ def tool_cli_service_methods(
487
632
  A dict mapping method names to async handler functions.
488
633
  """
489
634
  resolver = _ToolCliResolver(tools, cache_ttl=cache_ttl)
490
- call_snapshots: dict[str, list[ToolDef]] = {}
635
+ call_snapshots = _SnapshotStore()
491
636
 
492
637
  async def list_tools() -> JsonValue:
493
638
  resolved = await resolver.resolve(use_cache=True)
@@ -509,7 +654,7 @@ def tool_cli_service_methods(
509
654
  if td is None:
510
655
  raise ValueError(f"Unknown tool: {tool_name}")
511
656
  snapshot_token = uuid4().hex
512
- call_snapshots[snapshot_token] = resolved
657
+ call_snapshots.put(snapshot_token, resolved)
513
658
  description = _tool_description(td)
514
659
  description["_call_snapshot"] = snapshot_token
515
660
  return description
@@ -522,7 +667,7 @@ def tool_cli_service_methods(
522
667
  if snapshot_token is None:
523
668
  resolved = await resolver.resolve(use_cache=False)
524
669
  else:
525
- resolved = call_snapshots.pop(snapshot_token, None)
670
+ resolved = call_snapshots.pop(snapshot_token)
526
671
  if resolved is None:
527
672
  resolved = await resolver.resolve(use_cache=False)
528
673
  tools_by_name = _tools_by_name(resolved)
@@ -633,10 +778,18 @@ async def _install_script(
633
778
  command_name: str,
634
779
  install_dir: str,
635
780
  user: str | None,
781
+ on_path: bool = True,
782
+ bin_dir: str = "/usr/local/bin",
636
783
  ) -> None:
637
784
  """Install the CLI script into the sandbox."""
638
785
  _validate_command_name(command_name)
639
786
 
787
+ # Validate python3 before any writes so a missing interpreter fails cleanly
788
+ # (the CLI script and PATH wrapper both invoke python3).
789
+ python_check = await sandbox.exec(["sh", "-c", "command -v python3"], user=user)
790
+ if not python_check.success:
791
+ raise RuntimeError("tool_cli requires python3 in the sandbox but none was found on PATH.")
792
+
640
793
  # create install dir
641
794
  await _checked_exec(sandbox, ["mkdir", "-p", install_dir], user="root")
642
795
  if user and user != "root":
@@ -648,55 +801,78 @@ async def _install_script(
648
801
  await _checked_exec(sandbox, ["tee", "--", script_path], input=script, user=user)
649
802
  await _checked_exec(sandbox, ["chmod", "+x", script_path], user=user)
650
803
 
651
- # determine user's home directory for .bashrc
652
- if user:
653
- result = await sandbox.exec(["getent", "passwd", user], user=user)
654
- if result.success and result.stdout.strip():
655
- fields = result.stdout.strip().split(":")
656
- home_dir = fields[5] if len(fields) > 5 and fields[5] else f"/home/{user}"
804
+ # Expose the command on PATH so non-interactive shells (the model agent's
805
+ # bash() tool) can find it; the .bashrc alias only helps interactive shells.
806
+ # Written as root because /usr/local/bin is not writable by the agent user.
807
+ if on_path:
808
+ wrapper_path = f"{bin_dir}/{command_name}"
809
+ wrapper = f'#!/bin/sh\nexec python3 {shlex.quote(script_path)} "$@"\n'
810
+ await _checked_exec(sandbox, ["mkdir", "-p", bin_dir], user="root")
811
+ await _checked_exec(sandbox, ["tee", "--", wrapper_path], input=wrapper, user="root")
812
+ await _checked_exec(sandbox, ["chmod", "+x", wrapper_path], user="root")
813
+
814
+ # Interactive shell alias + tab completion (best-effort: only benefits the
815
+ # interactive human_cli shell; the PATH wrapper is what model agents use).
816
+ try:
817
+ # determine user's home directory for .bashrc
818
+ if user:
819
+ result = await sandbox.exec(["getent", "passwd", user], user=user)
820
+ if result.success and result.stdout.strip():
821
+ fields = result.stdout.strip().split(":")
822
+ home_dir = fields[5] if len(fields) > 5 and fields[5] else f"/home/{user}"
823
+ else:
824
+ home_dir = f"/home/{user}"
657
825
  else:
658
- home_dir = f"/home/{user}"
659
- else:
660
- result = await sandbox.exec(["bash", "-c", "echo $HOME"], user=user)
661
- home_dir = result.stdout.strip() if result.success and result.stdout.strip() else "/root"
662
-
663
- # build bash alias and tab completion
664
- shell_setup_path = f"{home_dir}/.tool_cli_bashrc"
665
- shell_setup_source = (
666
- f"[ -f {shlex.quote(shell_setup_path)} ] && . {shlex.quote(shell_setup_path)}"
667
- )
668
- bashrc_addition = dedent(f"""
669
- # Tool CLI alias and completion
670
- alias {command_name}={shlex.quote(f"python3 {script_path}")}
671
-
672
- _{command_name}_completion() {{
673
- local cur candidate
674
- cur="${{COMP_WORDS[COMP_CWORD]}}"
675
- COMPREPLY=()
676
- while IFS= read -r candidate; do
677
- [[ $candidate == "$cur"* ]] && COMPREPLY+=("$candidate")
678
- done < <(python3 {shlex.quote(script_path)} __complete "$COMP_CWORD" "${{COMP_WORDS[@]}}" 2>/dev/null)
679
- }}
680
- complete -F _{command_name}_completion {command_name}
681
- """)
682
-
683
- await _checked_exec(
684
- sandbox,
685
- ["tee", "--", shell_setup_path],
686
- input=bashrc_addition,
687
- user=user,
688
- )
826
+ result = await sandbox.exec(["bash", "-c", "echo $HOME"], user=user)
827
+ home_dir = (
828
+ result.stdout.strip() if result.success and result.stdout.strip() else "/root"
829
+ )
830
+
831
+ # build bash alias and tab completion
832
+ shell_setup_path = f"{home_dir}/.tool_cli_bashrc"
833
+ shell_setup_source = (
834
+ f"[ -f {shlex.quote(shell_setup_path)} ] && . {shlex.quote(shell_setup_path)}"
835
+ )
836
+ bashrc_addition = dedent(f"""
837
+ # Tool CLI alias and completion
838
+ alias {command_name}={shlex.quote(f"python3 {script_path}")}
839
+
840
+ _{command_name}_completion() {{
841
+ local cur candidate
842
+ cur="${{COMP_WORDS[COMP_CWORD]}}"
843
+ COMPREPLY=()
844
+ while IFS= read -r candidate; do
845
+ [[ $candidate == "$cur"* ]] && COMPREPLY+=("$candidate")
846
+ done < <(python3 {shlex.quote(script_path)} __complete "$COMP_CWORD" "${{COMP_WORDS[@]}}" 2>/dev/null)
847
+ }}
848
+ complete -F _{command_name}_completion {command_name}
849
+ """)
689
850
 
690
- bashrc_path = f"{home_dir}/.bashrc"
691
- result = await sandbox.exec(["grep", "-qxF", shell_setup_source, bashrc_path], user=user)
692
- if not result.success:
693
851
  await _checked_exec(
694
852
  sandbox,
695
- ["tee", "-a", bashrc_path],
696
- input=f"\n{shell_setup_source}\n",
853
+ ["tee", "--", shell_setup_path],
854
+ input=bashrc_addition,
697
855
  user=user,
698
856
  )
699
857
 
858
+ bashrc_path = f"{home_dir}/.bashrc"
859
+ result = await sandbox.exec(["grep", "-qxF", shell_setup_source, bashrc_path], user=user)
860
+ if not result.success:
861
+ await _checked_exec(
862
+ sandbox,
863
+ ["tee", "-a", bashrc_path],
864
+ input=f"\n{shell_setup_source}\n",
865
+ user=user,
866
+ )
867
+ except Exception as exc: # noqa: BLE001 - alias is best-effort
868
+ logger.warning(
869
+ "tool_cli: could not install the interactive shell alias (%s); "
870
+ "the %r command is still available on PATH.",
871
+ exc,
872
+ command_name,
873
+ exc_info=True,
874
+ )
875
+
700
876
 
701
877
  async def _checked_exec(
702
878
  sandbox: SandboxEnvironment,
@@ -1,219 +0,0 @@
1
- # Matplotlib's API is partially untyped; these suppressions apply only to
2
- # build_plot below.
3
- # pyright: reportUnknownMemberType=false
4
- # pyright: reportUnknownVariableType=false
5
- """Render the score-vs-cost matplotlib plot as PNG bytes."""
6
-
7
- from __future__ import annotations
8
-
9
- import io
10
- import logging
11
- import math
12
- from collections.abc import Sequence
13
- from importlib.resources import files
14
-
15
- from inspect_eval_utils.report.cost import cumulative_cost
16
- from inspect_eval_utils.report.events import ReportEvent
17
-
18
- # Matplotlib logs "generated new fontManager" at INFO the first time its font
19
- # cache is built. Quiet it so eval scoring transcripts stay clean.
20
- logging.getLogger("matplotlib.font_manager").setLevel(logging.WARNING)
21
-
22
- # Color palette derived from the METR May 2026 brand guide.
23
- _LEAD_GREEN_500 = "#589885"
24
- _GREEN_700 = "#2A6912"
25
- _GRAY_300 = "#D9DCE2"
26
- _GRAY_700 = "#3D424D"
27
- _GRAY_800 = "#282C33"
28
- _GRAY_900 = "#1B1D22"
29
-
30
- _BUNDLED_FONT_FAMILY = ["Instrument Sans", "DejaVu Sans"]
31
-
32
-
33
def _register_bundled_font() -> None:
    """Make the vendored Instrument Sans TTF known to matplotlib (best-effort).

    Does nothing when a font named "Instrument Sans" is already in
    matplotlib's font list, and silently gives up if the packaged asset
    cannot be located or loaded.
    """
    from matplotlib import font_manager

    manager = font_manager.fontManager
    if any(entry.name == "Instrument Sans" for entry in manager.ttflist):
        return
    try:
        assets_dir = files("inspect_eval_utils.report") / "assets"
        manager.addfont(str(assets_dir / "InstrumentSans.ttf"))
    except Exception:  # noqa: BLE001
        # Asset missing or unreadable; the caller can still render with the
        # DejaVu Sans fallback that matplotlib supplies.
        pass
50
-
51
-
52
def build_plot(
    events: Sequence[ReportEvent],
    *,
    model: str,
    title: str,
    y_label: str,
    line_label: str = "Best score",
    current_score_label: str | None = None,
    x_label_money: str = "Cumulative model cost ($)",
    x_label_tokens: str = "Cumulative tokens (cost unavailable)",
    marker_event_kind: str | None,
) -> bytes:
    """Render the score-vs-cost plot as PNG bytes.

    The line plots best-so-far `score_update` values, starting at `(0, 0)`,
    against cumulative model cost for `model`. If Inspect AI has no pricing for
    the model, the x-axis falls back to cumulative token count instead.

    Args:
        events: Report events in plotting order. Events whose `usage` is
            `None` are ignored entirely.
        model: Model name passed to `cumulative_cost` for each event.
        title: Plot title.
        y_label: Y-axis label. The y-axis range is fixed to `[0, 1.05]`.
        line_label: Legend label for the best-so-far line.
        current_score_label: When provided, a second (non-monotonic) dashed
            line is drawn through the raw per-event score values and labelled
            accordingly in the legend. That line is broken at every marker
            event so each span renders as its own segment.
        x_label_money: X-axis label used when at least one event had usage and
            `cumulative_cost` reported cost as available for all of them.
        x_label_tokens: X-axis label used otherwise.
        marker_event_kind: Selects which non-score events delimit episodic
            spans (e.g. `"attempt_start"`); pass `None` to disable. When set,
            every second span (the 2nd, 4th, ...) is shaded as a background
            band, so band *width* visually encodes the compute spent in each
            span.

    Returns:
        The rendered figure as PNG bytes (300 dpi, white background).

    The bundled Instrument Sans font is registered best-effort and used with
    DejaVu Sans as a fallback. The figure is closed even if rendering fails.
    """
    import matplotlib

    # Select the non-interactive backend before pyplot is imported.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    _register_bundled_font()

    has_usage = False
    cost_available = True
    xs_line: list[float] = [0.0]
    ys_line: list[float] = [0.0]
    xs_current: list[float] = [0.0]
    ys_current: list[float] = [0.0]
    marker_xs: list[float] = []

    best_so_far = 0.0
    for ev in events:
        if ev.usage is None:
            continue
        has_usage = True
        x, available = cumulative_cost(ev.usage, model)
        # A single event without cost information switches the whole axis
        # label to the token fallback.
        cost_available = cost_available and available
        if ev.event_type == "score_update":
            best_so_far = max(best_so_far, ev.score)
            xs_line.append(x)
            ys_line.append(best_so_far)
            xs_current.append(x)
            ys_current.append(ev.score)
        elif marker_event_kind is not None and ev.event_type == marker_event_kind:
            marker_xs.append(x)
            # Break the current-score line at episodic boundaries so it
            # renders as separate segments per attempt instead of a vertical
            # drop back to the new attempt's starting floor.
            xs_current.append(x)
            ys_current.append(float("nan"))

    rc_overrides = {
        "font.family": _BUNDLED_FONT_FAMILY,
        "font.size": 13,
        "axes.labelsize": 14,
        "axes.titlesize": 15,
        "xtick.labelsize": 12,
        "ytick.labelsize": 12,
        "legend.fontsize": 11,
        "axes.linewidth": 0.8,
        "xtick.major.width": 0.5,
        "ytick.major.width": 0.5,
        "xtick.major.size": 0,
        "ytick.major.size": 0,
    }
    with plt.rc_context(rc_overrides):
        fig, ax = plt.subplots(figsize=(10, 6))
        # pyplot keeps every open figure in a global registry, so close it in
        # `finally`; otherwise a failure while drawing or saving leaks it.
        try:
            if current_score_label is not None:
                ax.plot(
                    xs_current,
                    ys_current,
                    "--",
                    color=_LEAD_GREEN_500,
                    linewidth=1.5,
                    label=current_score_label,
                    zorder=1,
                )
            ax.plot(
                xs_line,
                ys_line,
                "-",
                color=_GREEN_700,
                linewidth=2,
                label=line_label,
                zorder=2,
            )
            if marker_xs:
                # Render each marker_event_kind span as a background band. Band
                # *width* encodes the compute spent in that span, so clustering
                # naturally shows as a squeeze of narrow bands.
                boundaries = sorted(marker_xs)
                span_count = len(boundaries)
                finite_xs = xs_line + [v for v in xs_current if not math.isnan(v)] + marker_xs
                # The last span runs to the right-most x value seen anywhere.
                boundaries.append(max(finite_xs) if finite_xs else 0.0)
                # Shade only every second span to get alternating bands.
                for k in range(1, span_count, 2):
                    ax.axvspan(
                        boundaries[k],
                        boundaries[k + 1],
                        color=_GRAY_300,
                        alpha=0.25,
                        zorder=0,
                    )

            x_label = x_label_money if (has_usage and cost_available) else x_label_tokens
            ax.set_xlabel(x_label, color=_GRAY_800)
            ax.set_ylabel(y_label, color=_GRAY_800, rotation=90)
            ax.set_ylim(0, 1.05)
            ax.set_xlim(left=0)

            ax.spines["top"].set_visible(False)
            ax.spines["right"].set_visible(False)
            ax.spines["bottom"].set_color(_GRAY_700)
            ax.spines["left"].set_color(_GRAY_700)
            ax.spines["bottom"].set_linewidth(0.8)
            ax.spines["left"].set_linewidth(0.8)
            ax.tick_params(colors=_GRAY_700)

            ax.grid(
                True,
                color=_GRAY_300,
                linewidth=0.8,
                linestyle=(0, (4, 2)),
                zorder=0,
            )
            ax.set_axisbelow(True)

            ax.set_title(title, color=_GRAY_900, fontweight="medium", pad=12)
            legend = ax.legend(
                loc="upper left",
                frameon=True,
                fancybox=False,
                edgecolor=_GRAY_300,
                framealpha=1.0,
                borderpad=0.6,
            )
            legend.get_frame().set_linewidth(0.5)
            legend.get_frame().set_facecolor("white")

            buf = io.BytesIO()
            fig.savefig(
                buf,
                format="png",
                dpi=300,
                bbox_inches="tight",
                facecolor="white",
            )
        finally:
            plt.close(fig)
        return buf.getvalue()