inspect-eval-utils 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/PKG-INFO +47 -1
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/README.md +46 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/pyproject.toml +1 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_cli.py +3 -1
- inspect_eval_utils-1.3.0/src/inspect_eval_utils/report/plot.py +234 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/scaffolder.py +142 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/tool_cli/__init__.py +6 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/tool_cli/_mechanism.py +226 -50
- inspect_eval_utils-1.2.0/src/inspect_eval_utils/report/plot.py +0 -219
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/.gitignore +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/LICENSE +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/__init__.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_detect.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/pyproject.toml +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/__init__.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/_registry.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/assets/instructions.md +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/py.typed +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/Dockerfile +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/compose.yaml +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/task.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/_templates/default/src/metr_tasks/template/version.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/artifacts.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/common/__init__.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/common/sandbox_files.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/common/task_secrets.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/py.typed +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/__init__.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/assets/InstrumentSans.ttf +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/assets/OFL.txt +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/cost.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/events.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/html.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/__init__.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/_context.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/_types.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/_utils.py +0 -0
- {inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/tool_cli/_setting.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: inspect-eval-utils
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Shared utilities for METR Inspect AI eval repos: task scaffolder + common runtime helpers.
|
|
5
5
|
Project-URL: Repository, https://github.com/METR/inspect-eval-utils
|
|
6
6
|
Project-URL: Issues, https://github.com/METR/inspect-eval-utils/issues
|
|
@@ -297,6 +297,31 @@ tools call <tool-name> --json-args '{"arg": "value"}'
|
|
|
297
297
|
The CLI keeps a short cache for list/help/completion metadata, but tool calls
|
|
298
298
|
refresh the current `ToolSource` before execution.
|
|
299
299
|
|
|
300
|
+
#### Running the tool CLI from a task setup solver
|
|
301
|
+
|
|
302
|
+
Use `start_tool_cli` to expose `Setting`/task tools as a `tools` command for the
|
|
303
|
+
agent in one line. It installs the CLI, starts the RPC service in the background,
|
|
304
|
+
and returns once it's ready (raising if startup fails):
|
|
305
|
+
|
|
306
|
+
```python
|
|
307
|
+
from inspect_eval_utils.tool_cli import start_tool_cli
|
|
308
|
+
from inspect_ai.util import sandbox
|
|
309
|
+
|
|
310
|
+
@solver
|
|
311
|
+
def setup() -> Solver:
|
|
312
|
+
async def solve(state: TaskState, generate: Generate) -> TaskState:
|
|
313
|
+
await start_tool_cli(MY_TOOLS, sandbox("default"), user="agent")
|
|
314
|
+
return state
|
|
315
|
+
return solve
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
The command is resolved two ways: *interactive* shells (e.g. `human_cli`) pick it
|
|
319
|
+
up via a `.bashrc` alias + tab-completion; *non-interactive* shells (the model
|
|
320
|
+
agent's `bash()` tool, `sandbox.exec`) find it on `PATH` at
|
|
321
|
+
`/usr/local/bin/<command_name>`. Pass `on_path=False` to skip the PATH wrapper, or
|
|
322
|
+
`bin_dir=...` to relocate it. `run_tool_cli_service` and `setting_tool_cli_running`
|
|
323
|
+
install the PATH wrapper too (default-on).
|
|
324
|
+
|
|
300
325
|
#### Common mistakes
|
|
301
326
|
|
|
302
327
|
- **Listing infrastructure sandboxes as Workspaces.** Only list sandboxes the
|
|
@@ -407,6 +432,27 @@ It does NOT modify `[tool.uv.workspace].members` — that's typically a glob lik
|
|
|
407
432
|
common surprise — the scaffolder modifies a file outside `tasks/my_eval/`, so
|
|
408
433
|
review the diff before committing.
|
|
409
434
|
|
|
435
|
+
### Generated eval-set config
|
|
436
|
+
|
|
437
|
+
The scaffolder also writes a minimal Hawk eval-set skeleton to
|
|
438
|
+
`eval_sets/<name>.eval-set.yaml` (creating `eval_sets/` if needed). This is the
|
|
439
|
+
config you run a batch grid with:
|
|
440
|
+
|
|
441
|
+
```bash
|
|
442
|
+
hawk eval-set eval_sets/my_eval.eval-set.yaml
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
The task `package` URL is derived from the target repo's git `origin` remote and
|
|
446
|
+
current branch, e.g.
|
|
447
|
+
`git+ssh://git@github.com/METR/<repo>@<branch>#subdirectory=tasks/my_eval`. When
|
|
448
|
+
the metadata can't be determined, a TODO marker is left in its place:
|
|
449
|
+
|
|
450
|
+
- no `origin` remote → the whole `package` value is a `TODO:` string,
|
|
451
|
+
- detached HEAD (no branch) → the ref becomes `TODO-set-ref`.
|
|
452
|
+
|
|
453
|
+
The skeleton is intentionally minimal (one model, one solver). An existing
|
|
454
|
+
`eval_sets/<name>.eval-set.yaml` is only overwritten with `--force`.
|
|
455
|
+
|
|
410
456
|
### How substitution works
|
|
411
457
|
|
|
412
458
|
The scaffolder rewrites two things in the same pass:
|
|
@@ -272,6 +272,31 @@ tools call <tool-name> --json-args '{"arg": "value"}'
|
|
|
272
272
|
The CLI keeps a short cache for list/help/completion metadata, but tool calls
|
|
273
273
|
refresh the current `ToolSource` before execution.
|
|
274
274
|
|
|
275
|
+
#### Running the tool CLI from a task setup solver
|
|
276
|
+
|
|
277
|
+
Use `start_tool_cli` to expose `Setting`/task tools as a `tools` command for the
|
|
278
|
+
agent in one line. It installs the CLI, starts the RPC service in the background,
|
|
279
|
+
and returns once it's ready (raising if startup fails):
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
from inspect_eval_utils.tool_cli import start_tool_cli
|
|
283
|
+
from inspect_ai.util import sandbox
|
|
284
|
+
|
|
285
|
+
@solver
|
|
286
|
+
def setup() -> Solver:
|
|
287
|
+
async def solve(state: TaskState, generate: Generate) -> TaskState:
|
|
288
|
+
await start_tool_cli(MY_TOOLS, sandbox("default"), user="agent")
|
|
289
|
+
return state
|
|
290
|
+
return solve
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
The command is resolved two ways: *interactive* shells (e.g. `human_cli`) pick it
|
|
294
|
+
up via a `.bashrc` alias + tab-completion; *non-interactive* shells (the model
|
|
295
|
+
agent's `bash()` tool, `sandbox.exec`) find it on `PATH` at
|
|
296
|
+
`/usr/local/bin/<command_name>`. Pass `on_path=False` to skip the PATH wrapper, or
|
|
297
|
+
`bin_dir=...` to relocate it. `run_tool_cli_service` and `setting_tool_cli_running`
|
|
298
|
+
install the PATH wrapper too (default-on).
|
|
299
|
+
|
|
275
300
|
#### Common mistakes
|
|
276
301
|
|
|
277
302
|
- **Listing infrastructure sandboxes as Workspaces.** Only list sandboxes the
|
|
@@ -382,6 +407,27 @@ It does NOT modify `[tool.uv.workspace].members` — that's typically a glob lik
|
|
|
382
407
|
common surprise — the scaffolder modifies a file outside `tasks/my_eval/`, so
|
|
383
408
|
review the diff before committing.
|
|
384
409
|
|
|
410
|
+
### Generated eval-set config
|
|
411
|
+
|
|
412
|
+
The scaffolder also writes a minimal Hawk eval-set skeleton to
|
|
413
|
+
`eval_sets/<name>.eval-set.yaml` (creating `eval_sets/` if needed). This is the
|
|
414
|
+
config you run a batch grid with:
|
|
415
|
+
|
|
416
|
+
```bash
|
|
417
|
+
hawk eval-set eval_sets/my_eval.eval-set.yaml
|
|
418
|
+
```
|
|
419
|
+
|
|
420
|
+
The task `package` URL is derived from the target repo's git `origin` remote and
|
|
421
|
+
current branch, e.g.
|
|
422
|
+
`git+ssh://git@github.com/METR/<repo>@<branch>#subdirectory=tasks/my_eval`. When
|
|
423
|
+
the metadata can't be determined, a TODO marker is left in its place:
|
|
424
|
+
|
|
425
|
+
- no `origin` remote → the whole `package` value is a `TODO:` string,
|
|
426
|
+
- detached HEAD (no branch) → the ref becomes `TODO-set-ref`.
|
|
427
|
+
|
|
428
|
+
The skeleton is intentionally minimal (one model, one solver). An existing
|
|
429
|
+
`eval_sets/<name>.eval-set.yaml` is only overwritten with `--force`.
|
|
430
|
+
|
|
385
431
|
### How substitution works
|
|
386
432
|
|
|
387
433
|
The scaffolder rewrites two things in the same pass:
|
|
@@ -49,7 +49,7 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
49
49
|
parser.add_argument(
|
|
50
50
|
"--force",
|
|
51
51
|
action="store_true",
|
|
52
|
-
help="Overwrite an existing tasks/<name>/",
|
|
52
|
+
help="Overwrite an existing tasks/<name>/ and eval_sets/<name>.eval-set.yaml",
|
|
53
53
|
)
|
|
54
54
|
args = parser.parse_args(argv)
|
|
55
55
|
|
|
@@ -85,6 +85,8 @@ def main(argv: list[str] | None = None) -> None:
|
|
|
85
85
|
print(f" cd {target_dir}")
|
|
86
86
|
print(" uv sync --group tasks")
|
|
87
87
|
print(f" uv run inspect eval {snake} --model mockllm/replay --limit 1")
|
|
88
|
+
print(f"Also generated eval_sets/{snake}.eval-set.yaml (Hawk batch config).")
|
|
89
|
+
print(f" Batch run: hawk eval-set eval_sets/{snake}.eval-set.yaml")
|
|
88
90
|
|
|
89
91
|
|
|
90
92
|
if __name__ == "__main__":
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"""Render the score-vs-cost matplotlib plot as PNG bytes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import logging
|
|
7
|
+
import math
|
|
8
|
+
import threading
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
from importlib.resources import files
|
|
11
|
+
|
|
12
|
+
from inspect_eval_utils.report.cost import cumulative_cost
|
|
13
|
+
from inspect_eval_utils.report.events import ReportEvent
|
|
14
|
+
|
|
15
|
+
# Matplotlib logs "generated new fontManager" at INFO the first time its font
# cache is built. Quiet it so eval scoring transcripts stay clean.
logging.getLogger("matplotlib.font_manager").setLevel(logging.WARNING)

# Color palette derived from the METR May 2026 brand guide.
_LEAD_GREEN_500 = "#589885"  # dashed current-score line
_GREEN_700 = "#2A6912"  # solid best-score line
_GRAY_300 = "#D9DCE2"  # grid lines, span bands, legend frame edge
_GRAY_700 = "#3D424D"  # axis spines and tick labels
_GRAY_800 = "#282C33"  # axis labels
_GRAY_900 = "#1B1D22"  # plot title

# Font family preference list handed to matplotlib: the bundled font first,
# then DejaVu Sans as the fallback when the bundled font is unavailable.
_BUNDLED_FONT_FAMILY = ["Instrument Sans", "DejaVu Sans"]

# Guards the one-time mutation of matplotlib's global font registry so
# concurrent build_plot callers don't race the check-then-addfont below.
_FONT_LOCK = threading.Lock()
# Set once registration has succeeded; lets the common case skip the lock.
_font_registered = False
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _register_bundled_font() -> None:
    """Make the vendored Instrument Sans TTF known to matplotlib (best-effort).

    Does nothing once a previous call has succeeded. The module-level flag is
    checked once without the lock and once more while holding it, so only the
    first successful caller mutates matplotlib's font registry and later
    callers return without contending on the lock. A missing or unreadable
    asset is swallowed silently and the flag stays unset.
    """
    global _font_registered
    if _font_registered:
        return

    from matplotlib import font_manager

    with _FONT_LOCK:
        # Re-check under the lock: another caller may have completed the
        # registration while this one was waiting.
        if _font_registered:
            return

        manager = font_manager.fontManager
        already_known = any(entry.name == "Instrument Sans" for entry in manager.ttflist)
        if not already_known:
            try:
                bundled_ttf = files("inspect_eval_utils.report") / "assets" / "InstrumentSans.ttf"
                manager.addfont(str(bundled_ttf))
            except Exception:  # noqa: BLE001
                # Registration failed; keep the flag False so the next call
                # tries again. Rendering meanwhile relies on the DejaVu Sans
                # entry in the family list.
                return
        _font_registered = True
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def build_plot(
    events: Sequence[ReportEvent],
    *,
    model: str,
    title: str,
    y_label: str,
    line_label: str = "Best score",
    current_score_label: str | None = None,
    x_label_money: str = "Cumulative model cost ($)",
    x_label_tokens: str = "Cumulative tokens (cost unavailable)",
    marker_event_kind: str | None,
) -> bytes:
    """Render the score-vs-cost plot as PNG bytes.

    The line plots best-so-far `score_update` values, starting at `(0, 0)`,
    against cumulative model cost for `model`. If Inspect AI has no pricing for
    the model, the x-axis falls back to cumulative token count instead.

    `title`, `y_label`, `line_label`, `x_label_money`, and `x_label_tokens`
    provide the plot, legend, and axis copy.

    `marker_event_kind` selects which non-score events delimit episodic spans
    (e.g. `"attempt_start"`); pass `None` to disable. When set, the plot area
    is shaded into alternating background bands — one per span — so band
    *width* visually encodes the compute spent in each span.

    When `current_score_label` is provided, a second (non-monotonic) line is
    drawn through the raw per-event score values and labelled accordingly in
    the legend.

    The bundled Instrument Sans font is registered best-effort and used with
    DejaVu Sans as a fallback. Returns PNG bytes.

    Args:
        events: Report events in plotting order. Events whose `usage` is
            `None` are skipped entirely.
        model: Model name forwarded to `cumulative_cost` for every event.
        title: Plot title.
        y_label: Y-axis label.
        line_label: Legend label of the best-so-far line.
        current_score_label: Legend label of the raw-score line; `None`
            omits that line.
        x_label_money: X-axis label used when every plotted event reported
            cost as available.
        x_label_tokens: X-axis label used otherwise (including when no event
            carried usage).
        marker_event_kind: Event type that starts a new span, or `None`.
            Required keyword: it has no default.

    Returns:
        The rendered figure encoded as PNG.
    """
    # Imported lazily so importing this module does not import matplotlib.
    from matplotlib.backends.backend_agg import FigureCanvasAgg
    from matplotlib.figure import Figure
    from matplotlib.font_manager import FontProperties

    _register_bundled_font()
    font_family = _BUNDLED_FONT_FAMILY

    has_usage = False
    cost_available = True
    # Both series are seeded with the origin so the lines start at (0, 0).
    xs_line: list[float] = [0.0]
    ys_line: list[float] = [0.0]
    xs_current: list[float] = [0.0]
    ys_current: list[float] = [0.0]
    marker_xs: list[float] = []

    best_so_far = 0.0
    for ev in events:
        if ev.usage is None:
            continue
        has_usage = True
        x, available = cumulative_cost(ev.usage, model)
        # A single event without cost information switches the x-axis label
        # to the token wording for the whole plot.
        cost_available = cost_available and available
        if ev.event_type == "score_update":
            best_so_far = max(best_so_far, ev.score)
            xs_line.append(x)
            ys_line.append(best_so_far)
            xs_current.append(x)
            ys_current.append(ev.score)
        elif marker_event_kind is not None and ev.event_type == marker_event_kind:
            marker_xs.append(x)
            # Break the current-score line at episodic boundaries so it
            # renders as separate segments per attempt instead of a vertical
            # drop back to the new attempt's starting floor.
            xs_current.append(x)
            ys_current.append(float("nan"))

    label_font = FontProperties(family=font_family, size=14)
    title_font = FontProperties(family=font_family, size=15, weight="medium")
    legend_font = FontProperties(family=font_family, size=11)

    # Object-oriented (non-pyplot) API: a standalone Figure with an explicit
    # Agg canvas keeps this function thread-safe. pyplot's global figure
    # registry and `rc_context`'s process-wide rcParams mutation both race
    # under concurrent calls, so we render off a local Figure and apply every
    # style per-artist instead of via global rcParams.
    fig = Figure(figsize=(10, 6))
    FigureCanvasAgg(fig)  # attaches an Agg canvas (sets fig.canvas)
    ax = fig.subplots()

    if current_score_label is not None:
        # Drawn first and with the lower zorder so the best-score line sits
        # on top of it.
        ax.plot(
            xs_current,
            ys_current,
            "--",
            color=_LEAD_GREEN_500,
            linewidth=1.5,
            label=current_score_label,
            zorder=1,
        )
    ax.plot(
        xs_line,
        ys_line,
        "-",
        color=_GREEN_700,
        linewidth=2,
        label=line_label,
        zorder=2,
    )
    if marker_xs:
        # Render each marker_event_kind span as a background band. Band
        # *width* encodes the compute spent in that span, so clustering
        # naturally shows as a squeeze of narrow bands.
        sorted_starts = sorted(marker_xs)
        # NOTE(review): the NaN breaks are stored in ys_current, not
        # xs_current, so this isnan filter presumably never drops a value;
        # likewise xs_line always holds the 0.0 seed, so the `else 0.0`
        # fallback looks unreachable — confirm before simplifying.
        finite_xs = xs_line + [v for v in xs_current if not math.isnan(v)] + marker_xs
        band_end = max(finite_xs) if finite_xs else 0.0
        boundaries = sorted_starts + [band_end]
        for k in range(len(sorted_starts)):
            # Only every second span is shaded; the unshaded spans in between
            # produce the alternating pattern.
            if k % 2 == 1:
                ax.axvspan(
                    boundaries[k],
                    boundaries[k + 1],
                    color=_GRAY_300,
                    alpha=0.25,
                    zorder=0,
                )

    x_label = x_label_money if (has_usage and cost_available) else x_label_tokens
    ax.set_xlabel(x_label, color=_GRAY_800, fontproperties=label_font)
    ax.set_ylabel(y_label, color=_GRAY_800, rotation=90, fontproperties=label_font)
    # NOTE(review): the fixed y-range assumes scores lie in [0, 1] — confirm.
    ax.set_ylim(0, 1.05)
    ax.set_xlim(left=0)

    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_color(_GRAY_700)
    ax.spines["left"].set_color(_GRAY_700)
    ax.spines["bottom"].set_linewidth(0.8)
    ax.spines["left"].set_linewidth(0.8)
    ax.tick_params(
        colors=_GRAY_700,
        labelsize=12,
        width=0.5,
        length=0,
        labelfontfamily=font_family,
    )

    ax.grid(
        True,
        color=_GRAY_300,
        linewidth=0.8,
        linestyle=(0, (4, 2)),
        zorder=0,
    )
    ax.set_axisbelow(True)

    ax.set_title(title, color=_GRAY_900, fontproperties=title_font, pad=12)
    legend = ax.legend(
        loc="upper left",
        frameon=True,
        fancybox=False,
        edgecolor=_GRAY_300,
        framealpha=1.0,
        borderpad=0.6,
        prop=legend_font,
    )
    legend.get_frame().set_linewidth(0.5)
    legend.get_frame().set_facecolor("white")

    # Encode into an in-memory buffer; nothing is written to disk.
    buf = io.BytesIO()
    fig.savefig(
        buf,
        format="png",
        dpi=300,
        bbox_inches="tight",
        facecolor="white",
    )
    return buf.getvalue()
|
|
@@ -308,6 +308,132 @@ def render_readme(*, snake: str, description: str) -> str:
|
|
|
308
308
|
return README_TEMPLATE.format(snake=snake, description=description)
|
|
309
309
|
|
|
310
310
|
|
|
311
|
+
EVAL_SET_TEMPLATE = """\
|
|
312
|
+
name: {name}
|
|
313
|
+
tasks:
|
|
314
|
+
- package: "{package_url}"
|
|
315
|
+
name: {namespace}
|
|
316
|
+
items:
|
|
317
|
+
- name: {name}
|
|
318
|
+
args: []
|
|
319
|
+
|
|
320
|
+
epochs: 4
|
|
321
|
+
token_limit: 40000000
|
|
322
|
+
|
|
323
|
+
models:
|
|
324
|
+
- package: anthropic
|
|
325
|
+
name: anthropic
|
|
326
|
+
items:
|
|
327
|
+
- name: claude-opus-4-5-20251101
|
|
328
|
+
args:
|
|
329
|
+
config:
|
|
330
|
+
max_tokens: 32000
|
|
331
|
+
reasoning_tokens: 16000
|
|
332
|
+
max_connections: 60
|
|
333
|
+
|
|
334
|
+
solvers:
|
|
335
|
+
- package: "git+https://github.com/METR/inspect-agents@metr_agents/v0.3.5#subdirectory=packages/agents"
|
|
336
|
+
name: metr_agents
|
|
337
|
+
items:
|
|
338
|
+
- name: react
|
|
339
|
+
args:
|
|
340
|
+
tools:
|
|
341
|
+
required:
|
|
342
|
+
- inspect_ai/bash
|
|
343
|
+
- metr_agents/set_timeout
|
|
344
|
+
optional:
|
|
345
|
+
- inspect_ai/python
|
|
346
|
+
truncation: disabled
|
|
347
|
+
compaction: CompactionSummary
|
|
348
|
+
compaction_threshold: 0.75
|
|
349
|
+
"""
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def render_eval_set(*, name: str, namespace: str, package_url: str) -> str:
    """Fill the Hawk eval-set skeleton template for a scaffolded task.

    The three values are substituted verbatim into `EVAL_SET_TEMPLATE`; no
    YAML escaping is applied.
    """
    substitutions = {
        "name": name,
        "namespace": namespace,
        "package_url": package_url,
    }
    return EVAL_SET_TEMPLATE.format(**substitutions)
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _read_origin_url(git_dir: Path) -> str | None:
|
|
358
|
+
"""Return the `[remote "origin"] url` value from a .git/config, or None.
|
|
359
|
+
|
|
360
|
+
Hand-parsed rather than via configparser: git indents entries with tabs,
|
|
361
|
+
which configparser misreads as multi-line value continuations.
|
|
362
|
+
"""
|
|
363
|
+
config_path = git_dir / "config"
|
|
364
|
+
if not config_path.is_file():
|
|
365
|
+
return None
|
|
366
|
+
try:
|
|
367
|
+
lines = config_path.read_text().splitlines()
|
|
368
|
+
except (OSError, UnicodeDecodeError):
|
|
369
|
+
return None
|
|
370
|
+
in_origin = False
|
|
371
|
+
for line in lines:
|
|
372
|
+
stripped = line.strip()
|
|
373
|
+
if stripped.startswith("[") and stripped.endswith("]"):
|
|
374
|
+
in_origin = stripped.replace(" ", "") == '[remote"origin"]'
|
|
375
|
+
continue
|
|
376
|
+
if in_origin and "=" in stripped:
|
|
377
|
+
key, _, value = stripped.partition("=")
|
|
378
|
+
if key.strip() == "url":
|
|
379
|
+
return value.strip()
|
|
380
|
+
return None
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def _read_current_branch(git_dir: Path) -> str | None:
|
|
384
|
+
"""Return the current branch name from .git/HEAD, or None if detached/missing."""
|
|
385
|
+
head_path = git_dir / "HEAD"
|
|
386
|
+
if not head_path.is_file():
|
|
387
|
+
return None
|
|
388
|
+
try:
|
|
389
|
+
content = head_path.read_text().strip()
|
|
390
|
+
except (OSError, UnicodeDecodeError):
|
|
391
|
+
return None
|
|
392
|
+
prefix = "ref: refs/heads/"
|
|
393
|
+
if content.startswith(prefix):
|
|
394
|
+
return content[len(prefix) :]
|
|
395
|
+
return None
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _parse_remote_url(url: str) -> tuple[str, str] | None:
|
|
399
|
+
"""Parse a git remote URL into (host, 'org/repo'). None if unrecognized."""
|
|
400
|
+
url = url.strip()
|
|
401
|
+
if url.endswith(".git"):
|
|
402
|
+
url = url[:-4]
|
|
403
|
+
for pattern in (
|
|
404
|
+
r"^git@([^:]+):(.+)$",
|
|
405
|
+
r"^ssh://git@([^/]+)/(.+)$",
|
|
406
|
+
r"^https://([^/]+)/(.+)$",
|
|
407
|
+
):
|
|
408
|
+
m = re.match(pattern, url)
|
|
409
|
+
if m:
|
|
410
|
+
return m.group(1), m.group(2)
|
|
411
|
+
return None
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def derive_package_url(target_dir: Path, task_name: str) -> str:
    """Compose the eval-set task package URL from the target repo's git metadata.

    The result has the form `git+ssh://...#subdirectory=tasks/<task_name>`.
    Whatever cannot be read from `<target_dir>/.git` is replaced by a visible
    TODO marker rather than a guess:
      - origin remote missing or unparseable -> the entire value is a TODO string
      - no current branch (detached HEAD)    -> the ref slot is `TODO-set-ref`
    """
    subdirectory = f"#subdirectory=tasks/{task_name}"
    git_dir = target_dir / ".git"

    origin = _read_origin_url(git_dir)
    host_and_path = _parse_remote_url(origin) if origin else None
    if host_and_path is None:
        example = "git+ssh://git@github.com/<org>/<repo>@<branch>" + subdirectory
        return "TODO: set git+ssh package URL, e.g. " + example

    host, repo_path = host_and_path
    ref = _read_current_branch(git_dir)
    if not ref:
        ref = "TODO-set-ref"
    return f"git+ssh://git@{host}/{repo_path}@{ref}{subdirectory}"
|
|
435
|
+
|
|
436
|
+
|
|
311
437
|
def edit_root_pyproject(src: str, *, target_pkg_name: str, new_task_dir_name: str) -> str:
|
|
312
438
|
"""Add the new task to dependency-groups.tasks and tool.uv.sources, and
|
|
313
439
|
ensure [tool.uv.workspace].members covers tasks/<new_task_dir_name>.
|
|
@@ -461,6 +587,12 @@ def scaffold_into(
|
|
|
461
587
|
new_task_dir_name=target.new_task_name,
|
|
462
588
|
)
|
|
463
589
|
|
|
590
|
+
# Validate the eval-set destination up front too, so a conflict aborts
|
|
591
|
+
# before any file writes (mirrors the dest_root / root-pyproject checks).
|
|
592
|
+
eval_set_path = target_dir / "eval_sets" / f"{target.new_task_name}.eval-set.yaml"
|
|
593
|
+
if eval_set_path.exists() and not force:
|
|
594
|
+
sys.exit(f"{eval_set_path} already exists (use --force to overwrite)")
|
|
595
|
+
|
|
464
596
|
if dest_root.exists():
|
|
465
597
|
if not force:
|
|
466
598
|
sys.exit(f"{dest_root} already exists (use --force to overwrite)")
|
|
@@ -518,5 +650,15 @@ def scaffold_into(
|
|
|
518
650
|
# Write the (already-validated) edited root pyproject.toml.
|
|
519
651
|
root_pyproject.write_text(new_root_pyproject)
|
|
520
652
|
|
|
653
|
+
# Generated eval-set skeleton at the repo root (not inside tasks/<name>/).
|
|
654
|
+
eval_set_path.parent.mkdir(parents=True, exist_ok=True)
|
|
655
|
+
eval_set_path.write_text(
|
|
656
|
+
render_eval_set(
|
|
657
|
+
name=target.new_task_name,
|
|
658
|
+
namespace=target.namespace,
|
|
659
|
+
package_url=derive_package_url(target_dir, target.new_task_name),
|
|
660
|
+
)
|
|
661
|
+
)
|
|
662
|
+
|
|
521
663
|
# Audit.
|
|
522
664
|
audit_generated_tree(dest_root, source=source)
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/tool_cli/__init__.py
RENAMED
|
@@ -7,13 +7,19 @@ in the sandbox shell.
|
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
9
|
from inspect_eval_utils.tool_cli._mechanism import (
|
|
10
|
+
generate_tool_cli_script,
|
|
10
11
|
install_tool_cli,
|
|
11
12
|
run_tool_cli_service,
|
|
13
|
+
start_tool_cli,
|
|
14
|
+
tool_cli_service_methods,
|
|
12
15
|
)
|
|
13
16
|
from inspect_eval_utils.tool_cli._setting import setting_tool_cli_running
|
|
14
17
|
|
|
15
18
|
__all__ = [
|
|
19
|
+
"generate_tool_cli_script",
|
|
16
20
|
"install_tool_cli",
|
|
17
21
|
"run_tool_cli_service",
|
|
18
22
|
"setting_tool_cli_running",
|
|
23
|
+
"start_tool_cli",
|
|
24
|
+
"tool_cli_service_methods",
|
|
19
25
|
]
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/tool_cli/_mechanism.py
RENAMED
|
@@ -5,6 +5,7 @@ with an RPC bridge back to the host for actual tool execution.
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import json
|
|
8
|
+
import logging
|
|
8
9
|
import re
|
|
9
10
|
import shlex
|
|
10
11
|
import time
|
|
@@ -16,10 +17,19 @@ import anyio
|
|
|
16
17
|
from inspect_ai.model import ChatMessage, ChatMessageAssistant, ChatMessageTool, execute_tools
|
|
17
18
|
from inspect_ai.tool import Tool, ToolCall, ToolDef, ToolSource
|
|
18
19
|
from inspect_ai.tool._tool_def import tool_defs
|
|
19
|
-
from inspect_ai.util import
|
|
20
|
+
from inspect_ai.util import (
|
|
21
|
+
SandboxEnvironment,
|
|
22
|
+
background,
|
|
23
|
+
sandbox_service,
|
|
24
|
+
)
|
|
25
|
+
from inspect_ai.util import (
|
|
26
|
+
sandbox as _get_sandbox,
|
|
27
|
+
)
|
|
20
28
|
from inspect_ai.util._sandbox.service import SandboxServiceMethod
|
|
21
29
|
from pydantic import JsonValue
|
|
22
30
|
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
23
33
|
|
|
24
34
|
class _ToolCliResolver:
|
|
25
35
|
def __init__(
|
|
@@ -62,6 +72,8 @@ async def install_tool_cli(
|
|
|
62
72
|
service_name: str = "tool_cli",
|
|
63
73
|
install_dir: str = "/opt/tool_cli",
|
|
64
74
|
user: str | None = None,
|
|
75
|
+
on_path: bool = True,
|
|
76
|
+
bin_dir: str = "/usr/local/bin",
|
|
65
77
|
) -> dict[str, SandboxServiceMethod]:
|
|
66
78
|
"""Generate a CLI script, install it into a sandbox, and return service methods.
|
|
67
79
|
|
|
@@ -75,6 +87,9 @@ async def install_tool_cli(
|
|
|
75
87
|
service_name: Name for the sandbox service (used for RPC).
|
|
76
88
|
install_dir: Directory in the sandbox to install the CLI script.
|
|
77
89
|
user: Sandbox user to install as.
|
|
90
|
+
on_path: Install a wrapper for the command in ``bin_dir`` so it resolves on
|
|
91
|
+
PATH for non-interactive shells (e.g. the agent's bash() tool).
|
|
92
|
+
bin_dir: Directory on PATH to install the wrapper into.
|
|
78
93
|
|
|
79
94
|
Returns:
|
|
80
95
|
A dict of service methods to pass to ``sandbox_service()``.
|
|
@@ -89,6 +104,8 @@ async def install_tool_cli(
|
|
|
89
104
|
command_name=command_name,
|
|
90
105
|
install_dir=install_dir,
|
|
91
106
|
user=user,
|
|
107
|
+
on_path=on_path,
|
|
108
|
+
bin_dir=bin_dir,
|
|
92
109
|
)
|
|
93
110
|
|
|
94
111
|
return methods
|
|
@@ -103,6 +120,8 @@ async def run_tool_cli_service(
|
|
|
103
120
|
service_name: str = "tool_cli",
|
|
104
121
|
install_dir: str = "/opt/tool_cli",
|
|
105
122
|
user: str | None = None,
|
|
123
|
+
on_path: bool = True,
|
|
124
|
+
bin_dir: str = "/usr/local/bin",
|
|
106
125
|
polling_interval: float | None = None,
|
|
107
126
|
started: anyio.Event | None = None,
|
|
108
127
|
) -> None:
|
|
@@ -118,6 +137,9 @@ async def run_tool_cli_service(
|
|
|
118
137
|
service_name: Name for the sandbox service (used for RPC).
|
|
119
138
|
install_dir: Directory in the sandbox to install the CLI script.
|
|
120
139
|
user: Sandbox user to install as.
|
|
140
|
+
on_path: Install a wrapper for the command in ``bin_dir`` so it resolves on
|
|
141
|
+
PATH for non-interactive shells (e.g. the agent's bash() tool).
|
|
142
|
+
bin_dir: Directory on PATH to install the wrapper into.
|
|
121
143
|
polling_interval: Polling interval for RPC request checking.
|
|
122
144
|
started: Event set once the sandbox service is ready.
|
|
123
145
|
"""
|
|
@@ -128,6 +150,8 @@ async def run_tool_cli_service(
|
|
|
128
150
|
service_name=service_name,
|
|
129
151
|
install_dir=install_dir,
|
|
130
152
|
user=user,
|
|
153
|
+
on_path=on_path,
|
|
154
|
+
bin_dir=bin_dir,
|
|
131
155
|
)
|
|
132
156
|
await sandbox_service(
|
|
133
157
|
service_name,
|
|
@@ -140,6 +164,98 @@ async def run_tool_cli_service(
|
|
|
140
164
|
)
|
|
141
165
|
|
|
142
166
|
|
|
167
|
+
async def start_tool_cli(
|
|
168
|
+
tools: Sequence[Tool | ToolDef | ToolSource],
|
|
169
|
+
sandbox: SandboxEnvironment | None = None,
|
|
170
|
+
*,
|
|
171
|
+
command_name: str = "tools",
|
|
172
|
+
service_name: str = "tool_cli",
|
|
173
|
+
install_dir: str = "/opt/tool_cli",
|
|
174
|
+
user: str | None = None,
|
|
175
|
+
on_path: bool = True,
|
|
176
|
+
bin_dir: str = "/usr/local/bin",
|
|
177
|
+
polling_interval: float | None = None,
|
|
178
|
+
) -> None:
|
|
179
|
+
"""Install the tool CLI and run its sandbox service in the background.
|
|
180
|
+
|
|
181
|
+
Fire-and-forget helper for task **setup solvers**: it installs the CLI in the
|
|
182
|
+
foreground (so install errors propagate to you), starts the RPC service in the
|
|
183
|
+
background, and returns once the service is ready. The service then runs until
|
|
184
|
+
the sample ends. By default the command is exposed on PATH (see ``on_path``) so
|
|
185
|
+
the model agent's non-interactive ``bash()`` tool can run it.
|
|
186
|
+
|
|
187
|
+
Unlike a bare ``background(run_tool_cli_service(...))`` + ``started.wait()``,
|
|
188
|
+
this surfaces startup failures as an exception instead of hanging.
|
|
189
|
+
|
|
190
|
+
Args:
|
|
191
|
+
tools: Tools to expose as CLI commands.
|
|
192
|
+
sandbox: Sandbox to install into. Defaults to ``sandbox("default")``.
|
|
193
|
+
command_name: Name of the CLI command (and the PATH wrapper).
|
|
194
|
+
service_name: Sandbox-service name used for RPC.
|
|
195
|
+
install_dir: Directory in the sandbox to install the CLI script.
|
|
196
|
+
user: Sandbox user the service runs as (e.g. the agent's user).
|
|
197
|
+
on_path: Expose ``command_name`` on PATH (default True).
|
|
198
|
+
bin_dir: Directory on PATH for the wrapper.
|
|
199
|
+
polling_interval: RPC request polling interval.
|
|
200
|
+
|
|
201
|
+
Example:
|
|
202
|
+
```python
|
|
203
|
+
@solver
|
|
204
|
+
def setup() -> Solver:
|
|
205
|
+
async def solve(state: TaskState, generate: Generate) -> TaskState:
|
|
206
|
+
await start_tool_cli(MY_TOOLS, sandbox("default"), user="agent")
|
|
207
|
+
return state
|
|
208
|
+
return solve
|
|
209
|
+
```
|
|
210
|
+
"""
|
|
211
|
+
sbx = sandbox if sandbox is not None else _get_sandbox("default")
|
|
212
|
+
|
|
213
|
+
# Foreground: install errors propagate to the caller (no deadlock).
|
|
214
|
+
methods = await install_tool_cli(
|
|
215
|
+
tools,
|
|
216
|
+
sbx,
|
|
217
|
+
command_name=command_name,
|
|
218
|
+
service_name=service_name,
|
|
219
|
+
install_dir=install_dir,
|
|
220
|
+
user=user,
|
|
221
|
+
on_path=on_path,
|
|
222
|
+
bin_dir=bin_dir,
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
started = anyio.Event()
|
|
226
|
+
startup_error: dict[str, BaseException] = {}
|
|
227
|
+
|
|
228
|
+
async def _serve() -> None:
|
|
229
|
+
try:
|
|
230
|
+
await sandbox_service(
|
|
231
|
+
service_name,
|
|
232
|
+
methods,
|
|
233
|
+
lambda: False, # run for the lifetime of the sample
|
|
234
|
+
sbx,
|
|
235
|
+
user=user,
|
|
236
|
+
polling_interval=polling_interval,
|
|
237
|
+
started=started,
|
|
238
|
+
)
|
|
239
|
+
except anyio.get_cancelled_exc_class():
|
|
240
|
+
raise
|
|
241
|
+
except BaseException as exc: # noqa: BLE001 - re-raised on the caller's task
|
|
242
|
+
if not started.is_set():
|
|
243
|
+
# Startup failure: record it and unblock the waiter so the caller
|
|
244
|
+
# raises a clean error instead of hanging on started.wait().
|
|
245
|
+
startup_error["error"] = exc
|
|
246
|
+
started.set()
|
|
247
|
+
else:
|
|
248
|
+
# Failure after startup: let background() log/propagate it.
|
|
249
|
+
raise
|
|
250
|
+
|
|
251
|
+
background(_serve)
|
|
252
|
+
await started.wait()
|
|
253
|
+
if "error" in startup_error:
|
|
254
|
+
raise RuntimeError(f"tool_cli service {service_name!r} failed to start") from startup_error[
|
|
255
|
+
"error"
|
|
256
|
+
]
|
|
257
|
+
|
|
258
|
+
|
|
143
259
|
def generate_tool_cli_script(service_name: str = "tool_cli") -> str:
|
|
144
260
|
"""Generate a Python CLI script that calls tools via sandbox service RPC.
|
|
145
261
|
|
|
@@ -211,7 +327,7 @@ def _add_dynamic_arg(parser, name, param, required):
|
|
|
211
327
|
parser.add_argument(flag, dest=dest, nargs="?", const=True, default=None, type=_parse_bool, help=description)
|
|
212
328
|
return
|
|
213
329
|
if type_str in ("array", "object"):
|
|
214
|
-
parser.add_argument(_flag_name(name), dest=dest, type=str, required=required, default=None
|
|
330
|
+
parser.add_argument(_flag_name(name), dest=dest, type=str, required=required, default=None, help=description)
|
|
215
331
|
return
|
|
216
332
|
type_map = {{"string": str, "integer": int, "number": float}}
|
|
217
333
|
py_type = type_map.get(type_str or "string", str)
|
|
@@ -277,15 +393,20 @@ def _required_bool_names(tool):
|
|
|
277
393
|
|
|
278
394
|
|
|
279
395
|
def _call_rpc(method, *args, **kwargs):
|
|
396
|
+
# The RPC client is keyword-only after `method`; pass args by parameter name.
|
|
280
397
|
try:
|
|
281
398
|
if method == "list_tools":
|
|
282
399
|
return call_{service_name}('list_tools')
|
|
283
400
|
if method == "describe_tool":
|
|
284
|
-
return call_{service_name}('describe_tool',
|
|
401
|
+
return call_{service_name}('describe_tool', tool_name=args[0])
|
|
285
402
|
if method == "describe_tool_for_call":
|
|
286
|
-
return call_{service_name}('describe_tool_for_call',
|
|
403
|
+
return call_{service_name}('describe_tool_for_call', tool_name=args[0])
|
|
287
404
|
if method == "call_tool":
|
|
288
|
-
|
|
405
|
+
if len(args) > 2:
|
|
406
|
+
return call_{service_name}(
|
|
407
|
+
'call_tool', tool_name=args[0], arguments=args[1], snapshot_token=args[2]
|
|
408
|
+
)
|
|
409
|
+
return call_{service_name}('call_tool', tool_name=args[0], arguments=args[1])
|
|
289
410
|
return call_{service_name}(method, *args, **kwargs)
|
|
290
411
|
except Exception as exc:
|
|
291
412
|
print(str(exc), file=sys.stderr)
|
|
@@ -472,6 +593,30 @@ def _check_duplicate_tool_names(tool_defs_list: Sequence[ToolDef]) -> None:
|
|
|
472
593
|
raise ValueError(f"Duplicate tool names: {names}")
|
|
473
594
|
|
|
474
595
|
|
|
596
|
+
class _SnapshotStore:
|
|
597
|
+
"""Bounded token->snapshot store; evicts oldest entries past ``max_size``.
|
|
598
|
+
|
|
599
|
+
Guards against unbounded growth when a CLI ``call`` is abandoned between
|
|
600
|
+
``describe_tool_for_call`` (which stores a snapshot) and ``call_tool``
|
|
601
|
+
(which pops it).
|
|
602
|
+
"""
|
|
603
|
+
|
|
604
|
+
def __init__(self, max_size: int = 128) -> None:
|
|
605
|
+
self._max = max_size
|
|
606
|
+
self._data: dict[str, list[ToolDef]] = {}
|
|
607
|
+
|
|
608
|
+
def put(self, token: str, value: list[ToolDef]) -> None:
|
|
609
|
+
self._data[token] = value
|
|
610
|
+
while len(self._data) > self._max:
|
|
611
|
+
del self._data[next(iter(self._data))] # dicts preserve insertion order
|
|
612
|
+
|
|
613
|
+
def pop(self, token: str) -> list[ToolDef] | None:
|
|
614
|
+
return self._data.pop(token, None)
|
|
615
|
+
|
|
616
|
+
def __len__(self) -> int:
|
|
617
|
+
return len(self._data)
|
|
618
|
+
|
|
619
|
+
|
|
475
620
|
def tool_cli_service_methods(
|
|
476
621
|
tools: Sequence[Tool | ToolDef | ToolSource],
|
|
477
622
|
*,
|
|
@@ -487,7 +632,7 @@ def tool_cli_service_methods(
|
|
|
487
632
|
A dict mapping method names to async handler functions.
|
|
488
633
|
"""
|
|
489
634
|
resolver = _ToolCliResolver(tools, cache_ttl=cache_ttl)
|
|
490
|
-
call_snapshots
|
|
635
|
+
call_snapshots = _SnapshotStore()
|
|
491
636
|
|
|
492
637
|
async def list_tools() -> JsonValue:
|
|
493
638
|
resolved = await resolver.resolve(use_cache=True)
|
|
@@ -509,7 +654,7 @@ def tool_cli_service_methods(
|
|
|
509
654
|
if td is None:
|
|
510
655
|
raise ValueError(f"Unknown tool: {tool_name}")
|
|
511
656
|
snapshot_token = uuid4().hex
|
|
512
|
-
call_snapshots
|
|
657
|
+
call_snapshots.put(snapshot_token, resolved)
|
|
513
658
|
description = _tool_description(td)
|
|
514
659
|
description["_call_snapshot"] = snapshot_token
|
|
515
660
|
return description
|
|
@@ -522,7 +667,7 @@ def tool_cli_service_methods(
|
|
|
522
667
|
if snapshot_token is None:
|
|
523
668
|
resolved = await resolver.resolve(use_cache=False)
|
|
524
669
|
else:
|
|
525
|
-
resolved = call_snapshots.pop(snapshot_token
|
|
670
|
+
resolved = call_snapshots.pop(snapshot_token)
|
|
526
671
|
if resolved is None:
|
|
527
672
|
resolved = await resolver.resolve(use_cache=False)
|
|
528
673
|
tools_by_name = _tools_by_name(resolved)
|
|
@@ -633,10 +778,18 @@ async def _install_script(
|
|
|
633
778
|
command_name: str,
|
|
634
779
|
install_dir: str,
|
|
635
780
|
user: str | None,
|
|
781
|
+
on_path: bool = True,
|
|
782
|
+
bin_dir: str = "/usr/local/bin",
|
|
636
783
|
) -> None:
|
|
637
784
|
"""Install the CLI script into the sandbox."""
|
|
638
785
|
_validate_command_name(command_name)
|
|
639
786
|
|
|
787
|
+
# Validate python3 before any writes so a missing interpreter fails cleanly
|
|
788
|
+
# (the CLI script and PATH wrapper both invoke python3).
|
|
789
|
+
python_check = await sandbox.exec(["sh", "-c", "command -v python3"], user=user)
|
|
790
|
+
if not python_check.success:
|
|
791
|
+
raise RuntimeError("tool_cli requires python3 in the sandbox but none was found on PATH.")
|
|
792
|
+
|
|
640
793
|
# create install dir
|
|
641
794
|
await _checked_exec(sandbox, ["mkdir", "-p", install_dir], user="root")
|
|
642
795
|
if user and user != "root":
|
|
@@ -648,55 +801,78 @@ async def _install_script(
|
|
|
648
801
|
await _checked_exec(sandbox, ["tee", "--", script_path], input=script, user=user)
|
|
649
802
|
await _checked_exec(sandbox, ["chmod", "+x", script_path], user=user)
|
|
650
803
|
|
|
651
|
-
#
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
804
|
+
# Expose the command on PATH so non-interactive shells (the model agent's
|
|
805
|
+
# bash() tool) can find it; the .bashrc alias only helps interactive shells.
|
|
806
|
+
# Written as root because /usr/local/bin is not writable by the agent user.
|
|
807
|
+
if on_path:
|
|
808
|
+
wrapper_path = f"{bin_dir}/{command_name}"
|
|
809
|
+
wrapper = f'#!/bin/sh\nexec python3 {shlex.quote(script_path)} "$@"\n'
|
|
810
|
+
await _checked_exec(sandbox, ["mkdir", "-p", bin_dir], user="root")
|
|
811
|
+
await _checked_exec(sandbox, ["tee", "--", wrapper_path], input=wrapper, user="root")
|
|
812
|
+
await _checked_exec(sandbox, ["chmod", "+x", wrapper_path], user="root")
|
|
813
|
+
|
|
814
|
+
# Interactive shell alias + tab completion (best-effort: only benefits the
|
|
815
|
+
# interactive human_cli shell; the PATH wrapper is what model agents use).
|
|
816
|
+
try:
|
|
817
|
+
# determine user's home directory for .bashrc
|
|
818
|
+
if user:
|
|
819
|
+
result = await sandbox.exec(["getent", "passwd", user], user=user)
|
|
820
|
+
if result.success and result.stdout.strip():
|
|
821
|
+
fields = result.stdout.strip().split(":")
|
|
822
|
+
home_dir = fields[5] if len(fields) > 5 and fields[5] else f"/home/{user}"
|
|
823
|
+
else:
|
|
824
|
+
home_dir = f"/home/{user}"
|
|
657
825
|
else:
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
await _checked_exec(
|
|
684
|
-
sandbox,
|
|
685
|
-
["tee", "--", shell_setup_path],
|
|
686
|
-
input=bashrc_addition,
|
|
687
|
-
user=user,
|
|
688
|
-
)
|
|
826
|
+
result = await sandbox.exec(["bash", "-c", "echo $HOME"], user=user)
|
|
827
|
+
home_dir = (
|
|
828
|
+
result.stdout.strip() if result.success and result.stdout.strip() else "/root"
|
|
829
|
+
)
|
|
830
|
+
|
|
831
|
+
# build bash alias and tab completion
|
|
832
|
+
shell_setup_path = f"{home_dir}/.tool_cli_bashrc"
|
|
833
|
+
shell_setup_source = (
|
|
834
|
+
f"[ -f {shlex.quote(shell_setup_path)} ] && . {shlex.quote(shell_setup_path)}"
|
|
835
|
+
)
|
|
836
|
+
bashrc_addition = dedent(f"""
|
|
837
|
+
# Tool CLI alias and completion
|
|
838
|
+
alias {command_name}={shlex.quote(f"python3 {script_path}")}
|
|
839
|
+
|
|
840
|
+
_{command_name}_completion() {{
|
|
841
|
+
local cur candidate
|
|
842
|
+
cur="${{COMP_WORDS[COMP_CWORD]}}"
|
|
843
|
+
COMPREPLY=()
|
|
844
|
+
while IFS= read -r candidate; do
|
|
845
|
+
[[ $candidate == "$cur"* ]] && COMPREPLY+=("$candidate")
|
|
846
|
+
done < <(python3 {shlex.quote(script_path)} __complete "$COMP_CWORD" "${{COMP_WORDS[@]}}" 2>/dev/null)
|
|
847
|
+
}}
|
|
848
|
+
complete -F _{command_name}_completion {command_name}
|
|
849
|
+
""")
|
|
689
850
|
|
|
690
|
-
bashrc_path = f"{home_dir}/.bashrc"
|
|
691
|
-
result = await sandbox.exec(["grep", "-qxF", shell_setup_source, bashrc_path], user=user)
|
|
692
|
-
if not result.success:
|
|
693
851
|
await _checked_exec(
|
|
694
852
|
sandbox,
|
|
695
|
-
["tee", "
|
|
696
|
-
input=
|
|
853
|
+
["tee", "--", shell_setup_path],
|
|
854
|
+
input=bashrc_addition,
|
|
697
855
|
user=user,
|
|
698
856
|
)
|
|
699
857
|
|
|
858
|
+
bashrc_path = f"{home_dir}/.bashrc"
|
|
859
|
+
result = await sandbox.exec(["grep", "-qxF", shell_setup_source, bashrc_path], user=user)
|
|
860
|
+
if not result.success:
|
|
861
|
+
await _checked_exec(
|
|
862
|
+
sandbox,
|
|
863
|
+
["tee", "-a", bashrc_path],
|
|
864
|
+
input=f"\n{shell_setup_source}\n",
|
|
865
|
+
user=user,
|
|
866
|
+
)
|
|
867
|
+
except Exception as exc: # noqa: BLE001 - alias is best-effort
|
|
868
|
+
logger.warning(
|
|
869
|
+
"tool_cli: could not install the interactive shell alias (%s); "
|
|
870
|
+
"the %r command is still available on PATH.",
|
|
871
|
+
exc,
|
|
872
|
+
command_name,
|
|
873
|
+
exc_info=True,
|
|
874
|
+
)
|
|
875
|
+
|
|
700
876
|
|
|
701
877
|
async def _checked_exec(
|
|
702
878
|
sandbox: SandboxEnvironment,
|
|
@@ -1,219 +0,0 @@
|
|
|
1
|
-
# Matplotlib's API is partially untyped; these suppressions apply only to
|
|
2
|
-
# build_plot below.
|
|
3
|
-
# pyright: reportUnknownMemberType=false
|
|
4
|
-
# pyright: reportUnknownVariableType=false
|
|
5
|
-
"""Render the score-vs-cost matplotlib plot as PNG bytes."""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
import io
|
|
10
|
-
import logging
|
|
11
|
-
import math
|
|
12
|
-
from collections.abc import Sequence
|
|
13
|
-
from importlib.resources import files
|
|
14
|
-
|
|
15
|
-
from inspect_eval_utils.report.cost import cumulative_cost
|
|
16
|
-
from inspect_eval_utils.report.events import ReportEvent
|
|
17
|
-
|
|
18
|
-
# Matplotlib logs "generated new fontManager" at INFO the first time its font
|
|
19
|
-
# cache is built. Quiet it so eval scoring transcripts stay clean.
|
|
20
|
-
logging.getLogger("matplotlib.font_manager").setLevel(logging.WARNING)
|
|
21
|
-
|
|
22
|
-
# Color palette derived from the METR May 2026 brand guide.
|
|
23
|
-
_LEAD_GREEN_500 = "#589885"
|
|
24
|
-
_GREEN_700 = "#2A6912"
|
|
25
|
-
_GRAY_300 = "#D9DCE2"
|
|
26
|
-
_GRAY_700 = "#3D424D"
|
|
27
|
-
_GRAY_800 = "#282C33"
|
|
28
|
-
_GRAY_900 = "#1B1D22"
|
|
29
|
-
|
|
30
|
-
_BUNDLED_FONT_FAMILY = ["Instrument Sans", "DejaVu Sans"]
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def _register_bundled_font() -> None:
|
|
34
|
-
"""Register the vendored Instrument Sans TTF with matplotlib (best-effort).
|
|
35
|
-
|
|
36
|
-
Quietly returns if already registered or if the asset is missing.
|
|
37
|
-
"""
|
|
38
|
-
from matplotlib import font_manager
|
|
39
|
-
|
|
40
|
-
installed = {f.name for f in font_manager.fontManager.ttflist}
|
|
41
|
-
if "Instrument Sans" in installed:
|
|
42
|
-
return
|
|
43
|
-
try:
|
|
44
|
-
font_path = files("inspect_eval_utils.report") / "assets" / "InstrumentSans.ttf"
|
|
45
|
-
font_manager.fontManager.addfont(str(font_path))
|
|
46
|
-
except Exception: # noqa: BLE001
|
|
47
|
-
# Asset missing or unreadable; caller can still proceed with the
|
|
48
|
-
# DejaVu Sans fallback that matplotlib supplies.
|
|
49
|
-
return
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def build_plot(
|
|
53
|
-
events: Sequence[ReportEvent],
|
|
54
|
-
*,
|
|
55
|
-
model: str,
|
|
56
|
-
title: str,
|
|
57
|
-
y_label: str,
|
|
58
|
-
line_label: str = "Best score",
|
|
59
|
-
current_score_label: str | None = None,
|
|
60
|
-
x_label_money: str = "Cumulative model cost ($)",
|
|
61
|
-
x_label_tokens: str = "Cumulative tokens (cost unavailable)",
|
|
62
|
-
marker_event_kind: str | None,
|
|
63
|
-
) -> bytes:
|
|
64
|
-
"""Render the score-vs-cost plot as PNG bytes.
|
|
65
|
-
|
|
66
|
-
The line plots best-so-far `score_update` values, starting at `(0, 0)`,
|
|
67
|
-
against cumulative model cost for `model`. If Inspect AI has no pricing for
|
|
68
|
-
the model, the x-axis falls back to cumulative token count instead.
|
|
69
|
-
|
|
70
|
-
`title`, `y_label`, `line_label`, `x_label_money`, and `x_label_tokens`
|
|
71
|
-
provide the plot, legend, and axis copy.
|
|
72
|
-
|
|
73
|
-
`marker_event_kind` selects which non-score events delimit episodic spans
|
|
74
|
-
(e.g. `"attempt_start"`); pass `None` to disable. When set, the plot area
|
|
75
|
-
is shaded into alternating background bands — one per span — so band
|
|
76
|
-
*width* visually encodes the compute spent in each span.
|
|
77
|
-
|
|
78
|
-
When `current_score_label` is provided, a second (non-monotonic) line is
|
|
79
|
-
drawn through the raw per-event score values and labelled accordingly in
|
|
80
|
-
the legend.
|
|
81
|
-
|
|
82
|
-
The bundled Instrument Sans font is registered best-effort and used with
|
|
83
|
-
DejaVu Sans as a fallback. Returns PNG bytes.
|
|
84
|
-
"""
|
|
85
|
-
import matplotlib
|
|
86
|
-
|
|
87
|
-
matplotlib.use("Agg")
|
|
88
|
-
import matplotlib.pyplot as plt
|
|
89
|
-
|
|
90
|
-
_register_bundled_font()
|
|
91
|
-
font_family = _BUNDLED_FONT_FAMILY
|
|
92
|
-
|
|
93
|
-
has_usage = False
|
|
94
|
-
cost_available = True
|
|
95
|
-
xs_line: list[float] = [0.0]
|
|
96
|
-
ys_line: list[float] = [0.0]
|
|
97
|
-
xs_current: list[float] = [0.0]
|
|
98
|
-
ys_current: list[float] = [0.0]
|
|
99
|
-
marker_xs: list[float] = []
|
|
100
|
-
|
|
101
|
-
best_so_far = 0.0
|
|
102
|
-
for ev in events:
|
|
103
|
-
if ev.usage is None:
|
|
104
|
-
continue
|
|
105
|
-
has_usage = True
|
|
106
|
-
x, available = cumulative_cost(ev.usage, model)
|
|
107
|
-
cost_available = cost_available and available
|
|
108
|
-
if ev.event_type == "score_update":
|
|
109
|
-
best_so_far = max(best_so_far, ev.score)
|
|
110
|
-
xs_line.append(x)
|
|
111
|
-
ys_line.append(best_so_far)
|
|
112
|
-
xs_current.append(x)
|
|
113
|
-
ys_current.append(ev.score)
|
|
114
|
-
elif marker_event_kind is not None and ev.event_type == marker_event_kind:
|
|
115
|
-
marker_xs.append(x)
|
|
116
|
-
# Break the current-score line at episodic boundaries so it
|
|
117
|
-
# renders as separate segments per attempt instead of a vertical
|
|
118
|
-
# drop back to the new attempt's starting floor.
|
|
119
|
-
xs_current.append(x)
|
|
120
|
-
ys_current.append(float("nan"))
|
|
121
|
-
|
|
122
|
-
rc_overrides = {
|
|
123
|
-
"font.family": font_family,
|
|
124
|
-
"font.size": 13,
|
|
125
|
-
"axes.labelsize": 14,
|
|
126
|
-
"axes.titlesize": 15,
|
|
127
|
-
"xtick.labelsize": 12,
|
|
128
|
-
"ytick.labelsize": 12,
|
|
129
|
-
"legend.fontsize": 11,
|
|
130
|
-
"axes.linewidth": 0.8,
|
|
131
|
-
"xtick.major.width": 0.5,
|
|
132
|
-
"ytick.major.width": 0.5,
|
|
133
|
-
"xtick.major.size": 0,
|
|
134
|
-
"ytick.major.size": 0,
|
|
135
|
-
}
|
|
136
|
-
with plt.rc_context(rc_overrides):
|
|
137
|
-
fig, ax = plt.subplots(figsize=(10, 6))
|
|
138
|
-
if current_score_label is not None:
|
|
139
|
-
ax.plot(
|
|
140
|
-
xs_current,
|
|
141
|
-
ys_current,
|
|
142
|
-
"--",
|
|
143
|
-
color=_LEAD_GREEN_500,
|
|
144
|
-
linewidth=1.5,
|
|
145
|
-
label=current_score_label,
|
|
146
|
-
zorder=1,
|
|
147
|
-
)
|
|
148
|
-
ax.plot(
|
|
149
|
-
xs_line,
|
|
150
|
-
ys_line,
|
|
151
|
-
"-",
|
|
152
|
-
color=_GREEN_700,
|
|
153
|
-
linewidth=2,
|
|
154
|
-
label=line_label,
|
|
155
|
-
zorder=2,
|
|
156
|
-
)
|
|
157
|
-
if marker_xs:
|
|
158
|
-
# Render each marker_event_kind span as a background band. Band
|
|
159
|
-
# *width* encodes the compute spent in that span, so clustering
|
|
160
|
-
# naturally shows as a squeeze of narrow bands.
|
|
161
|
-
sorted_starts = sorted(marker_xs)
|
|
162
|
-
finite_xs = xs_line + [v for v in xs_current if not math.isnan(v)] + marker_xs
|
|
163
|
-
band_end = max(finite_xs) if finite_xs else 0.0
|
|
164
|
-
boundaries = sorted_starts + [band_end]
|
|
165
|
-
for k in range(len(sorted_starts)):
|
|
166
|
-
if k % 2 == 1:
|
|
167
|
-
ax.axvspan(
|
|
168
|
-
boundaries[k],
|
|
169
|
-
boundaries[k + 1],
|
|
170
|
-
color=_GRAY_300,
|
|
171
|
-
alpha=0.25,
|
|
172
|
-
zorder=0,
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
x_label = x_label_money if (has_usage and cost_available) else x_label_tokens
|
|
176
|
-
ax.set_xlabel(x_label, color=_GRAY_800)
|
|
177
|
-
ax.set_ylabel(y_label, color=_GRAY_800, rotation=90)
|
|
178
|
-
ax.set_ylim(0, 1.05)
|
|
179
|
-
ax.set_xlim(left=0)
|
|
180
|
-
|
|
181
|
-
ax.spines["top"].set_visible(False)
|
|
182
|
-
ax.spines["right"].set_visible(False)
|
|
183
|
-
ax.spines["bottom"].set_color(_GRAY_700)
|
|
184
|
-
ax.spines["left"].set_color(_GRAY_700)
|
|
185
|
-
ax.spines["bottom"].set_linewidth(0.8)
|
|
186
|
-
ax.spines["left"].set_linewidth(0.8)
|
|
187
|
-
ax.tick_params(colors=_GRAY_700)
|
|
188
|
-
|
|
189
|
-
ax.grid(
|
|
190
|
-
True,
|
|
191
|
-
color=_GRAY_300,
|
|
192
|
-
linewidth=0.8,
|
|
193
|
-
linestyle=(0, (4, 2)),
|
|
194
|
-
zorder=0,
|
|
195
|
-
)
|
|
196
|
-
ax.set_axisbelow(True)
|
|
197
|
-
|
|
198
|
-
ax.set_title(title, color=_GRAY_900, fontweight="medium", pad=12)
|
|
199
|
-
legend = ax.legend(
|
|
200
|
-
loc="upper left",
|
|
201
|
-
frameon=True,
|
|
202
|
-
fancybox=False,
|
|
203
|
-
edgecolor=_GRAY_300,
|
|
204
|
-
framealpha=1.0,
|
|
205
|
-
borderpad=0.6,
|
|
206
|
-
)
|
|
207
|
-
legend.get_frame().set_linewidth(0.5)
|
|
208
|
-
legend.get_frame().set_facecolor("white")
|
|
209
|
-
|
|
210
|
-
buf = io.BytesIO()
|
|
211
|
-
fig.savefig(
|
|
212
|
-
buf,
|
|
213
|
-
format="png",
|
|
214
|
-
dpi=300,
|
|
215
|
-
bbox_inches="tight",
|
|
216
|
-
facecolor="white",
|
|
217
|
-
)
|
|
218
|
-
plt.close(fig)
|
|
219
|
-
return buf.getvalue()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/common/__init__.py
RENAMED
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/common/sandbox_files.py
RENAMED
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/common/task_secrets.py
RENAMED
|
File without changes
|
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/assets/OFL.txt
RENAMED
|
File without changes
|
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/report/events.py
RENAMED
|
File without changes
|
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/__init__.py
RENAMED
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/_context.py
RENAMED
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/_types.py
RENAMED
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/setting/_utils.py
RENAMED
|
File without changes
|
{inspect_eval_utils-1.2.0 → inspect_eval_utils-1.3.0}/src/inspect_eval_utils/tool_cli/_setting.py
RENAMED
|
File without changes
|