inspect-eval-utils 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. inspect_eval_utils/__init__.py +1 -0
  2. inspect_eval_utils/_cli.py +86 -0
  3. inspect_eval_utils/_detect.py +156 -0
  4. inspect_eval_utils/_templates/default/pyproject.toml +16 -0
  5. inspect_eval_utils/_templates/default/src/metr_tasks/template/__init__.py +6 -0
  6. inspect_eval_utils/_templates/default/src/metr_tasks/template/_registry.py +3 -0
  7. inspect_eval_utils/_templates/default/src/metr_tasks/template/assets/instructions.md +6 -0
  8. inspect_eval_utils/_templates/default/src/metr_tasks/template/py.typed +0 -0
  9. inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/Dockerfile +14 -0
  10. inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/compose.yaml +12 -0
  11. inspect_eval_utils/_templates/default/src/metr_tasks/template/task.py +51 -0
  12. inspect_eval_utils/_templates/default/src/metr_tasks/template/version.py +1 -0
  13. inspect_eval_utils/common/__init__.py +31 -0
  14. inspect_eval_utils/common/sandbox_files.py +153 -0
  15. inspect_eval_utils/common/task_secrets.py +154 -0
  16. inspect_eval_utils/py.typed +0 -0
  17. inspect_eval_utils/report/__init__.py +25 -0
  18. inspect_eval_utils/report/assets/InstrumentSans.ttf +0 -0
  19. inspect_eval_utils/report/assets/OFL.txt +93 -0
  20. inspect_eval_utils/report/cost.py +23 -0
  21. inspect_eval_utils/report/events.py +62 -0
  22. inspect_eval_utils/report/html.py +86 -0
  23. inspect_eval_utils/report/plot.py +219 -0
  24. inspect_eval_utils/report/writer.py +68 -0
  25. inspect_eval_utils/scaffolder.py +509 -0
  26. inspect_eval_utils/setting/__init__.py +23 -0
  27. inspect_eval_utils/setting/_context.py +50 -0
  28. inspect_eval_utils/setting/_types.py +104 -0
  29. inspect_eval_utils/setting/_utils.py +64 -0
  30. inspect_eval_utils/tool_cli/__init__.py +19 -0
  31. inspect_eval_utils/tool_cli/_mechanism.py +715 -0
  32. inspect_eval_utils/tool_cli/_setting.py +55 -0
  33. inspect_eval_utils-0.4.0.dist-info/METADATA +521 -0
  34. inspect_eval_utils-0.4.0.dist-info/RECORD +37 -0
  35. inspect_eval_utils-0.4.0.dist-info/WHEEL +4 -0
  36. inspect_eval_utils-0.4.0.dist-info/entry_points.txt +2 -0
  37. inspect_eval_utils-0.4.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1 @@
1
+ """inspect-eval-utils: shared utilities for METR Inspect AI eval repos."""
@@ -0,0 +1,86 @@
1
+ """CLI entry point for the new_task scaffolder."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ from inspect_eval_utils._detect import detect_target_context, detect_template_context
10
+ from inspect_eval_utils.scaffolder import (
11
+ canonical_template_path,
12
+ normalize_name,
13
+ scaffold_into,
14
+ )
15
+
16
+
17
def _resolve_template(target_dir: Path, override: Path | None) -> Path:
    """Choose the template directory to scaffold from.

    Precedence:
      1. ``override`` when one is given (returned as-is, without any checks);
      2. ``<target_dir>/tasks/template`` when that directory exists;
      3. whatever ``canonical_template_path()`` returns.
    """
    if override is None:
        repo_template = target_dir.joinpath("tasks", "template")
        return repo_template if repo_template.is_dir() else canonical_template_path()
    return override
24
+
25
+
26
def main(argv: list[str] | None = None) -> None:
    """Run the ``new_task`` command line: parse arguments and scaffold a task.

    The target must be a directory containing a ``pyproject.toml``. The
    template is chosen by ``_resolve_template``; a task name equal to the
    template's own name is refused. Namespace and project prefix are resolved
    by ``detect_target_context`` and the actual copy is delegated to
    ``scaffold_into``. Validation failures terminate through ``sys.exit``
    with a message; on success a short list of follow-up commands is printed.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` is passed
            through unchanged.
    """
    cli = argparse.ArgumentParser(
        prog="new_task",
        description="Scaffold a new Inspect AI task.",
    )
    cli.add_argument("name", help="Task name (snake_case or kebab-case)")
    cli.add_argument(
        "--target",
        type=Path,
        default=Path.cwd(),
        help="Target repo (default: current directory)",
    )
    cli.add_argument(
        "--template",
        type=Path,
        default=None,
        help="Custom template directory (default: <target>/tasks/template/, else canonical)",
    )
    cli.add_argument(
        "--namespace",
        default=None,
        help="Override target's Python namespace",
    )
    cli.add_argument(
        "--project-prefix",
        default=None,
        help="Override target's project name prefix",
    )
    cli.add_argument("--description", default="TODO: describe this eval")
    cli.add_argument(
        "--force",
        action="store_true",
        help="Overwrite an existing tasks/<name>/",
    )
    opts = cli.parse_args(argv)

    # Validate the target repo before touching the template.
    repo_root = opts.target.resolve()
    if not repo_root.is_dir():
        sys.exit(f"target is not a directory: {repo_root}")
    manifest = repo_root / "pyproject.toml"
    if not manifest.is_file():
        sys.exit(f"target has no pyproject.toml: {repo_root}")

    task_snake, _ = normalize_name(opts.name)
    template_root = _resolve_template(repo_root, opts.template)
    template_ctx = detect_template_context(template_root)
    if task_snake == template_ctx.template_name:
        sys.exit(
            f"task name {task_snake!r} matches the template name; choose a different name"
        )

    target_ctx = detect_target_context(
        repo_root,
        new_task_name=task_snake,
        override_namespace=opts.namespace,
        override_prefix=opts.project_prefix,
    )

    scaffold_into(
        template_dir=template_root,
        target_dir=repo_root,
        source=template_ctx,
        target=target_ctx,
        description=opts.description,
        force=opts.force,
    )

    for line in (
        f"Created tasks/{task_snake}/ in {repo_root}.",
        "Next steps:",
        f" cd {repo_root}",
        " uv sync --group tasks",
        f" uv run inspect eval {task_snake} --model mockllm/replay --limit 1",
    ):
        print(line)
84
+
85
+ if __name__ == "__main__":
86
+ main()
@@ -0,0 +1,156 @@
1
+ """Detect TemplateContext / TargetContext from on-disk repos."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import tomlkit
9
+ import tomlkit.exceptions
10
+
11
+ from inspect_eval_utils.scaffolder import TargetContext, TemplateContext
12
+
13
+
14
def detect_template_context(template_dir: Path) -> TemplateContext:
    """Infer (namespace, prefix, template_name) from the template directory.

    The template must be laid out as
    ``<template_dir>/src/<namespace>/<template_name>/`` with exactly one
    directory at each of those two levels, next to a ``pyproject.toml`` whose
    ``project.name`` ends with the kebab-case template name. Whatever
    precedes that suffix becomes the project prefix (it may be empty).

    Args:
        template_dir: Root directory of the template.

    Returns:
        A ``TemplateContext`` carrying the detected namespace, project prefix
        and template name.

    Every layout problem terminates the process through ``sys.exit`` with a
    descriptive message rather than raising.
    """
    src_dir = template_dir / "src"
    if not src_dir.is_dir():
        sys.exit(f"template missing src/ directory: {template_dir}")

    # Find unique src/<NAMESPACE>/<TEMPLATE>/.
    namespace_dirs = [p for p in src_dir.iterdir() if p.is_dir()]
    if len(namespace_dirs) != 1:
        sys.exit(
            f"expected exactly one namespace directory under {src_dir}, "
            f"found {len(namespace_dirs)}: {[p.name for p in namespace_dirs]}"
        )
    namespace_dir = namespace_dirs[0]
    namespace = namespace_dir.name

    template_dirs = [p for p in namespace_dir.iterdir() if p.is_dir()]
    if len(template_dirs) != 1:
        sys.exit(
            f"expected exactly one template directory under {namespace_dir}, "
            f"found {len(template_dirs)}"
        )
    template_name = template_dirs[0].name

    pyproject = template_dir / "pyproject.toml"
    if not pyproject.is_file():
        sys.exit(f"template missing pyproject.toml: {pyproject}")
    # TOML documents are UTF-8 by specification; do not depend on the locale.
    doc = tomlkit.parse(pyproject.read_text(encoding="utf-8"))
    try:
        name = str(doc["project"]["name"])  # type: ignore[index]
    except (KeyError, TypeError):
        # Missing [project] table / name key (tomlkit's NonExistentKey is a
        # KeyError) or a non-table "project" value: report it the same way as
        # every other layout problem instead of leaking a traceback.
        sys.exit(f"template's pyproject.toml has no project.name: {pyproject}")

    template_name_kebab = template_name.replace("_", "-")
    if not name.endswith(template_name_kebab):
        sys.exit(
            f"template's project.name {name!r} doesn't end with "
            f"the kebab template name {template_name_kebab!r}"
        )
    project_prefix = name[: -len(template_name_kebab)]

    return TemplateContext(
        namespace=namespace,
        project_prefix=project_prefix,
        template_name=template_name,
    )
55
+
56
+
57
def detect_target_context(
    target_dir: Path,
    *,
    new_task_name: str,
    override_namespace: str | None = None,
    override_prefix: str | None = None,
) -> TargetContext:
    """Resolve target's namespace and project prefix.

    Namespace order: ``override_namespace`` -> ``[tool.task-scaffolder]``
    config in ``<target_dir>/pyproject.toml`` -> first existing task under
    ``<target_dir>/tasks`` whose layout can be read.

    Prefix order: ``override_prefix`` always wins when given, regardless of
    where the namespace came from. Otherwise the prefix comes from the same
    source as the namespace (``project-prefix`` in the config, or the
    existing task's project name minus its kebab task name), falling back to
    the namespace with ``_`` replaced by ``-`` plus a trailing ``-``.

    Args:
        target_dir: Root of the target repository.
        new_task_name: Name recorded in the returned context.
        override_namespace: Explicit namespace; skips all detection.
        override_prefix: Explicit project-name prefix.

    Returns:
        A ``TargetContext`` for the new task.

    If nothing can be determined the process terminates through ``sys.exit``
    with instructions, listing every existing task that was skipped and why.
    """
    if override_namespace is not None:
        prefix = override_prefix
        if prefix is None:
            prefix = override_namespace.replace("_", "-") + "-"
        return TargetContext(
            namespace=override_namespace,
            project_prefix=prefix,
            new_task_name=new_task_name,
        )

    pyproject = target_dir / "pyproject.toml"
    if pyproject.is_file():
        # TOML documents are UTF-8 by specification; do not depend on the locale.
        doc = tomlkit.parse(pyproject.read_text(encoding="utf-8"))
        scaffolder_cfg = doc.get("tool", {}).get("task-scaffolder")  # type: ignore[union-attr]
        if scaffolder_cfg is not None:
            try:
                ns = str(scaffolder_cfg["namespace"])  # type: ignore[index]
            except tomlkit.exceptions.NonExistentKey:
                sys.exit(
                    f"[tool.task-scaffolder] in {pyproject} is missing required "
                    "key 'namespace'.\nExpected:\n"
                    " [tool.task-scaffolder]\n"
                    ' namespace = "your_namespace"'
                )
            if override_prefix is not None:
                # An explicit prefix must not be dropped just because the
                # namespace came from the config file.
                prefix = override_prefix
            else:
                prefix_raw = scaffolder_cfg.get("project-prefix")  # type: ignore[union-attr]
                prefix = (
                    str(prefix_raw)
                    if prefix_raw is not None
                    else ns.replace("_", "-") + "-"
                )
            return TargetContext(
                namespace=ns,
                project_prefix=prefix,
                new_task_name=new_task_name,
            )

    # Existing-task heuristic.
    skipped: list[str] = []
    tasks_dir = target_dir / "tasks"
    if tasks_dir.is_dir():
        # Sorted so the task that wins is the same on every filesystem.
        for task in sorted(tasks_dir.iterdir()):
            if not task.is_dir():
                continue
            if task.name in {"template", "template_task", "common"}:
                continue
            src_dir = task / "src"
            if not src_dir.is_dir():
                skipped.append(f"{task.name}: missing src/ directory")
                continue
            ns_candidates = [p for p in src_dir.iterdir() if p.is_dir()]
            if not ns_candidates:
                skipped.append(f"{task.name}: src/ has no namespace dirs")
                continue
            if len(ns_candidates) > 1:
                names = ", ".join(p.name for p in ns_candidates)
                skipped.append(
                    f"{task.name}: src/ has multiple namespace dirs ({names})"
                )
                continue
            ns = ns_candidates[0].name
            task_pyproject = task / "pyproject.toml"
            if not task_pyproject.is_file():
                skipped.append(f"{task.name}: missing pyproject.toml")
                continue
            try:
                task_doc = tomlkit.parse(task_pyproject.read_text(encoding="utf-8"))
                task_name = str(task_doc["project"]["name"])  # type: ignore[index]
            except Exception as e:
                # Deliberately broad: an unreadable task is recorded and the
                # search moves on to the next candidate.
                skipped.append(f"{task.name}: could not read pyproject.toml ({e})")
                continue
            task_kebab = task.name.replace("_", "-")
            if task_name.endswith(task_kebab):
                if override_prefix is not None:
                    prefix = override_prefix
                else:
                    prefix = task_name[: -len(task_kebab)]
                return TargetContext(
                    namespace=ns,
                    project_prefix=prefix,
                    new_task_name=new_task_name,
                )
            skipped.append(
                f"{task.name}: project name {task_name!r} doesn't end with {task_kebab!r}"
            )

    msg = (
        "could not determine target namespace; add to "
        f"{target_dir}/pyproject.toml:\n"
        " [tool.task-scaffolder]\n"
        ' namespace = "your_namespace"\n'
        "or pass --namespace on the command line."
    )
    if skipped:
        msg += "\n\n Skipped existing tasks:\n"
        msg += "\n".join(f" {entry}" for entry in skipped)
    sys.exit(msg)
@@ -0,0 +1,16 @@
1
+ [project]
2
+ name = "metr-tasks-template"
3
+ version = "0.1.0"
4
+ description = "Template task - copy and modify to create new tasks"
5
+ requires-python = ">=3.13"
6
+ dependencies = ["inspect-ai>=0.3.0"]
7
+
8
+ [build-system]
9
+ requires = ["hatchling"]
10
+ build-backend = "hatchling.build"
11
+
12
+ [tool.hatch.build.targets.wheel]
13
+ packages = ["src/metr_tasks"]
14
+
15
+ [project.entry-points.inspect_ai]
16
+ metr_tasks = "metr_tasks.template._registry"
@@ -0,0 +1,6 @@
1
+ """Template task - copy and modify to create new tasks."""
2
+
3
+ from metr_tasks.template.task import template
4
+ from metr_tasks.template.version import __version__
5
+
6
+ __all__ = ["template", "__version__"]
@@ -0,0 +1,3 @@
1
+ from metr_tasks.template.task import template
2
+
3
+ __all__ = ["template"]
@@ -0,0 +1,6 @@
1
+ You are a helpful assistant completing a task in a sandboxed environment.
2
+
3
+ Available tools:
4
+ - bash: Execute shell commands to explore files and complete the task
5
+
6
+ Read the task description carefully and complete it step by step.
@@ -0,0 +1,14 @@
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /home/agent
4
+
5
+ RUN apt-get update && apt-get install -y --no-install-recommends \
6
+ curl \
7
+ && rm -rf /var/lib/apt/lists/*
8
+
9
+ RUN useradd -m -s /bin/bash agent && \
10
+ chown -R agent:agent /home/agent
11
+
12
+ USER agent
13
+
14
+ CMD ["tail", "-f", "/dev/null"]
@@ -0,0 +1,12 @@
1
+ services:
2
+ default:
3
+ image: ${DOCKER_IMAGE_REPO:-template}:${SAMPLE_METADATA_TASK_VERSION:-latest}
4
+
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+
9
+ init: true
10
+ working_dir: /home/agent
11
+ network_mode: ${SAMPLE_METADATA_NETWORK_MODE:-bridge}
12
+ command: ["tail", "-f", "/dev/null"]
@@ -0,0 +1,51 @@
1
+ from pathlib import Path
2
+ from typing import Final, Literal
3
+
4
+ from inspect_ai import Task, task
5
+ from inspect_ai.dataset import MemoryDataset, Sample
6
+ from inspect_ai.scorer import match
7
+ from inspect_ai.solver import Solver, basic_agent, system_message
8
+ from inspect_ai.tool import bash
9
+
10
+ from .version import __version__
11
+
12
+ SANDBOX_DIR: Final = Path(__file__).parent / "sandbox"
13
+ INSTRUCTIONS_PATH: Final = Path(__file__).parent / "assets" / "instructions.md"
14
+
15
+
16
@task(name="template")
def template(
    solver: Solver | None = None,
    max_messages: int = 50,
    sandbox_type: Literal["docker", "k8s"] = "docker",
) -> Task:
    """Build the placeholder task that new tasks are copied from.

    Args:
        solver: Solver to run; when falsy, ``default_solver()`` is used.
        max_messages: Forwarded to ``Task(max_messages=...)``.
        sandbox_type: Sandbox provider name paired with this package's
            ``sandbox/compose.yaml``.

    Returns:
        A ``Task`` with a single arithmetic sample scored by ``match()``.
    """
    # TODO: Replace with your dataset
    # The sibling compose.yaml references SAMPLE_METADATA_TASK_VERSION and
    # SAMPLE_METADATA_NETWORK_MODE - presumably derived from these metadata
    # keys; confirm against the Inspect sandbox documentation.
    placeholder_sample = Sample(
        input="What is 2 + 2?",
        target="4",
        metadata={
            "task_version": __version__,
            "network_mode": "bridge",
        },
    )
    dataset = MemoryDataset(samples=[placeholder_sample])

    chosen_solver = solver or default_solver()
    scorer = match()
    compose_file = str(SANDBOX_DIR / "compose.yaml")

    return Task(
        dataset=dataset,
        solver=chosen_solver,
        scorer=scorer,
        max_messages=max_messages,
        sandbox=(sandbox_type, compose_file),
        version=__version__,
    )
44
+
45
+
46
def default_solver() -> Solver:
    """Return the default agent for this task.

    The agent is a ``basic_agent`` whose initial step is a system message
    holding the contents of ``INSTRUCTIONS_PATH`` and whose only tool is
    ``bash`` with a 120 second timeout.
    """
    # Decode the packaged Markdown explicitly as UTF-8 so the prompt does not
    # depend on the host's locale encoding.
    instructions = INSTRUCTIONS_PATH.read_text(encoding="utf-8")
    return basic_agent(
        init=system_message(instructions),
        tools=[bash(timeout=120)],
    )
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,31 @@
1
+ """Runtime helpers for Inspect AI tasks."""
2
+
3
+ from inspect_eval_utils.common.sandbox_files import (
4
+ expand_template,
5
+ get_sandbox_files,
6
+ load_text_file,
7
+ )
8
+ from inspect_eval_utils.common.task_secrets import (
9
+ DEFAULT_ARN_PREFIX_ENV_VAR,
10
+ InvalidTaskSecretPrefixError,
11
+ MissingTaskSecretPrefixError,
12
+ TaskSecretBinaryError,
13
+ TaskSecretError,
14
+ TaskSecretMissingStringError,
15
+ get_task_secret,
16
+ get_task_secret_from_aws,
17
+ )
18
+
19
+ __all__ = [
20
+ "expand_template",
21
+ "get_sandbox_files",
22
+ "DEFAULT_ARN_PREFIX_ENV_VAR",
23
+ "InvalidTaskSecretPrefixError",
24
+ "MissingTaskSecretPrefixError",
25
+ "TaskSecretBinaryError",
26
+ "TaskSecretError",
27
+ "TaskSecretMissingStringError",
28
+ "get_task_secret",
29
+ "get_task_secret_from_aws",
30
+ "load_text_file",
31
+ ]
@@ -0,0 +1,153 @@
1
+ """Utility functions for managing sandbox asset files."""
2
+
3
+ import fnmatch
4
+ from pathlib import Path
5
+
6
+ from jinja2 import Environment, StrictUndefined, UndefinedError
7
+
8
+ _TEMPLATE_SUFFIX = ".jinja2"
9
+
10
+
11
def expand_template(
    content: str,
    template_vars: dict[str, object],
    source_path: Path | None = None,
) -> str:
    """Render a Jinja2 template string with strict variable checking.

    Args:
        content: Template text containing ``{{ VAR }}`` placeholders.
        template_vars: Values made available to the template.
        source_path: Where the template came from; only used to make the
            error message more specific.

    Returns:
        The rendered text.

    Raises:
        ValueError: If the template references a variable that is not in
            ``template_vars`` (the Jinja2 ``UndefinedError`` is chained).
    """
    # Compiling happens outside the try: only undefined-variable errors
    # raised while rendering are translated to ValueError.
    compiled = Environment(undefined=StrictUndefined).from_string(content)
    try:
        rendered = compiled.render(template_vars)
    except UndefinedError as err:
        where = "" if not source_path else f" in {source_path}"
        raise ValueError(f"Missing template variable{where}: {err}") from err
    return rendered
33
+
34
+
35
def load_text_file(
    path: Path,
    template_vars: dict[str, object] | None = None,
) -> str:
    """Load a text file, optionally expanding Jinja2 templates.

    Transparently handles template files: if `path` doesn't exist but
    `path.jinja2` does, loads and expands the template. Raises an error
    if both exist to avoid ambiguity. A plain file is also expanded when
    `template_vars` is provided. Files are always decoded as UTF-8.

    Args:
        path: Path to the file to load (without .jinja2 suffix)
        template_vars: If provided, expand {{ VAR }} with these variables

    Returns:
        File contents, optionally with templates expanded

    Raises:
        FileNotFoundError: If neither the file nor its .jinja2 variant exists
        ValueError: If both file and .jinja2 variant exist, if only the
            .jinja2 variant exists but template_vars is None, or if
            template_vars is provided and referenced variables are missing
    """
    template_path = path.parent / (path.name + _TEMPLATE_SUFFIX)
    plain_exists = path.exists()
    template_exists = template_path.exists()

    if plain_exists and template_exists:
        raise ValueError(f"Both {path} and {template_path} exist; remove one")

    if template_exists:
        if template_vars is None:
            raise ValueError(
                f"Template file {template_path} found but no template_vars provided"
            )
        # Explicit UTF-8: matches how get_sandbox_files reads assets and
        # avoids locale-dependent decoding.
        content = template_path.read_text(encoding="utf-8")
        return expand_template(content, template_vars, template_path)

    if plain_exists:
        content = path.read_text(encoding="utf-8")
        if template_vars is not None:
            return expand_template(content, template_vars, path)
        return content

    raise FileNotFoundError(f"File not found: {path} (also checked {template_path})")
79
+
80
+
81
+ _DEFAULT_CONTAINER_DEST = Path("/home/agent")
82
+
83
+
84
def get_sandbox_files(
    task_dir: Path,
    target_sandbox: str = "default",
    container_dest: Path | None = None,
    assets_subdir: str = "assets/agent",
    exclude: list[str] | None = None,
    template_vars: dict[str, object] | None = None,
) -> dict[str, str]:
    """
    Create a Sample.files dictionary from a task's assets folder.

    Every regular file below the assets folder is read as UTF-8 text and
    keyed by ``"<target_sandbox>:<container path>"``.

    Args:
        task_dir: The task's directory (usually Path(__file__).parent)
        target_sandbox: The target sandbox environment (e.g., "default", "game")
        container_dest: Destination path in the container (default: /home/agent)
        assets_subdir: Subdirectory within task_dir containing assets
        exclude: List of glob patterns to exclude (e.g., ["*.dvc", "docs/wiki/*"]),
            matched with fnmatch against the path relative to the assets folder
        template_vars: If provided, .jinja2 files are expanded with these
            variables and stored under their name without the .jinja2
            suffix. If omitted, .jinja2 files are skipped entirely.

    Returns:
        Dictionary mapping "<sandbox>:<container path>" keys to the text
        content of each file (expanded content for templates)

    Raises:
        FileNotFoundError: If the assets folder doesn't exist
        ValueError: If template_vars is provided and any referenced variables
            are missing, or if two source files (e.g. ``x`` and ``x.jinja2``)
            would land on the same container path
    """
    if container_dest is None:
        container_dest = _DEFAULT_CONTAINER_DEST
    patterns = exclude if exclude is not None else []
    assets_path = task_dir / assets_subdir
    if not assets_path.exists():
        raise FileNotFoundError(f"Assets folder not found: {assets_path}")

    files: dict[str, str] = {}
    # Remembers which source produced each key so collisions can be reported.
    sources: dict[str, Path] = {}
    for file_path in assets_path.rglob("*"):
        if not file_path.is_file():
            continue
        relative_to_assets = file_path.relative_to(assets_path)
        rel_str = str(relative_to_assets)
        if any(fnmatch.fnmatch(rel_str, pattern) for pattern in patterns):
            continue

        if file_path.name.endswith(_TEMPLATE_SUFFIX):
            if template_vars is None:
                # Skip .jinja2 files if no template_vars provided
                continue
            # Remove .jinja2 suffix for container path
            output_name = file_path.name[: -len(_TEMPLATE_SUFFIX)]
            container_relative = relative_to_assets.parent / output_name
            # Same explicit UTF-8 decoding as for plain files below.
            content = expand_template(
                file_path.read_text(encoding="utf-8"), template_vars, file_path
            )
        else:
            container_relative = relative_to_assets
            content = file_path.read_text(encoding="utf-8")

        # Container paths use forward slashes whatever the host platform is.
        container_path = (container_dest / container_relative).as_posix()
        key = f"{target_sandbox}:{container_path}"
        if key in sources:
            # rglob order is filesystem-dependent, so silently letting the
            # later file win would make the result nondeterministic.
            raise ValueError(
                f"Both {sources[key]} and {file_path} map to {key}; remove one"
            )
        sources[key] = file_path
        files[key] = content

    return files