PyPI - inspect-eval-utils - Versions diffs - 0.4.0__py3-none-any.whl - Mend

inspect-eval-utils 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

inspect_eval_utils/__init__.py +1 -0
inspect_eval_utils/_cli.py +86 -0
inspect_eval_utils/_detect.py +156 -0
inspect_eval_utils/_templates/default/pyproject.toml +16 -0
inspect_eval_utils/_templates/default/src/metr_tasks/template/__init__.py +6 -0
inspect_eval_utils/_templates/default/src/metr_tasks/template/_registry.py +3 -0
inspect_eval_utils/_templates/default/src/metr_tasks/template/assets/instructions.md +6 -0
inspect_eval_utils/_templates/default/src/metr_tasks/template/py.typed +0 -0
inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/Dockerfile +14 -0
inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/compose.yaml +12 -0
inspect_eval_utils/_templates/default/src/metr_tasks/template/task.py +51 -0
inspect_eval_utils/_templates/default/src/metr_tasks/template/version.py +1 -0
inspect_eval_utils/common/__init__.py +31 -0
inspect_eval_utils/common/sandbox_files.py +153 -0
inspect_eval_utils/common/task_secrets.py +154 -0
inspect_eval_utils/py.typed +0 -0
inspect_eval_utils/report/__init__.py +25 -0
inspect_eval_utils/report/assets/InstrumentSans.ttf +0 -0
inspect_eval_utils/report/assets/OFL.txt +93 -0
inspect_eval_utils/report/cost.py +23 -0
inspect_eval_utils/report/events.py +62 -0
inspect_eval_utils/report/html.py +86 -0
inspect_eval_utils/report/plot.py +219 -0
inspect_eval_utils/report/writer.py +68 -0
inspect_eval_utils/scaffolder.py +509 -0
inspect_eval_utils/setting/__init__.py +23 -0
inspect_eval_utils/setting/_context.py +50 -0
inspect_eval_utils/setting/_types.py +104 -0
inspect_eval_utils/setting/_utils.py +64 -0
inspect_eval_utils/tool_cli/__init__.py +19 -0
inspect_eval_utils/tool_cli/_mechanism.py +715 -0
inspect_eval_utils/tool_cli/_setting.py +55 -0
inspect_eval_utils-0.4.0.dist-info/METADATA +521 -0
inspect_eval_utils-0.4.0.dist-info/RECORD +37 -0
inspect_eval_utils-0.4.0.dist-info/WHEEL +4 -0
inspect_eval_utils-0.4.0.dist-info/entry_points.txt +2 -0
inspect_eval_utils-0.4.0.dist-info/licenses/LICENSE +21 -0

inspect_eval_utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """inspect-eval-utils: shared utilities for METR Inspect AI eval repos."""

inspect_eval_utils/_cli.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""CLI entry point for the new_task scaffolder."""
+from __future__ import annotations
+import argparse
+import sys
+from pathlib import Path
+from inspect_eval_utils._detect import detect_target_context, detect_template_context
+from inspect_eval_utils.scaffolder import (
+    canonical_template_path,
+    normalize_name,
+    scaffold_into,
+)
+def _resolve_template(target_dir: Path, override: Path | None) -> Path:
+    if override is not None:
+        return override
+    local = target_dir / "tasks" / "template"
+    if local.is_dir():
+        return local
+    return canonical_template_path()
+def main(argv: list[str] | None = None) -> None:
+    parser = argparse.ArgumentParser(
+        prog="new_task",
+        description="Scaffold a new Inspect AI task.",
+    )
+    parser.add_argument("name", help="Task name (snake_case or kebab-case)")
+    parser.add_argument(
+        "--target", type=Path, default=Path.cwd(),
+        help="Target repo (default: current directory)",
+    )
+    parser.add_argument(
+        "--template", type=Path, default=None,
+        help="Custom template directory (default: <target>/tasks/template/, else canonical)",
+    )
+    parser.add_argument("--namespace", default=None, help="Override target's Python namespace")
+    parser.add_argument("--project-prefix", default=None, help="Override target's project name prefix")
+    parser.add_argument("--description", default="TODO: describe this eval")
+    parser.add_argument(
+        "--force", action="store_true",
+        help="Overwrite an existing tasks/<name>/",
+    )
+    args = parser.parse_args(argv)
+    target_dir = args.target.resolve()
+    if not target_dir.is_dir():
+        sys.exit(f"target is not a directory: {target_dir}")
+    if not (target_dir / "pyproject.toml").is_file():
+        sys.exit(f"target has no pyproject.toml: {target_dir}")
+    snake, _kebab = normalize_name(args.name)
+    template_dir = _resolve_template(target_dir, args.template)
+    source = detect_template_context(template_dir)
+    if snake == source.template_name:
+        sys.exit(
+            f"task name {snake!r} matches the template name; choose a different name"
+        )
+    target = detect_target_context(
+        target_dir,
+        new_task_name=snake,
+        override_namespace=args.namespace,
+        override_prefix=args.project_prefix,
+    )
+    scaffold_into(
+        template_dir=template_dir,
+        target_dir=target_dir,
+        source=source,
+        target=target,
+        description=args.description,
+        force=args.force,
+    )
+    print(f"Created tasks/{snake}/ in {target_dir}.")
+    print("Next steps:")
+    print(f"  cd {target_dir}")
+    print("  uv sync --group tasks")
+    print(f"  uv run inspect eval {snake} --model mockllm/replay --limit 1")
+# Run the CLI when this module is executed directly as a script.
+if __name__ == "__main__":
+    main()

inspect_eval_utils/_detect.py ADDED Viewed

@@ -0,0 +1,156 @@
+"""Detect TemplateContext / TargetContext from on-disk repos."""
+from __future__ import annotations
+import sys
+from pathlib import Path
+import tomlkit
+import tomlkit.exceptions
+from inspect_eval_utils.scaffolder import TargetContext, TemplateContext
+def detect_template_context(template_dir: Path) -> TemplateContext:
+    """Infer (namespace, prefix, template_name) from the template directory."""
+    src_dir = template_dir / "src"
+    if not src_dir.is_dir():
+        sys.exit(f"template missing src/ directory: {template_dir}")
+    # Find unique src/<NAMESPACE>/<TEMPLATE>/.
+    namespace_dirs = [p for p in src_dir.iterdir() if p.is_dir()]
+    if len(namespace_dirs) != 1:
+        sys.exit(
+            f"expected exactly one namespace directory under {src_dir}, "
+            f"found {len(namespace_dirs)}: {[p.name for p in namespace_dirs]}"
+        )
+    namespace = namespace_dirs[0].name
+    template_dirs = [p for p in namespace_dirs[0].iterdir() if p.is_dir()]
+    if len(template_dirs) != 1:
+        sys.exit(
+            f"expected exactly one template directory under {namespace_dirs[0]}, "
+            f"found {len(template_dirs)}"
+        )
+    template_name = template_dirs[0].name
+    pyproject = template_dir / "pyproject.toml"
+    if not pyproject.is_file():
+        sys.exit(f"template missing pyproject.toml: {pyproject}")
+    doc = tomlkit.parse(pyproject.read_text())
+    name = str(doc["project"]["name"])  # type: ignore[index]
+    template_name_kebab = template_name.replace("_", "-")
+    if not name.endswith(template_name_kebab):
+        sys.exit(
+            f"template's project.name {name!r} doesn't end with "
+            f"the kebab template name {template_name_kebab!r}"
+        )
+    project_prefix = name[: -len(template_name_kebab)]
+    return TemplateContext(
+        namespace=namespace,
+        project_prefix=project_prefix,
+        template_name=template_name,
+    )
+def detect_target_context(
+    target_dir: Path,
+    *,
+    new_task_name: str,
+    override_namespace: str | None = None,
+    override_prefix: str | None = None,
+) -> TargetContext:
+    """Resolve target's namespace and project prefix.
+    Order: explicit overrides -> [tool.task-scaffolder] config -> existing task.
+    """
+    if override_namespace is not None:
+        prefix = override_prefix
+        if prefix is None:
+            prefix = override_namespace.replace("_", "-") + "-"
+        return TargetContext(
+            namespace=override_namespace,
+            project_prefix=prefix,
+            new_task_name=new_task_name,
+        )
+    pyproject = target_dir / "pyproject.toml"
+    if pyproject.is_file():
+        doc = tomlkit.parse(pyproject.read_text())
+        scaffolder_cfg = doc.get("tool", {}).get("task-scaffolder")  # type: ignore[union-attr]
+        if scaffolder_cfg is not None:
+            try:
+                ns = str(scaffolder_cfg["namespace"])  # type: ignore[index]
+            except tomlkit.exceptions.NonExistentKey:
+                sys.exit(
+                    f"[tool.task-scaffolder] in {pyproject} is missing required "
+                    "key 'namespace'.\nExpected:\n"
+                    "  [tool.task-scaffolder]\n"
+                    '  namespace = "your_namespace"'
+                )
+            prefix_raw = scaffolder_cfg.get("project-prefix")  # type: ignore[union-attr]
+            prefix = str(prefix_raw) if prefix_raw is not None else ns.replace("_", "-") + "-"
+            return TargetContext(
+                namespace=ns,
+                project_prefix=prefix,
+                new_task_name=new_task_name,
+            )
+    # Existing-task heuristic.
+    skipped: list[str] = []
+    tasks_dir = target_dir / "tasks"
+    if tasks_dir.is_dir():
+        for task in sorted(tasks_dir.iterdir()):
+            if not task.is_dir():
+                continue
+            if task.name in {"template", "template_task", "common"}:
+                continue
+            src_dir = task / "src"
+            if not src_dir.is_dir():
+                skipped.append(f"{task.name}: missing src/ directory")
+                continue
+            ns_candidates = [p for p in src_dir.iterdir() if p.is_dir()]
+            if len(ns_candidates) == 0:
+                skipped.append(f"{task.name}: src/ has no namespace dirs")
+                continue
+            if len(ns_candidates) > 1:
+                names = ", ".join(p.name for p in ns_candidates)
+                skipped.append(
+                    f"{task.name}: src/ has multiple namespace dirs ({names})"
+                )
+                continue
+            ns = ns_candidates[0].name
+            task_pyproject = task / "pyproject.toml"
+            if not task_pyproject.is_file():
+                skipped.append(f"{task.name}: missing pyproject.toml")
+                continue
+            try:
+                task_doc = tomlkit.parse(task_pyproject.read_text())
+                task_name = str(task_doc["project"]["name"])  # type: ignore[index]
+            except Exception as e:
+                skipped.append(f"{task.name}: could not read pyproject.toml ({e})")
+                continue
+            task_kebab = task.name.replace("_", "-")
+            if task_name.endswith(task_kebab):
+                prefix = task_name[: -len(task_kebab)]
+                return TargetContext(
+                    namespace=ns,
+                    project_prefix=prefix,
+                    new_task_name=new_task_name,
+                )
+            skipped.append(
+                f"{task.name}: project name {task_name!r} doesn't end with {task_kebab!r}"
+            )
+    msg = (
+        "could not determine target namespace; add to "
+        f"{target_dir}/pyproject.toml:\n"
+        "  [tool.task-scaffolder]\n"
+        '  namespace = "your_namespace"\n'
+        "or pass --namespace on the command line."
+    )
+    if skipped:
+        msg += "\n\n  Skipped existing tasks:\n"
+        msg += "\n".join(f"    {entry}" for entry in skipped)
+    sys.exit(msg)

inspect_eval_utils/_templates/default/pyproject.toml ADDED Viewed

@@ -0,0 +1,16 @@
+[project]
+# detect_template_context() requires this name to end with the kebab-case
+# template directory name ("template"); the part before it ("metr-tasks-")
+# is taken as the project prefix.
+name = "metr-tasks-template"
+version = "0.1.0"
+description = "Template task - copy and modify to create new tasks"
+requires-python = ">=3.13"
+dependencies = ["inspect-ai>=0.3.0"]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+# Package the namespace directory that holds the task package.
+packages = ["src/metr_tasks"]
+# Entry point in the "inspect_ai" group pointing at the module that imports the task.
+[project.entry-points.inspect_ai]
+metr_tasks = "metr_tasks.template._registry"

inspect_eval_utils/_templates/default/src/metr_tasks/template/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Template task - copy and modify to create new tasks."""
+from metr_tasks.template.task import template
+from metr_tasks.template.version import __version__
+__all__ = ["template", "__version__"]

inspect_eval_utils/_templates/default/src/metr_tasks/template/_registry.py ADDED Viewed

@@ -0,0 +1,3 @@
+from metr_tasks.template.task import template
+__all__ = ["template"]

inspect_eval_utils/_templates/default/src/metr_tasks/template/assets/instructions.md ADDED Viewed

@@ -0,0 +1,6 @@
+You are a helpful assistant completing a task in a sandboxed environment.
+Available tools:
+- bash: Execute shell commands to explore files and complete the task
+Read the task description carefully and complete it step by step.

inspect_eval_utils/_templates/default/src/metr_tasks/template/py.typed ADDED Viewed

File without changes

inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/Dockerfile ADDED Viewed

@@ -0,0 +1,14 @@
+# Sandbox image for the template task.
+FROM python:3.11-slim
+WORKDIR /home/agent
+# Install curl, then remove the apt package lists in the same layer.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+# Create the "agent" user and hand it ownership of /home/agent, which
+# WORKDIR above already created.
+RUN useradd -m -s /bin/bash agent && \
+    chown -R agent:agent /home/agent
+USER agent
+# Keep the container running until it is stopped.
+CMD ["tail", "-f", "/dev/null"]

inspect_eval_utils/_templates/default/src/metr_tasks/template/sandbox/compose.yaml ADDED Viewed

@@ -0,0 +1,12 @@
+services:
+  default:
+    # Image repo and tag come from the environment, falling back to "template:latest".
+    # NOTE(review): the SAMPLE_METADATA_* names mirror the sample metadata keys set
+    # in task.py (task_version, network_mode); presumably Inspect exports them - verify.
+    image: ${DOCKER_IMAGE_REPO:-template}:${SAMPLE_METADATA_TASK_VERSION:-latest}
+    build:
+      context: .
+      dockerfile: Dockerfile
+    init: true
+    working_dir: /home/agent
+    # Falls back to "bridge" when the variable is unset.
+    network_mode: ${SAMPLE_METADATA_NETWORK_MODE:-bridge}
+    # Same keep-alive command as the Dockerfile's CMD.
+    command: ["tail", "-f", "/dev/null"]

inspect_eval_utils/_templates/default/src/metr_tasks/template/task.py ADDED Viewed

@@ -0,0 +1,51 @@
+from pathlib import Path
+from typing import Final, Literal
+from inspect_ai import Task, task
+from inspect_ai.dataset import MemoryDataset, Sample
+from inspect_ai.scorer import match
+from inspect_ai.solver import Solver, basic_agent, system_message
+from inspect_ai.tool import bash
+from .version import __version__
+SANDBOX_DIR: Final = Path(__file__).parent / "sandbox"
+INSTRUCTIONS_PATH: Final = Path(__file__).parent / "assets" / "instructions.md"
+@task(name="template")
+def template(
+    solver: Solver | None = None,
+    max_messages: int = 50,
+    sandbox_type: Literal["docker", "k8s"] = "docker",
+) -> Task:
+    # TODO: Replace with your dataset
+    dataset = MemoryDataset(
+        samples=[
+            Sample(
+                input="What is 2 + 2?",
+                target="4",
+                metadata={
+                    "task_version": __version__,
+                    "network_mode": "bridge",
+                },
+            ),
+        ]
+    )
+    return Task(
+        dataset=dataset,
+        solver=solver or default_solver(),
+        scorer=match(),
+        max_messages=max_messages,
+        sandbox=(sandbox_type, str(SANDBOX_DIR / "compose.yaml")),
+        version=__version__,
+    )
+def default_solver() -> Solver:
+    instructions = INSTRUCTIONS_PATH.read_text()
+    return basic_agent(
+        init=system_message(instructions),
+        tools=[bash(timeout=120)],
+    )

inspect_eval_utils/_templates/default/src/metr_tasks/template/version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.1.0"

inspect_eval_utils/common/__init__.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Runtime helpers for Inspect AI tasks."""
+from inspect_eval_utils.common.sandbox_files import (
+    expand_template,
+    get_sandbox_files,
+    load_text_file,
+)
+from inspect_eval_utils.common.task_secrets import (
+    DEFAULT_ARN_PREFIX_ENV_VAR,
+    InvalidTaskSecretPrefixError,
+    MissingTaskSecretPrefixError,
+    TaskSecretBinaryError,
+    TaskSecretError,
+    TaskSecretMissingStringError,
+    get_task_secret,
+    get_task_secret_from_aws,
+)
+__all__ = [
+    "expand_template",
+    "get_sandbox_files",
+    "DEFAULT_ARN_PREFIX_ENV_VAR",
+    "InvalidTaskSecretPrefixError",
+    "MissingTaskSecretPrefixError",
+    "TaskSecretBinaryError",
+    "TaskSecretError",
+    "TaskSecretMissingStringError",
+    "get_task_secret",
+    "get_task_secret_from_aws",
+    "load_text_file",
+]

inspect_eval_utils/common/sandbox_files.py ADDED Viewed

@@ -0,0 +1,153 @@
+"""Utility functions for managing sandbox asset files."""
+import fnmatch
+from pathlib import Path
+from jinja2 import Environment, StrictUndefined, UndefinedError
+# File-name suffix that marks a Jinja2 template; it is stripped from the output name.
+_TEMPLATE_SUFFIX = ".jinja2"
+def expand_template(
+    content: str,
+    template_vars: dict[str, object],
+    source_path: Path | None = None,
+) -> str:
+    """Expand Jinja2 template with provided variables.
+    Args:
+        content: Template string with {{ VAR }} placeholders
+        template_vars: Dictionary of variables to use for template expansion
+        source_path: Optional path for error messages
+    Raises:
+        ValueError: If any referenced template variables are missing
+    """
+    env = Environment(undefined=StrictUndefined)
+    template = env.from_string(content)
+    try:
+        return template.render(template_vars)
+    except UndefinedError as e:
+        location = f" in {source_path}" if source_path else ""
+        raise ValueError(f"Missing template variable{location}: {e}") from e
+def load_text_file(
+    path: Path,
+    template_vars: dict[str, object] | None = None,
+) -> str:
+    """Load a text file, optionally expanding Jinja2 templates.
+    Transparently handles template files: if `path` doesn't exist but
+    `path.jinja2` does, loads and expands the template. Raises an error
+    if both exist to avoid ambiguity.
+    Args:
+        path: Path to the file to load (without .jinja2 suffix)
+        template_vars: If provided, expand {{ VAR }} with these variables
+    Returns:
+        File contents, optionally with templates expanded
+    Raises:
+        FileNotFoundError: If neither the file nor its .jinja2 variant exists
+        ValueError: If both file and .jinja2 variant exist, or if
+            template_vars is provided and referenced variables are missing
+    """
+    template_path = path.parent / (path.name + _TEMPLATE_SUFFIX)
+    plain_exists = path.exists()
+    template_exists = template_path.exists()
+    if plain_exists and template_exists:
+        raise ValueError(f"Both {path} and {template_path} exist; remove one")
+    if template_exists:
+        if template_vars is None:
+            raise ValueError(
+                f"Template file {template_path} found but no template_vars provided"
+            )
+        content = template_path.read_text()
+        return expand_template(content, template_vars, template_path)
+    if plain_exists:
+        content = path.read_text()
+        if template_vars is not None:
+            return expand_template(content, template_vars, path)
+        return content
+    raise FileNotFoundError(f"File not found: {path} (also checked {template_path})")
+# Container directory used by get_sandbox_files when container_dest is not given.
+_DEFAULT_CONTAINER_DEST = Path("/home/agent")
+def get_sandbox_files(
+    task_dir: Path,
+    target_sandbox: str = "default",
+    container_dest: Path | None = None,
+    assets_subdir: str = "assets/agent",
+    exclude: list[str] | None = None,
+    template_vars: dict[str, object] | None = None,
+) -> dict[str, str]:
+    """
+    Create a Sample.files dictionary from a task's assets folder.
+    Args:
+        task_dir: The task's directory (usually Path(__file__).parent)
+        target_sandbox: The target sandbox environment (e.g., "default", "game")
+        container_dest: Destination path in the container (default: /home/agent)
+        assets_subdir: Subdirectory within task_dir containing assets
+        exclude: List of glob patterns to exclude (e.g., ["*.dvc", "docs/wiki/*"])
+        template_vars: If provided, process .jinja2 files by expanding {{ VAR }}
+            patterns with these variables and write to temp files
+    Returns:
+        Dictionary mapping container paths to absolute source paths for each file
+    Raises:
+        FileNotFoundError: If the assets folder doesn't exist
+        ValueError: If template_vars is provided and any referenced variables are missing
+    """
+    if container_dest is None:
+        container_dest = _DEFAULT_CONTAINER_DEST
+    if exclude is None:
+        exclude = []
+    assets_path = task_dir / assets_subdir
+    if not assets_path.exists():
+        raise FileNotFoundError(f"Assets folder not found: {assets_path}")
+    def is_excluded(rel_path: Path) -> bool:
+        """Check if a path matches any exclude pattern."""
+        rel_str = str(rel_path)
+        return any(fnmatch.fnmatch(rel_str, pattern) for pattern in exclude)
+    files: dict[str, str] = {}
+    for file_path in assets_path.rglob("*"):
+        if file_path.is_file():
+            relative_to_assets = file_path.relative_to(assets_path)
+            if is_excluded(relative_to_assets):
+                continue
+            # Handle template files
+            if file_path.name.endswith(_TEMPLATE_SUFFIX):
+                if template_vars is not None:
+                    # Remove .jinja2 suffix for container path
+                    output_name = file_path.name[: -len(_TEMPLATE_SUFFIX)]
+                    container_relative = relative_to_assets.parent / output_name
+                    container_path = container_dest / container_relative
+                    # Expand template
+                    content = file_path.read_text()
+                    expanded = expand_template(content, template_vars, file_path)
+                    files[f"{target_sandbox}:{container_path}"] = expanded
+                # Skip .jinja2 files if no template_vars provided
+                continue
+            # Normal file handling
+            container_path = container_dest / relative_to_assets
+            files[f"{target_sandbox}:{container_path}"] = file_path.read_text(
+                encoding="utf-8"
+            )
+    return files