inspect-ai 0.3.53__py3-none-any.whl → 0.3.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +26 -1
- inspect_ai/_cli/main.py +2 -0
- inspect_ai/_cli/trace.py +244 -0
- inspect_ai/_display/textual/app.py +5 -1
- inspect_ai/_display/textual/widgets/tasks.py +13 -3
- inspect_ai/_eval/eval.py +17 -0
- inspect_ai/_eval/task/images.py +4 -14
- inspect_ai/_eval/task/log.py +2 -1
- inspect_ai/_eval/task/run.py +26 -10
- inspect_ai/_util/constants.py +3 -3
- inspect_ai/_util/display.py +1 -0
- inspect_ai/_util/logger.py +34 -8
- inspect_ai/_util/trace.py +275 -0
- inspect_ai/log/_log.py +3 -0
- inspect_ai/log/_message.py +2 -2
- inspect_ai/log/_recorders/eval.py +6 -17
- inspect_ai/log/_recorders/json.py +19 -17
- inspect_ai/model/_cache.py +22 -16
- inspect_ai/model/_call_tools.py +9 -1
- inspect_ai/model/_generate_config.py +2 -2
- inspect_ai/model/_model.py +11 -12
- inspect_ai/model/_providers/bedrock.py +1 -1
- inspect_ai/model/_providers/openai.py +11 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
- inspect_ai/util/_sandbox/context.py +6 -1
- inspect_ai/util/_sandbox/docker/compose.py +58 -19
- inspect_ai/util/_sandbox/docker/docker.py +11 -11
- inspect_ai/util/_sandbox/docker/util.py +0 -6
- inspect_ai/util/_sandbox/service.py +17 -7
- inspect_ai/util/_subprocess.py +6 -1
- inspect_ai/util/_subtask.py +8 -2
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/METADATA +7 -7
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/RECORD +37 -35
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/top_level.txt +0 -0
@@ -58,7 +58,7 @@ class GenerateConfigArgs(TypedDict, total=False):
     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, and TogetherAI only."""
 
     logprobs: bool | None
-    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, and
+    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""
 
     top_logprobs: int | None
    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Google, Grok, and Huggingface only."""
@@ -128,7 +128,7 @@ class GenerateConfig(BaseModel):
     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, and vLLM only."""
 
     logprobs: bool | None = Field(default=None)
-    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, and vLLM only."""
+    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""
 
     top_logprobs: int | None = Field(default=None)
     """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Google, Grok, Huggingface, and vLLM only."""
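Both docstring updates advertise `logprobs` support for llama-cpp-python alongside the existing providers. A minimal sketch of requesting logprobs through `GenerateConfig` (the model name and `get_model` usage here are illustrative, not part of this diff):

```python
from inspect_ai.model import GenerateConfig, get_model

async def sample_with_logprobs() -> None:
    # ask for per-token logprobs plus the top 5 alternatives at each position
    config = GenerateConfig(logprobs=True, top_logprobs=5)
    model = get_model("openai/gpt-4o-mini")  # illustrative model name
    output = await model.generate("Say hello.", config=config)
    print(output.completion)
```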
inspect_ai/model/_model.py CHANGED
@@ -9,7 +9,6 @@ from contextvars import ContextVar
 from copy import deepcopy
 from typing import Any, Callable, Literal, Type, cast
 
-from shortuuid import uuid
 from tenacity import (
     retry,
     retry_if_exception,
@@ -30,6 +29,7 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
 )
 from inspect_ai._util.retry import log_rate_limit_retry
+from inspect_ai._util.trace import trace_action
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
@@ -363,17 +363,16 @@ class Model:
                 cache="write" if cache else None,
             )
 
-
-
-
-
-
-
-
-
-
-
-            logger.debug(f"model generate {generate_id} (completed)")
+            with trace_action(logger, "Model", f"generate ({str(self)})"):
+                time_start = time.perf_counter()
+                result = await self.api.generate(
+                    input=input,
+                    tools=tools,
+                    tool_choice=tool_choice,
+                    config=config,
+                )
+                time_elapsed = time.perf_counter() - time_start
+
 
             if isinstance(result, tuple):
                 output, call = result
             else:
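The generate call is now wrapped in `trace_action`, the timed-action helper added in `inspect_ai/_util/trace.py` and surfaced by the new `inspect trace` CLI (`inspect_ai/_cli/trace.py`). A hedged sketch of the pattern with an illustrative action of our own:

```python
from logging import getLogger

from inspect_ai._util.trace import trace_action

logger = getLogger(__name__)

async def fetch_report(url: str) -> str:
    # entry, exit, and any exception in this block are recorded as a traced action
    with trace_action(logger, "HTTP", f"GET {url}"):
        ...  # illustrative: perform the request here
        return "ok"
```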
inspect_ai/model/_providers/bedrock.py CHANGED
@@ -312,7 +312,7 @@ class BedrockAPI(ModelAPI):
         from botocore.exceptions import ClientError
 
         # The bedrock client
-        async with self.session.client(
+        async with self.session.client(  # type: ignore[call-overload]
             service_name="bedrock-runtime",
             endpoint_url=self.base_url,
             config=Config(
inspect_ai/model/_providers/openai.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import os
+from logging import getLogger
 from typing import Any
 
 from openai import (
@@ -36,6 +37,7 @@ from inspect_ai._util.constants import DEFAULT_MAX_RETRIES
 from inspect_ai._util.content import Content
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.images import image_as_data_uri
+from inspect_ai._util.logger import warn_once
 from inspect_ai._util.url import is_data_uri, is_http_url
 from inspect_ai.tool import ToolCall, ToolChoice, ToolFunction, ToolInfo
 
@@ -58,6 +60,8 @@ from .util import (
     parse_tool_call,
 )
 
+logger = getLogger(__name__)
+
 OPENAI_API_KEY = "OPENAI_API_KEY"
 AZURE_OPENAI_API_KEY = "AZURE_OPENAI_API_KEY"
 AZUREAI_OPENAI_API_KEY = "AZUREAI_OPENAI_API_KEY"
@@ -270,7 +274,13 @@ class OpenAIAPI(ModelAPI):
         if config.seed is not None:
             params["seed"] = config.seed
         if config.temperature is not None:
-
+            if self.is_o1():
+                warn_once(
+                    logger,
+                    "o1 models do not support the 'temperature' parameter (temperature is always 1).",
+                )
+            else:
+                params["temperature"] = config.temperature
         # TogetherAPI requires temperature w/ num_choices
         elif config.num_choices is not None:
             params["temperature"] = 1
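With this change, passing `temperature` to an o1 model logs a one-time warning instead of forwarding the unsupported parameter. A hedged usage sketch (the model name is illustrative):

```python
from inspect_ai.model import GenerateConfig, get_model

async def o1_generate() -> None:
    # temperature is ignored for o1 models (warned once) rather than sent to the API
    model = get_model("openai/o1")  # illustrative model name
    output = await model.generate(
        "Summarize the change.",
        config=GenerateConfig(temperature=0.2),
    )
    print(output.completion)
```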
inspect_ai/tool/_tools/_web_browser/_web_browser.py CHANGED
@@ -362,7 +362,7 @@ async def web_browser_cmd(cmd: str, *args: str) -> str:
     else:
         arg_list = ["python3", WEB_CLIENT_REQUEST, cmd] + list(args)
 
-    result = await sandbox_env.exec(arg_list)
+    result = await sandbox_env.exec(arg_list, timeout=180)
     if not result.success:
         raise RuntimeError(
             f"Error executing web browser command {cmd}({', '.join(args)}): {result.stderr}"
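The web browser tool now bounds each sandboxed client request at 180 seconds. For context, `SandboxEnvironment.exec` raises `TimeoutError` when a timeout expires; a hedged sketch of handling that from tool code (the command is illustrative):

```python
from inspect_ai.util import sandbox

async def run_probe() -> str:
    try:
        # illustrative command; exec raises TimeoutError if the limit is exceeded
        result = await sandbox().exec(["python3", "probe.py"], timeout=180)
    except TimeoutError:
        raise RuntimeError("Timed out running probe.py in the sandbox")
    if not result.success:
        raise RuntimeError(f"probe.py failed: {result.stderr}")
    return result.stdout
```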
inspect_ai/util/_sandbox/context.py CHANGED
@@ -191,7 +191,12 @@ async def setup_sandbox_environment(
 
     # chmod, execute, and remove
     async def exec(cmd: list[str]) -> None:
-
+        try:
+            result = await env.exec(cmd, timeout=30)
+        except TimeoutError:
+            raise RuntimeError(
+                f"Timed out executing command {' '.join(cmd)} in sandbox"
+            )
 
         if not result.success:
             raise RuntimeError(
inspect_ai/util/_sandbox/docker/compose.py CHANGED
@@ -16,7 +16,7 @@ from .prereqs import (
     DOCKER_COMPOSE_REQUIRED_VERSION_PULL_POLICY,
     validate_docker_compose,
 )
-from .util import ComposeProject, is_inspect_project
+from .util import ComposeProject, is_inspect_project
 
 logger = getLogger(__name__)
 
@@ -31,7 +31,9 @@ async def compose_up(project: ComposeProject) -> None:
         project=project,
     )
     if not result.success:
-        msg =
+        msg = (
+            f"Failed to start docker services for {project.config}: " f"{result.stderr}"
+        )
         raise RuntimeError(msg)
 
 
@@ -94,7 +96,10 @@ async def compose_check_running(services: list[str], project: ComposeProject) ->
         for running_service in running_services:
             unhealthy_services.remove(running_service["Service"])
 
-        msg =
+        msg = (
+            "One or more docker containers failed to start from "
+            f"{project.config}: {','.join(unhealthy_services)}"
+        )
         raise RuntimeError(msg)
     else:
         raise RuntimeError("No services started")
@@ -152,8 +157,9 @@ async def compose_pull(
 
 async def compose_exec(
     command: list[str],
+    *,
     project: ComposeProject,
-    timeout: int | None
+    timeout: int | None,
     input: str | bytes | None = None,
     output_limit: int | None = None,
 ) -> ExecResult[str]:
@@ -206,7 +212,6 @@ async def compose_cleanup_images(
     cwd: str | None = None,
     timeout: int | None = None,
 ) -> None:
-    sandbox_log("Removing images")
     # List the images that would be created for this compose
     images_result = await compose_command(
         ["config", "--images"], project=project, cwd=cwd
@@ -241,10 +246,14 @@ async def compose_cleanup_images(
             logger.warning(msg)
 
 
+DEFAULT_COMPOSE_TIMEOUT = 60
+
+
 async def compose_command(
     command: list[str],
+    *,
     project: ComposeProject,
-    timeout: int | None =
+    timeout: int | None = DEFAULT_COMPOSE_TIMEOUT,
     input: str | bytes | None = None,
     cwd: str | Path | None = None,
     forward_env: bool = True,
@@ -278,16 +287,46 @@ async def compose_command(
     # build final command
     compose_command = compose_command + command
 
-    #
-
-
-
-
-
-
-
-
-
-
-
+    # function to run command
+    async def run_command(command_timeout: int | None) -> ExecResult[str]:
+        result = await subprocess(
+            compose_command,
+            input=input,
+            cwd=cwd,
+            env=env,
+            timeout=command_timeout,
+            capture_output=capture_output,
+            output_limit=output_limit,
+        )
+        return result
+
+    # we have observed underlying unreliability in docker compose in some linux
+    # environments on EC2 -- this exhibits in very simple commands (e.g. compose config)
+    # simply never returning. this tends to happen when we know there is a large
+    # number of commands in flight (task/sample init) so could be some sort of
+    # timing issue / race condition in the docker daemon. we've also observed that
+    # these same commands succeed if you just retry them. therefore, we add some
+    # extra resiliance by retrying commands with a timeout once. we were observing
+    # commands hanging at a rate of ~ 1/1000, so we retry up to twice (tweaking the
+    # retry time down) to make the odds of hanging vanishingly small
+
+    if timeout is not None:
+        MAX_RETRIES = 2
+        retries = 0
+        while True:
+            try:
+                command_timeout = (
+                    timeout if retries == 0 else (min(timeout, 60) // retries)
+                )
+                return await run_command(command_timeout)
+            except TimeoutError:
+                retries += 1
+                if retries <= MAX_RETRIES:
+                    logger.info(
+                        f"Retrying docker compose command: {shlex.join(compose_command)}"
+                    )
+                else:
+                    raise
+
+    else:
+        return await run_command(timeout)
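The retry strategy above can be expressed as a small generic helper; a hypothetical sketch (not part of inspect_ai) that retries any awaitable factory on `TimeoutError` with a tightened timeout:

```python
import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

async def retry_on_timeout(
    make_call: Callable[[float], Awaitable[T]],
    timeout: float,
    max_retries: int = 2,
) -> T:
    """Run make_call(timeout), retrying with a reduced timeout on TimeoutError."""
    retries = 0
    while True:
        try:
            # first attempt gets the full timeout; retries get a tighter one
            attempt_timeout = timeout if retries == 0 else min(timeout, 60) / retries
            return await make_call(attempt_timeout)
        except (TimeoutError, asyncio.TimeoutError):
            retries += 1
            if retries > max_retries:
                raise
```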
inspect_ai/util/_sandbox/docker/docker.py CHANGED
@@ -42,7 +42,7 @@ from .compose import (
 from .config import CONFIG_FILES, DOCKERFILE
 from .internal import build_internal_image, is_internal_image
 from .prereqs import validate_prereqs
-from .util import ComposeProject,
+from .util import ComposeProject, task_project_name
 
 logger = getLogger(__name__)
 
@@ -113,8 +113,6 @@ class DockerSandboxEnvironment(SandboxEnvironment):
         config: SandboxEnvironmentConfigType | None,
         metadata: dict[str, str],
     ) -> dict[str, SandboxEnvironment]:
-        sandbox_log("setup")
-
         # create environment variables for sample metadata
         env: dict[str, str] = {}
         if isinstance(config, str) and Path(config).exists():
@@ -264,7 +262,9 @@ class DockerSandboxEnvironment(SandboxEnvironment):
 
     @override
     async def write_file(self, file: str, contents: str | bytes) -> None:
-
+        # exec function w/ timeout
+        async def exec(cmd: list[str]) -> ExecResult[str]:
+            return await self.exec(cmd, timeout=60)
 
         # resolve relative file paths
         file = self.container_file(file)
@@ -311,8 +311,8 @@ class DockerSandboxEnvironment(SandboxEnvironment):
             local_tmpfile.close()  # this will also delete the file
 
         if not hasattr(self, "_docker_user"):
-            uid = (await
-            gid = (await
+            uid = (await exec(["id", "-u"])).stdout.strip()
+            gid = (await exec(["id", "-g"])).stdout.strip()
             self._docker_user = (uid, gid)
 
         await compose_command(
@@ -331,7 +331,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
         parent = PurePosixPath(file).parent
 
         # We do these steps in a shell script for efficiency to avoid round-trips to docker.
-        res_cp = await
+        res_cp = await exec(
             [
                 "sh",
                 "-e",
@@ -346,7 +346,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
 
         if res_cp.returncode != 0:
             if "Permission denied" in res_cp.stderr:
-                ls_result = await
+                ls_result = await exec(["ls", "-la", "."])
                 error_string = f"Permission was denied. Error details: {res_cp.stderr}; ls -la: {ls_result.stdout}; {self._docker_user=}"
                 raise PermissionError(error_string)
             elif (
@@ -367,8 +367,6 @@ class DockerSandboxEnvironment(SandboxEnvironment):
 
     @override
     async def read_file(self, file: str, text: bool = True) -> Union[str, bytes]:
-        sandbox_log(f"read_file: {file}")
-
         # Write the contents to a temp file
         with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
             # resolve relative file paths
@@ -449,7 +447,9 @@ class DockerSandboxEnvironment(SandboxEnvironment):
 async def container_working_dir(
     service: str, project: ComposeProject, default: str = "/"
 ) -> str:
-    result = await compose_exec(
+    result = await compose_exec(
+        [service, "sh", "-c", "pwd"], timeout=60, project=project
+    )
     if result.success:
         return result.stdout.strip()
     else:
inspect_ai/util/_sandbox/docker/util.py CHANGED
@@ -5,8 +5,6 @@ from pathlib import Path
 
 from shortuuid import uuid
 
-from inspect_ai._util.constants import SANDBOX
-
 from ..environment import SandboxEnvironmentConfigType
 from .config import (
     COMPOSE_DOCKERFILE_YAML,
@@ -94,7 +92,3 @@ inspect_project_pattern = r"^inspect-[a-z\d\-_]*-i[a-z\d]{22}$"
 
 def is_inspect_project(name: str) -> bool:
     return re.match(inspect_project_pattern, name) is not None
-
-
-def sandbox_log(msg: str) -> None:
-    logger.log(SANDBOX, f"DOCKER: {msg}")
inspect_ai/util/_sandbox/service.py CHANGED
@@ -10,6 +10,8 @@ from typing import (
 
 from pydantic import JsonValue
 
+from inspect_ai.util._subprocess import ExecResult
+
 from .environment import SandboxEnvironment
 
 REQUESTS_DIR = "requests"
@@ -129,9 +131,9 @@ class SandboxService:
         """Handle all pending service requests."""
         # list pending requests
         list_requests = f"ls -1 {self._requests_dir}/*.json"
-        result = await self.
+        result = await self._exec(["bash", "-c", list_requests])
 
-        # process
+        # process requests
         if result.success:
             request_files = result.stdout.strip().splitlines()
             if request_files:
@@ -142,7 +144,7 @@ class SandboxService:
     async def _handle_request(self, request_file: str) -> None:
         # read request
         read_request = f"cat {request_file}"
-        result = await self.
+        result = await self._exec(["bash", "-c", read_request])
         if not result.success:
             raise RuntimeError(
                 f"Error reading request for service {self._name}: '{read_request}' ({result.stderr})"
@@ -181,7 +183,7 @@ class SandboxService:
         await self._write_text_file(response_path, json.dumps(response_data))
 
         # remove request file
-        exec_rm = await self.
+        exec_rm = await self._exec(["rm", "-f", request_file])
         if not exec_rm.success:
             raise RuntimeError(
                 f"Error removing request file '{request_file}': {exec_rm.stderr}"
@@ -215,8 +217,8 @@ class SandboxService:
 
     async def _create_rpc_dir(self, name: str) -> str:
         rpc_dir = PurePosixPath(self._service_dir, name).as_posix()
-        result = await self.
-        result = await self.
+        result = await self._exec(["rm", "-rf", rpc_dir])
+        result = await self._exec(["mkdir", "-p", rpc_dir])
         if not result.success:
             raise RuntimeError(
                 f"Error creating rpc directory '{name}' for sandbox '{self._name}': {result.stderr}"
@@ -224,11 +226,19 @@ class SandboxService:
         return rpc_dir
 
     async def _write_text_file(self, file: str, contents: str) -> None:
-        result = await self.
+        result = await self._exec(["tee", "--", file], input=contents)
         if not result.success:
             msg = f"Failed to write file '{file}' into container: {result.stderr}"
             raise RuntimeError(msg)
 
+    async def _exec(self, cmd: list[str], input: str | None = None) -> ExecResult[str]:
+        try:
+            return await self._sandbox.exec(cmd, input=input, timeout=30)
+        except TimeoutError:
+            raise RuntimeError(
+                f"Timed out executing command {' '.join(cmd)} in sandbox"
+            )
+
     def _generate_client(self) -> str:
         return dedent(f"""
             from typing import Any
inspect_ai/util/_subprocess.py CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import os
+import shlex
 import sys
 from asyncio.subprocess import Process
 from contextvars import ContextVar
@@ -8,6 +9,8 @@ from logging import getLogger
 from pathlib import Path
 from typing import AsyncGenerator, Generic, Literal, TypeVar, Union, cast, overload
 
+from inspect_ai._util.trace import trace_action
+
 from ._concurrency import concurrency
 
 logger = getLogger(__name__)
@@ -217,7 +220,9 @@ async def subprocess(
 
     # run command
     async with concurrency("subprocesses", max_subprocesses_context_var.get()):
-
+        message = args if isinstance(args, str) else shlex.join(args)
+        with trace_action(logger, "Subprocess", message):
+            return await run_command_timeout()
 
 
 def init_max_subprocesses(max_subprocesses: int | None = None) -> None:
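Every `subprocess` invocation is now traced with a shlex-joined command string. A minimal usage sketch of the utility itself (the command shown is illustrative):

```python
from inspect_ai.util import subprocess

async def disk_usage(path: str) -> str:
    # run a command with a timeout; a TimeoutError propagates if it expires
    result = await subprocess(["du", "-sh", path], timeout=30)
    if not result.success:
        raise RuntimeError(f"du failed: {result.stderr}")
    return result.stdout.strip()
```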
inspect_ai/util/_subtask.py CHANGED
@@ -1,6 +1,7 @@
 import asyncio
 import inspect
 from functools import wraps
+from logging import getLogger
 from typing import (
     Any,
     Callable,
@@ -13,6 +14,7 @@ from typing import (
 
 from inspect_ai._util._async import is_callable_coroutine
 from inspect_ai._util.content import Content
+from inspect_ai._util.trace import trace_action
 from inspect_ai.util._store import Store, dict_jsonable, init_subtask_store
 
 SubtaskResult = str | int | float | bool | list[Content]
@@ -20,6 +22,9 @@ SubtaskResult = str | int | float | bool | list[Content]
 RT = TypeVar("RT", SubtaskResult, Any)
 
 
+logger = getLogger(__name__)
+
+
 @runtime_checkable
 class Subtask(Protocol):
     """Subtask with distinct `Store` and `Transcript`.
@@ -118,8 +123,9 @@ def subtask(
         init_subtask(subtask_name, store if store else Store())
 
         # run the subtask
-        with
-
+        with trace_action(logger, "Subtask", subtask_name):
+            with track_store_changes():  # type: ignore
+                result = await func(*args, **kwargs)
 
         # return result and event
         return result, list(transcript().events)
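Subtask execution is likewise wrapped in `trace_action`, keyed by the subtask name. For context, a minimal sketch of declaring a subtask with the public decorator (the body is illustrative):

```python
from inspect_ai.util import subtask

@subtask
async def add(x: int, y: int) -> int:
    # runs with its own Store and Transcript, and now appears as a traced action
    return x + y

# from solver code (illustrative):
#     result = await add(1, 2)
```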
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: inspect_ai
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.55
|
4
4
|
Summary: Framework for large language model evaluations
|
5
5
|
Author: UK AI Safety Institute
|
6
6
|
License: MIT License
|
@@ -67,7 +67,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
|
|
67
67
|
Requires-Dist: pytest-cov; extra == "dev"
|
68
68
|
Requires-Dist: pytest-dotenv; extra == "dev"
|
69
69
|
Requires-Dist: pytest-xdist; extra == "dev"
|
70
|
-
Requires-Dist: ruff==0.8.
|
70
|
+
Requires-Dist: ruff==0.8.4; extra == "dev"
|
71
71
|
Requires-Dist: textual-dev>=0.86.2; extra == "dev"
|
72
72
|
Requires-Dist: types-PyYAML; extra == "dev"
|
73
73
|
Requires-Dist: types-beautifulsoup4; extra == "dev"
|
@@ -96,22 +96,22 @@ To get started with Inspect, please see the documentation at <https://inspect.ai
|
|
96
96
|
|
97
97
|
***
|
98
98
|
|
99
|
-
|
100
|
-
|
101
99
|
To work on development of Inspect, clone the repository and install with the `-e` flag and `[dev]` optional dependencies:
|
102
100
|
|
103
101
|
```bash
|
104
|
-
|
105
|
-
|
106
|
-
|
102
|
+
git clone https://github.com/UKGovernmentBEIS/inspect_ai.git
|
103
|
+
cd inspect_ai
|
104
|
+
pip install -e ".[dev]"
|
107
105
|
```
|
108
106
|
|
109
107
|
Optionally install pre-commit hooks via
|
108
|
+
|
110
109
|
```bash
|
111
110
|
make hooks
|
112
111
|
```
|
113
112
|
|
114
113
|
Run linting, formatting, and tests via
|
114
|
+
|
115
115
|
```bash
|
116
116
|
make check
|
117
117
|
make test
|