inspect-ai 0.3.52__py3-none-any.whl → 0.3.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. inspect_ai/_cli/eval.py +55 -1
  2. inspect_ai/_cli/main.py +2 -0
  3. inspect_ai/_cli/trace.py +244 -0
  4. inspect_ai/_display/core/progress.py +9 -3
  5. inspect_ai/_display/core/results.py +8 -4
  6. inspect_ai/_display/textual/app.py +5 -1
  7. inspect_ai/_display/textual/widgets/task_detail.py +3 -0
  8. inspect_ai/_display/textual/widgets/tasks.py +97 -6
  9. inspect_ai/_eval/eval.py +33 -0
  10. inspect_ai/_eval/evalset.py +4 -0
  11. inspect_ai/_eval/registry.py +2 -2
  12. inspect_ai/_eval/task/images.py +4 -14
  13. inspect_ai/_eval/task/results.py +22 -4
  14. inspect_ai/_eval/task/run.py +40 -20
  15. inspect_ai/_eval/task/sandbox.py +72 -43
  16. inspect_ai/_eval/task/task.py +4 -0
  17. inspect_ai/_eval/task/util.py +2 -0
  18. inspect_ai/_util/constants.py +3 -3
  19. inspect_ai/_util/display.py +1 -0
  20. inspect_ai/_util/logger.py +34 -8
  21. inspect_ai/_util/trace.py +275 -0
  22. inspect_ai/_view/www/App.css +13 -0
  23. inspect_ai/_view/www/dist/assets/index.css +13 -0
  24. inspect_ai/_view/www/dist/assets/index.js +80 -43
  25. inspect_ai/_view/www/src/App.mjs +31 -6
  26. inspect_ai/_view/www/src/Types.mjs +6 -0
  27. inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
  28. inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
  29. inspect_ai/_view/www/src/components/Tools.mjs +46 -18
  30. inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
  31. inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
  32. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
  33. inspect_ai/log/_log.py +6 -0
  34. inspect_ai/log/_message.py +2 -2
  35. inspect_ai/log/_recorders/eval.py +8 -18
  36. inspect_ai/log/_recorders/json.py +19 -17
  37. inspect_ai/model/_cache.py +22 -16
  38. inspect_ai/model/_call_tools.py +9 -1
  39. inspect_ai/model/_generate_config.py +8 -2
  40. inspect_ai/model/_model.py +11 -12
  41. inspect_ai/model/_providers/azureai.py +1 -1
  42. inspect_ai/model/_providers/bedrock.py +18 -2
  43. inspect_ai/model/_providers/hf.py +1 -1
  44. inspect_ai/model/_providers/openai.py +32 -8
  45. inspect_ai/model/_providers/providers.py +1 -1
  46. inspect_ai/model/_providers/vllm.py +1 -1
  47. inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
  48. inspect_ai/util/_sandbox/context.py +7 -3
  49. inspect_ai/util/_sandbox/docker/compose.py +58 -19
  50. inspect_ai/util/_sandbox/docker/config.py +8 -10
  51. inspect_ai/util/_sandbox/docker/docker.py +20 -16
  52. inspect_ai/util/_sandbox/docker/util.py +3 -9
  53. inspect_ai/util/_sandbox/environment.py +7 -2
  54. inspect_ai/util/_sandbox/limits.py +1 -1
  55. inspect_ai/util/_sandbox/local.py +8 -9
  56. inspect_ai/util/_sandbox/service.py +17 -7
  57. inspect_ai/util/_subprocess.py +6 -1
  58. inspect_ai/util/_subtask.py +8 -2
  59. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/METADATA +6 -8
  60. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/RECORD +64 -62
  61. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/LICENSE +0 -0
  62. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/WHEEL +0 -0
  63. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/entry_points.txt +0 -0
  64. {inspect_ai-0.3.52.dist-info → inspect_ai-0.3.54.dist-info}/top_level.txt +0 -0
inspect_ai/model/_cache.py

@@ -6,10 +6,12 @@ from datetime import datetime, timezone
 from hashlib import md5
 from pathlib import Path
 from shutil import rmtree
+from typing import Any
 
 from dateutil.relativedelta import relativedelta
 
 from inspect_ai._util.appdirs import inspect_cache_dir
+from inspect_ai._util.trace import trace_message
 from inspect_ai.tool import ToolChoice, ToolInfo
 
 from ._chat_message import ChatMessage
@@ -19,6 +21,10 @@ from ._model_output import ModelOutput
 logger = logging.getLogger(__name__)
 
 
+def trace(msg: str, *args: Any) -> None:
+    trace_message(logger, "Cache", msg, args)
+
+
 def _path_is_in_cache(path: Path | str) -> bool:
     """This ensures the path is in our cache directory, just in case the `model` is ../../../home/ubuntu/maliciousness"""
     if isinstance(path, str):
@@ -153,7 +159,7 @@ def _cache_key(entry: CacheEntry) -> str:
 
     base_string = "|".join([str(component) for component in components])
 
-    logger.debug(_cache_key_debug_string([str(component) for component in components]))
+    trace(_cache_key_debug_string([str(component) for component in components]))
 
     return md5(base_string.encode("utf-8")).hexdigest()
 
@@ -192,11 +198,11 @@ def cache_store(
 
         with open(filename, "wb") as f:
             expiry = _cache_expiry(entry.policy)
-            logger.debug("Storing in cache: %s (expires: %s)", filename, expiry)
+            trace("Storing in cache: %s (expires: %s)", filename, expiry)
             pickle.dump((expiry, output), f)
             return True
     except Exception as e:
-        logger.debug(f"Failed to cache {filename}: {e}")
+        trace(f"Failed to cache {filename}: {e}")
         return False
 
 
@@ -204,12 +210,12 @@ def cache_fetch(entry: CacheEntry) -> ModelOutput | None:
     """Fetch a value from the cache directory."""
     filename = cache_path(model=entry.model) / _cache_key(entry)
     try:
-        logger.debug("Fetching from cache: %s", filename)
+        trace("Fetching from cache: %s", filename)
 
         with open(filename, "rb") as f:
             expiry, output = pickle.load(f)
             if not isinstance(output, ModelOutput):
-                logger.debug(
+                trace(
                     "Unexpected cached type, can only fetch ModelOutput: %s (%s)",
                     type(output),
                     filename,
@@ -217,7 +223,7 @@ def cache_fetch(entry: CacheEntry) -> ModelOutput | None:
                 return None
 
             if _is_expired(expiry):
-                logger.debug("Cache expired for %s (%s)", filename, expiry)
+                trace("Cache expired for %s (%s)", filename, expiry)
                 # If it's expired, no point keeping it as we'll never access it
                 # successfully again.
                 filename.unlink(missing_ok=True)
@@ -225,7 +231,7 @@ def cache_fetch(entry: CacheEntry) -> ModelOutput | None:
 
             return output
     except Exception as e:
-        logger.debug(f"Failed to fetch from cache {filename}: {e}")
+        trace(f"Failed to fetch from cache {filename}: {e}")
         return None
 
 
@@ -235,7 +241,7 @@ def cache_clear(model: str = "") -> bool:
     path = cache_path(model)
 
     if (model == "" or _path_is_in_cache(path)) and path.exists():
-        logger.debug("Clearing cache: %s", path)
+        trace("Clearing cache: %s", path)
         rmtree(path)
         return True
 
@@ -351,24 +357,24 @@ def cache_list_expired(filter_by: list[str] = []) -> list[Path]:
         # "../../foo/bar") but we don't want to search the entire cache
         return []
 
-    logger.debug("Filtering by paths: %s", filter_by_paths)
+    trace("Filtering by paths: %s", filter_by_paths)
    for dirpath, _dirnames, filenames in os.walk(cache_path()):
        if filter_by_paths and Path(dirpath) not in filter_by_paths:
-            logger.debug("Skipping path %s", dirpath)
+            trace("Skipping path %s", dirpath)
            continue
 
-        logger.debug("Checking dirpath %s", dirpath)
+        trace("Checking dirpath %s", dirpath)
        for filename in filenames:
            path = Path(dirpath) / filename
-            logger.debug("Checking path %s", path)
+            trace("Checking path %s", path)
            try:
                with open(path, "rb") as f:
                    expiry, _cache_entry = pickle.load(f)
                    if _is_expired(expiry):
-                        logger.debug("Expired cache entry found: %s (%s)", path, expiry)
+                        trace("Expired cache entry found: %s (%s)", path, expiry)
                        expired_cache_entries.append(path)
            except Exception as e:
-                logger.debug("Failed to load cached item %s: %s", path, e)
+                trace("Failed to load cached item %s: %s", path, e)
                continue
 
    return expired_cache_entries
@@ -389,8 +395,8 @@ def cache_prune(files: list[Path] = []) -> None:
            with open(file, "rb") as f:
                expiry, _cache_entry = pickle.load(f)
                if _is_expired(expiry):
-                    logger.debug("Pruning expired cache: %s", file)
+                    trace("Pruning expired cache: %s", file)
                    file.unlink(missing_ok=True)
        except Exception as e:
-            logger.debug("Failed to prune cache %s: %s", file, e)
+            trace("Failed to prune cache %s: %s", file, e)
            continue
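
The change above introduces a small module-level trace() helper that forwards to the new trace_message() utility with a fixed "Cache" category, replacing the previous logger.debug() calls. A minimal sketch of the same convention applied to a hypothetical module, based only on the call signature visible in this diff:

    import logging

    from inspect_ai._util.trace import trace_message

    logger = logging.getLogger(__name__)


    # hypothetical helper mirroring the one added to _cache.py;
    # "Example" is an illustrative category name, not one used by the package
    def trace(msg: str, *args: object) -> None:
        trace_message(logger, "Example", msg, args)


    trace("processed %d files under %s", 3, "some/dir")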
inspect_ai/model/_call_tools.py

@@ -1,6 +1,7 @@
 import asyncio
 import inspect
 from dataclasses import is_dataclass
+from logging import getLogger
 from textwrap import dedent
 from typing import (
     Any,
@@ -19,7 +20,9 @@ from jsonschema import Draft7Validator
 from pydantic import BaseModel
 
 from inspect_ai._util.content import Content, ContentImage, ContentText
+from inspect_ai._util.format import format_function_call
 from inspect_ai._util.text import truncate_string_to_bytes
+from inspect_ai._util.trace import trace_action
 from inspect_ai.model._trace import trace_tool_mesage
 from inspect_ai.tool import Tool, ToolCall, ToolError, ToolInfo
 from inspect_ai.tool._tool import (
@@ -35,6 +38,8 @@ from inspect_ai.util import OutputLimitExceededError
 from ._chat_message import ChatMessageAssistant, ChatMessageTool
 from ._generate_config import active_generate_config
 
+logger = getLogger(__name__)
+
 
 async def call_tools(
     message: ChatMessageAssistant,
@@ -215,7 +220,10 @@ async def call_tool(tools: list[ToolDef], message: str, call: ToolCall) -> Any:
     arguments = tool_params(call.arguments, tool_def.tool)
 
     # call the tool
-    result = await tool_def.tool(**arguments)
+    with trace_action(
+        logger, "Tool Call", format_function_call(tool_def.name, arguments, width=1000)
+    ):
+        result = await tool_def.tool(**arguments)
 
     # return result
     return result
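
Here trace_action() is used as an ordinary context manager wrapped around the awaited tool call, so the call is bracketed by trace events. A minimal sketch of that pattern as it appears at this call site (run_tool and its arguments are hypothetical; the internals of trace_action are not part of this diff):

    from logging import getLogger
    from typing import Any

    from inspect_ai._util.trace import trace_action

    logger = getLogger(__name__)


    async def run_tool(tool: Any, name: str, arguments: dict[str, Any]) -> Any:
        # bracket the call so its start, completion, and any raised error
        # are recorded under the "Tool Call" trace category
        with trace_action(logger, "Tool Call", f"{name}({arguments})"):
            return await tool(**arguments)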
inspect_ai/model/_generate_config.py

@@ -58,7 +58,7 @@ class GenerateConfigArgs(TypedDict, total=False):
     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, and TogetherAI only."""
 
     logprobs: bool | None
-    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, and Huggingface only."""
+    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""
 
     top_logprobs: int | None
     """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Google, Grok, and Huggingface only."""
@@ -72,6 +72,9 @@ class GenerateConfigArgs(TypedDict, total=False):
     cache_prompt: Literal["auto"] | bool | None
     """Whether to cache the prompt prefix. Defaults to "auto", which will enable caching for requests with tools. Anthropic only."""
 
+    reasoning_effort: Literal["low", "medium", "high"] | None
+    """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
+
 
 class GenerateConfig(BaseModel):
     """Base class for model generation configs."""
@@ -125,7 +128,7 @@ class GenerateConfig(BaseModel):
     """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, and vLLM only."""
 
     logprobs: bool | None = Field(default=None)
-    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, and vLLM only."""
+    """Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only."""
 
     top_logprobs: int | None = Field(default=None)
     """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Google, Grok, Huggingface, and vLLM only."""
@@ -139,6 +142,9 @@ class GenerateConfig(BaseModel):
     cache_prompt: Literal["auto"] | bool | None = Field(default=None)
     """Whether to cache the prompt prefix. Defaults to "auto", which will enable caching for requests with tools. Anthropic only."""
 
+    reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
+    """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
+
     def merge(
         self, other: Union["GenerateConfig", GenerateConfigArgs]
     ) -> "GenerateConfig":
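
Since reasoning_effort is an ordinary GenerateConfig field, it should be usable like any other generation option once this version is installed; a minimal sketch (model name chosen for illustration; the option is only forwarded for OpenAI o1 models, per openai.py below):

    from inspect_ai.model import GenerateConfig, get_model

    model = get_model("openai/o1", config=GenerateConfig(reasoning_effort="high"))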
inspect_ai/model/_model.py

@@ -9,7 +9,6 @@ from contextvars import ContextVar
 from copy import deepcopy
 from typing import Any, Callable, Literal, Type, cast
 
-from shortuuid import uuid
 from tenacity import (
     retry,
     retry_if_exception,
@@ -30,6 +29,7 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
 )
 from inspect_ai._util.retry import log_rate_limit_retry
+from inspect_ai._util.trace import trace_action
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
@@ -363,17 +363,16 @@ class Model:
             cache="write" if cache else None,
         )
 
-        generate_id = uuid()
-        logger.debug(f"model generate {generate_id} ({str(self)})")
-        time_start = time.perf_counter()
-        result = await self.api.generate(
-            input=input,
-            tools=tools,
-            tool_choice=tool_choice,
-            config=config,
-        )
-        time_elapsed = time.perf_counter() - time_start
-        logger.debug(f"model generate {generate_id} (completed)")
+        with trace_action(logger, "Model", f"generate ({str(self)})"):
+            time_start = time.perf_counter()
+            result = await self.api.generate(
+                input=input,
+                tools=tools,
+                tool_choice=tool_choice,
+                config=config,
+            )
+            time_elapsed = time.perf_counter() - time_start
+
         if isinstance(result, tuple):
             output, call = result
         else:
inspect_ai/model/_providers/azureai.py

@@ -93,7 +93,7 @@ class AzureAIAPI(ModelAPI):
         def collect_model_arg(name: str) -> Any | None:
             nonlocal model_args
             value = model_args.get(name, None)
-            if value:
+            if value is not None:
                 model_args.pop(name)
             return value
 
inspect_ai/model/_providers/bedrock.py

@@ -236,15 +236,21 @@ class BedrockAPI(ModelAPI):
         self,
         model_name: str,
         base_url: str | None,
+        api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
         **model_args: Any,
     ):
         super().__init__(
             model_name=model_name,
             base_url=model_base_url(base_url, "BEDROCK_BASE_URL"),
+            api_key=api_key,
+            api_key_vars=[],
             config=config,
         )
 
+        # save model_args
+        self.model_args = model_args
+
         # import aioboto3 on demand
         try:
             import aioboto3
@@ -263,6 +269,9 @@ class BedrockAPI(ModelAPI):
 
     @override
     def max_tokens(self) -> int | None:
+        if "llama3-70" in self.model_name or "llama3-8" in self.model_name:
+            return 2048
+
         if "llama3" in self.model_name or "claude3" in self.model_name:
             return 4096
 
@@ -303,7 +312,7 @@ class BedrockAPI(ModelAPI):
         from botocore.exceptions import ClientError
 
         # The bedrock client
-        async with self.session.client(
+        async with self.session.client(  # type: ignore[call-overload]
             service_name="bedrock-runtime",
             endpoint_url=self.base_url,
             config=Config(
@@ -316,6 +325,7 @@ class BedrockAPI(ModelAPI):
                     mode="adaptive",
                 ),
             ),
+            **self.model_args,
         ) as client:
             # Process the tools
             resolved_tools = converse_tools(tools)
@@ -658,6 +668,8 @@ def converse_image_type(type: str) -> ConverseImageFormat:
             return "png"
         case "image/webp":
             return "webp"
+        case "image/jpeg":
+            return "jpeg"
         case _:
             raise ValueError(
                 f"Image mime type {type} is not supported for Bedrock Converse models."
@@ -673,7 +685,11 @@ def converse_tools(tools: list[ToolInfo]) -> list[ConverseTool] | None:
         tool_spec = ConverseToolSpec(
             name=tool.name,
             description=tool.description,
-            inputSchema={"json": tool.parameters.model_dump(exclude_none=True)},
+            inputSchema={
+                "json": tool.parameters.model_dump(
+                    exclude_none=True, exclude={"additionalProperties"}
+                )
+            },
         )
         result.append(ConverseTool(toolSpec=tool_spec))
     return result
inspect_ai/model/_providers/hf.py

@@ -64,7 +64,7 @@ class HuggingFaceAPI(ModelAPI):
         def collect_model_arg(name: str) -> Any | None:
             nonlocal model_args
             value = model_args.get(name, None)
-            if value:
+            if value is not None:
                 model_args.pop(name)
             return value
 
inspect_ai/model/_providers/openai.py

@@ -18,6 +18,7 @@ from openai.types.chat import (
     ChatCompletionContentPartImageParam,
     ChatCompletionContentPartParam,
     ChatCompletionContentPartTextParam,
+    ChatCompletionDeveloperMessageParam,
     ChatCompletionMessage,
     ChatCompletionMessageParam,
     ChatCompletionMessageToolCallParam,
@@ -141,6 +142,18 @@ class OpenAIAPI(ModelAPI):
             **model_args,
         )
 
+    def is_o1(self) -> bool:
+        return self.model_name.startswith("o1")
+
+    def is_o1_full(self) -> bool:
+        return self.is_o1() and not self.is_o1_mini() and not self.is_o1_preview()
+
+    def is_o1_mini(self) -> bool:
+        return self.model_name.startswith("o1-mini")
+
+    def is_o1_preview(self) -> bool:
+        return self.model_name.startswith("o1-preview")
+
     async def generate(
         self,
         input: list[ChatMessage],
@@ -148,8 +161,8 @@ class OpenAIAPI(ModelAPI):
         tool_choice: ToolChoice,
         config: GenerateConfig,
     ) -> ModelOutput | tuple[ModelOutput, ModelCall]:
-        # short-circuit to call o1- model
-        if self.model_name.startswith("o1-"):
+        # short-circuit to call o1- models that are text only
+        if self.is_o1_preview() or self.is_o1_mini():
             return await generate_o1(
                 client=self.client,
                 input=input,
@@ -179,7 +192,7 @@ class OpenAIAPI(ModelAPI):
 
         # prepare request (we do this so we can log the ModelCall)
         request = dict(
-            messages=await as_openai_chat_messages(input),
+            messages=await as_openai_chat_messages(input, self.is_o1_full()),
             tools=chat_tools(tools) if len(tools) > 0 else NOT_GIVEN,
             tool_choice=chat_tool_choice(tool_choice) if len(tools) > 0 else NOT_GIVEN,
             **self.completion_params(config, len(tools) > 0),
@@ -271,8 +284,10 @@ class OpenAIAPI(ModelAPI):
             params["logprobs"] = config.logprobs
         if config.top_logprobs is not None:
             params["top_logprobs"] = config.top_logprobs
-        if tools and config.parallel_tool_calls is not None:
+        if tools and config.parallel_tool_calls is not None and not self.is_o1():
             params["parallel_tool_calls"] = config.parallel_tool_calls
+        if config.reasoning_effort is not None and self.is_o1_full():
+            params["reasoning_effort"] = config.reasoning_effort
 
         return params
 
@@ -291,14 +306,23 @@ class OpenAIAPI(ModelAPI):
 
 
 async def as_openai_chat_messages(
-    messages: list[ChatMessage],
+    messages: list[ChatMessage], o1_full: bool
 ) -> list[ChatCompletionMessageParam]:
-    return [await openai_chat_message(message) for message in messages]
+    return [await openai_chat_message(message, o1_full) for message in messages]
 
 
-async def openai_chat_message(message: ChatMessage) -> ChatCompletionMessageParam:
+async def openai_chat_message(
+    message: ChatMessage, o1_full: bool
+) -> ChatCompletionMessageParam:
     if message.role == "system":
-        return ChatCompletionSystemMessageParam(role=message.role, content=message.text)
+        if o1_full:
+            return ChatCompletionDeveloperMessageParam(
+                role="developer", content=message.text
+            )
+        else:
+            return ChatCompletionSystemMessageParam(
+                role=message.role, content=message.text
+            )
     elif message.role == "user":
         return ChatCompletionUserMessageParam(
             role=message.role,
inspect_ai/model/_providers/providers.py

@@ -242,7 +242,7 @@ def mockllm() -> type[ModelAPI]:
 def validate_openai_client(feature: str) -> None:
     FEATURE = feature
     PACKAGE = "openai"
-    MIN_VERSION = "1.45.0"
+    MIN_VERSION = "1.58.1"
 
     # verify we have the package
     try:
inspect_ai/model/_providers/vllm.py

@@ -75,7 +75,7 @@ class VLLMAPI(ModelAPI):
         def collect_model_arg(name: str) -> Any | None:
             nonlocal model_args
             value = model_args.get(name, None)
-            if value:
+            if value is not None:
                 model_args.pop(name)
             return value
 
inspect_ai/tool/_tools/_web_browser/_web_browser.py

@@ -362,7 +362,7 @@ async def web_browser_cmd(cmd: str, *args: str) -> str:
     else:
         arg_list = ["python3", WEB_CLIENT_REQUEST, cmd] + list(args)
 
-    result = await sandbox_env.exec(arg_list)
+    result = await sandbox_env.exec(arg_list, timeout=180)
     if not result.success:
         raise RuntimeError(
             f"Error executing web browser command {cmd}({', '.join(args)}): {result.stderr}"
inspect_ai/util/_sandbox/context.py

@@ -109,7 +109,7 @@ def raise_no_sandbox() -> NoReturn:
 
 
 async def init_sandbox_environments_sample(
-    type: str,
+    sandboxenv_type: type[SandboxEnvironment],
     task_name: str,
     config: SandboxEnvironmentConfigType | None,
     files: dict[str, bytes],
@@ -117,7 +117,6 @@ async def init_sandbox_environments_sample(
     metadata: dict[str, Any],
 ) -> dict[str, SandboxEnvironment]:
     # get setup and cleanup functions
-    sandboxenv_type = registry_find_sandboxenv(type)
     sample_init = cast(SampleInit, getattr(sandboxenv_type, "sample_init"))
     sample_cleanup = cast(SampleCleanup, getattr(sandboxenv_type, "sample_cleanup"))
 
@@ -192,7 +191,12 @@ async def setup_sandbox_environment(
 
     # chmod, execute, and remove
     async def exec(cmd: list[str]) -> None:
-        result = await env.exec(cmd)
+        try:
+            result = await env.exec(cmd, timeout=30)
+        except TimeoutError:
+            raise RuntimeError(
+                f"Timed out executing command {' '.join(cmd)} in sandbox"
+            )
 
         if not result.success:
             raise RuntimeError(
inspect_ai/util/_sandbox/docker/compose.py

@@ -16,7 +16,7 @@ from .prereqs import (
     DOCKER_COMPOSE_REQUIRED_VERSION_PULL_POLICY,
     validate_docker_compose,
 )
-from .util import ComposeProject, is_inspect_project, sandbox_log
+from .util import ComposeProject, is_inspect_project
 
 logger = getLogger(__name__)
 
@@ -31,7 +31,9 @@ async def compose_up(project: ComposeProject) -> None:
         project=project,
     )
     if not result.success:
-        msg = f"Failed to start docker services {result.stderr}"
+        msg = (
+            f"Failed to start docker services for {project.config}: " f"{result.stderr}"
+        )
         raise RuntimeError(msg)
 
 
@@ -94,7 +96,10 @@ async def compose_check_running(services: list[str], project: ComposeProject) ->
         for running_service in running_services:
             unhealthy_services.remove(running_service["Service"])
 
-        msg = f"One or more docker containers failed to start {','.join(unhealthy_services)}"
+        msg = (
+            "One or more docker containers failed to start from "
+            f"{project.config}: {','.join(unhealthy_services)}"
+        )
         raise RuntimeError(msg)
     else:
         raise RuntimeError("No services started")
@@ -152,8 +157,9 @@ async def compose_pull(
 
 async def compose_exec(
     command: list[str],
+    *,
     project: ComposeProject,
-    timeout: int | None = None,
+    timeout: int | None,
     input: str | bytes | None = None,
     output_limit: int | None = None,
 ) -> ExecResult[str]:
@@ -206,7 +212,6 @@ async def compose_cleanup_images(
     cwd: str | None = None,
     timeout: int | None = None,
 ) -> None:
-    sandbox_log("Removing images")
     # List the images that would be created for this compose
     images_result = await compose_command(
         ["config", "--images"], project=project, cwd=cwd
@@ -241,10 +246,14 @@ async def compose_cleanup_images(
         logger.warning(msg)
 
 
+DEFAULT_COMPOSE_TIMEOUT = 60
+
+
 async def compose_command(
     command: list[str],
+    *,
     project: ComposeProject,
-    timeout: int | None = None,
+    timeout: int | None = DEFAULT_COMPOSE_TIMEOUT,
     input: str | bytes | None = None,
     cwd: str | Path | None = None,
     forward_env: bool = True,
@@ -278,16 +287,46 @@ async def compose_command(
     # build final command
     compose_command = compose_command + command
 
-    # Execute the command
-    sandbox_log(f"compose command: {shlex.join(compose_command)}")
-    result = await subprocess(
-        compose_command,
-        input=input,
-        cwd=cwd,
-        env=env,
-        timeout=timeout,
-        capture_output=capture_output,
-        output_limit=output_limit,
-    )
-    sandbox_log(f"compose command completed: {shlex.join(compose_command)}")
-    return result
+    # function to run command
+    async def run_command(command_timeout: int | None) -> ExecResult[str]:
+        result = await subprocess(
+            compose_command,
+            input=input,
+            cwd=cwd,
+            env=env,
+            timeout=command_timeout,
+            capture_output=capture_output,
+            output_limit=output_limit,
+        )
+        return result
+
+    # we have observed underlying unreliability in docker compose in some linux
+    # environments on EC2 -- this exhibits in very simple commands (e.g. compose config)
+    # simply never returning. this tends to happen when we know there is a large
+    # number of commands in flight (task/sample init) so could be some sort of
+    # timing issue / race condition in the docker daemon. we've also observed that
+    # these same commands succeed if you just retry them. therefore, we add some
+    # extra resiliance by retrying commands with a timeout once. we were observing
+    # commands hanging at a rate of ~ 1/1000, so we retry up to twice (tweaking the
+    # retry time down) to make the odds of hanging vanishingly small
+
+    if timeout is not None:
+        MAX_RETRIES = 2
+        retries = 0
+        while True:
+            try:
+                command_timeout = (
+                    timeout if retries == 0 else (min(timeout, 60) // retries)
+                )
+                return await run_command(command_timeout)
+            except TimeoutError:
+                retries += 1
+                if retries <= MAX_RETRIES:
+                    logger.info(
+                        f"Retrying docker compose command: {shlex.join(compose_command)}"
+                    )
+                else:
+                    raise
+
+    else:
+        return await run_command(timeout)
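
Read in isolation, the retry logic added above amounts to: run the compose command with a timeout, and on TimeoutError retry up to twice with a shortened timeout before re-raising. A stripped-down sketch of that control flow, independent of docker compose, with a generic run() coroutine factory standing in for the subprocess call:

    import asyncio
    from typing import Any, Awaitable, Callable

    MAX_RETRIES = 2


    async def run_with_retries(run: Callable[[], Awaitable[Any]], timeout: int) -> Any:
        retries = 0
        while True:
            try:
                # first attempt gets the full timeout, later attempts a shorter one
                attempt_timeout = timeout if retries == 0 else min(timeout, 60) // retries
                return await asyncio.wait_for(run(), attempt_timeout)
            except asyncio.TimeoutError:
                retries += 1
                if retries > MAX_RETRIES:
                    raise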
inspect_ai/util/_sandbox/docker/config.py

@@ -2,8 +2,6 @@ import os
 from logging import getLogger
 from pathlib import Path
 
-import aiofiles
-
 logger = getLogger(__name__)
 
 
@@ -17,7 +15,7 @@ CONFIG_FILES = [
 DOCKERFILE = "Dockerfile"
 
 
-async def resolve_compose_file(parent: str = "") -> str:
+def resolve_compose_file(parent: str = "") -> str:
     # existing compose file provides all the config we need
     compose = find_compose_file(parent)
     if compose is not None:
@@ -29,11 +27,11 @@ async def resolve_compose_file(parent: str = "") -> str:
 
     # dockerfile just needs a compose.yaml synthesized
     elif has_dockerfile(parent):
-        return await auto_compose_file(COMPOSE_DOCKERFILE_YAML, parent)
+        return auto_compose_file(COMPOSE_DOCKERFILE_YAML, parent)
 
     # otherwise provide a generic python container
     else:
-        return await auto_compose_file(COMPOSE_GENERIC_YAML, parent)
+        return auto_compose_file(COMPOSE_GENERIC_YAML, parent)
 
 
 def find_compose_file(parent: str = "") -> str | None:
@@ -59,9 +57,9 @@ def is_auto_compose_file(file: str) -> bool:
     return os.path.basename(file) == AUTO_COMPOSE_YAML
 
 
-async def ensure_auto_compose_file(file: str | None) -> None:
+def ensure_auto_compose_file(file: str | None) -> None:
     if file is not None and is_auto_compose_file(file) and not os.path.exists(file):
-        await resolve_compose_file(os.path.dirname(file))
+        resolve_compose_file(os.path.dirname(file))
 
 
 def safe_cleanup_auto_compose(file: str | None) -> None:
@@ -100,8 +98,8 @@ services:
 """
 
 
-async def auto_compose_file(contents: str, parent: str = "") -> str:
+def auto_compose_file(contents: str, parent: str = "") -> str:
     path = os.path.join(parent, AUTO_COMPOSE_YAML)
-    async with aiofiles.open(path, "w", encoding="utf-8") as f:
-        await f.write(contents)
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(contents)
     return Path(path).resolve().as_posix()