PyPI - inspect-ai - Versions diffs - 0.3.49__py3-none-any.whl → 0.3.51__py3-none-any.whl - Mend

inspect-ai 0.3.49__py3-none-any.whl → 0.3.51__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93) hide show

inspect_ai/_cli/info.py +2 -2
inspect_ai/_cli/log.py +2 -2
inspect_ai/_cli/score.py +2 -2
inspect_ai/_display/core/display.py +19 -0
inspect_ai/_display/core/panel.py +37 -7
inspect_ai/_display/core/progress.py +29 -2
inspect_ai/_display/core/results.py +79 -40
inspect_ai/_display/core/textual.py +21 -0
inspect_ai/_display/rich/display.py +28 -8
inspect_ai/_display/textual/app.py +107 -1
inspect_ai/_display/textual/display.py +1 -1
inspect_ai/_display/textual/widgets/samples.py +132 -91
inspect_ai/_display/textual/widgets/task_detail.py +236 -0
inspect_ai/_display/textual/widgets/tasks.py +74 -6
inspect_ai/_display/textual/widgets/toggle.py +32 -0
inspect_ai/_eval/context.py +2 -0
inspect_ai/_eval/eval.py +4 -3
inspect_ai/_eval/loader.py +1 -1
inspect_ai/_eval/run.py +35 -2
inspect_ai/_eval/task/log.py +13 -11
inspect_ai/_eval/task/results.py +12 -3
inspect_ai/_eval/task/run.py +139 -36
inspect_ai/_eval/task/sandbox.py +2 -1
inspect_ai/_util/_async.py +30 -1
inspect_ai/_util/file.py +31 -4
inspect_ai/_util/html.py +3 -0
inspect_ai/_util/logger.py +6 -5
inspect_ai/_util/platform.py +5 -6
inspect_ai/_util/registry.py +1 -1
inspect_ai/_view/server.py +9 -9
inspect_ai/_view/www/App.css +2 -2
inspect_ai/_view/www/dist/assets/index.css +2 -2
inspect_ai/_view/www/dist/assets/index.js +352 -294
inspect_ai/_view/www/log-schema.json +13 -0
inspect_ai/_view/www/package.json +1 -0
inspect_ai/_view/www/src/components/MessageBand.mjs +1 -1
inspect_ai/_view/www/src/components/Tools.mjs +16 -13
inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -3
inspect_ai/_view/www/src/samples/SampleScoreView.mjs +52 -77
inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -13
inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +15 -2
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +4 -2
inspect_ai/_view/www/src/types/log.d.ts +2 -0
inspect_ai/_view/www/src/workspace/WorkSpace.mjs +2 -0
inspect_ai/_view/www/yarn.lock +9 -4
inspect_ai/approval/__init__.py +1 -1
inspect_ai/approval/_human/approver.py +35 -0
inspect_ai/approval/_human/console.py +62 -0
inspect_ai/approval/_human/manager.py +108 -0
inspect_ai/approval/_human/panel.py +233 -0
inspect_ai/approval/_human/util.py +51 -0
inspect_ai/dataset/_sources/hf.py +2 -2
inspect_ai/dataset/_sources/util.py +1 -1
inspect_ai/log/_file.py +106 -36
inspect_ai/log/_recorders/eval.py +226 -158
inspect_ai/log/_recorders/file.py +9 -6
inspect_ai/log/_recorders/json.py +35 -12
inspect_ai/log/_recorders/recorder.py +15 -15
inspect_ai/log/_samples.py +52 -0
inspect_ai/model/_model.py +14 -0
inspect_ai/model/_model_output.py +4 -0
inspect_ai/model/_providers/azureai.py +1 -1
inspect_ai/model/_providers/hf.py +106 -4
inspect_ai/model/_providers/util/__init__.py +2 -0
inspect_ai/model/_providers/util/hf_handler.py +200 -0
inspect_ai/scorer/_common.py +1 -1
inspect_ai/solver/_plan.py +0 -8
inspect_ai/solver/_task_state.py +18 -1
inspect_ai/solver/_use_tools.py +9 -1
inspect_ai/tool/_tool_def.py +2 -2
inspect_ai/tool/_tool_info.py +14 -2
inspect_ai/tool/_tool_params.py +2 -1
inspect_ai/tool/_tools/_execute.py +1 -1
inspect_ai/tool/_tools/_web_browser/_web_browser.py +6 -0
inspect_ai/util/__init__.py +5 -6
inspect_ai/util/_panel.py +91 -0
inspect_ai/util/_sandbox/__init__.py +2 -6
inspect_ai/util/_sandbox/context.py +4 -3
inspect_ai/util/_sandbox/docker/compose.py +12 -2
inspect_ai/util/_sandbox/docker/docker.py +19 -9
inspect_ai/util/_sandbox/docker/util.py +10 -2
inspect_ai/util/_sandbox/environment.py +47 -41
inspect_ai/util/_sandbox/local.py +15 -10
inspect_ai/util/_subprocess.py +43 -3
{inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/METADATA +2 -2
{inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/RECORD +90 -82
inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
inspect_ai/approval/_human.py +0 -123
{inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/top_level.txt +0 -0

inspect_ai/util/_sandbox/environment.py CHANGED Viewed

@@ -1,3 +1,5 @@
+from __future__ import annotations
 import abc
 from dataclasses import dataclass, field
 from typing import Awaitable, Callable, Literal, NamedTuple, Union, overload
@@ -6,49 +8,37 @@ from pydantic import BaseModel, Field
 from .._subprocess import ExecResult
-TaskInit = Callable[[str, str | None], Awaitable[None]]
-TaskCleanup = Callable[[str, str | None, bool], Awaitable[None]]
+TaskInit = Callable[[str, Union["SandboxEnvironmentConfigType", None]], Awaitable[None]]
+TaskCleanup = Callable[
+    [str, Union["SandboxEnvironmentConfigType", None], bool], Awaitable[None]
+]
 SampleInit = Callable[
-    [str, str | None, dict[str, str]], Awaitable[dict[str, "SandboxEnvironment"]]
+    [str, Union["SandboxEnvironmentConfigType", None], dict[str, str]],
+    Awaitable[dict[str, "SandboxEnvironment"]],
 ]
 SampleCleanup = Callable[
-    [str, str | None, dict[str, "SandboxEnvironment"], bool], Awaitable[None]
+    [
+        str,
+        Union["SandboxEnvironmentConfigType", None],
+        dict[str, "SandboxEnvironment"],
+        bool,
+    ],
+    Awaitable[None],
 ]
-class SandboxConnectionBase(BaseModel):
+class SandboxConnection(BaseModel):
+    """Information required to connect to sandbox."""
     command: str
     """Shell command to connect to sandbox."""
-    working_dir: str
-    """Agent working directory."""
-class SandboxConnectionLocal(SandboxConnectionBase):
-    type: Literal["local"] = Field(default="local")
-class SandboxConnectionContainer(SandboxConnectionBase):
-    type: Literal["container"] = Field(default="container")
-    """Sandbox login type."""
-    container: str
-    """Container name."""
-class SandboxConnectionSSH(SandboxConnectionBase):
-    type: Literal["ssh"] = Field(default="ssh")
-    """Sandbox login type."""
-    destination: str
-    """SSH destination server."""
+    vscode_command: list[str] | None = Field(default=None)
+    """Optional vscode command (+args) to connect to sandbox."""
-SandboxConnection = Union[
-    SandboxConnectionContainer, SandboxConnectionLocal, SandboxConnectionSSH
-]
-"""Information required to connect to sandbox."""
+    container: str | None = Field(default=None)
+    """Optional container name (will not apply to all sandboxes)."""
 class SandboxEnvironment(abc.ABC):
@@ -64,24 +54,29 @@ class SandboxEnvironment(abc.ABC):
         return []
     @classmethod
-    async def task_init(cls, task_name: str, config: str | None) -> None:
+    async def task_init(
+        cls, task_name: str, config: SandboxEnvironmentConfigType | None
+    ) -> None:
         """Called at task startup initialize resources.
         Args:
           task_name (str): Name of task using the sandbox environment.
-          config (str): Implementation defined configuration file (optional).
+          config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
         """
         pass
     @classmethod
     async def sample_init(
-        cls, task_name: str, config: str | None, metadata: dict[str, str]
+        cls,
+        task_name: str,
+        config: SandboxEnvironmentConfigType | None,
+        metadata: dict[str, str],
     ) -> dict[str, "SandboxEnvironment"]:
         """Initialize sandbox environments for a sample.
         Args:
           task_name (str): Name of task using the sandbox environment.
-          config (str): Implementation defined configuration file (optional).
+          config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
           metadata (dict[str,str]): Sample `metadata` field
         Returns:
@@ -96,7 +91,7 @@ class SandboxEnvironment(abc.ABC):
     async def sample_cleanup(
         cls,
         task_name: str,
-        config: str | None,
+        config: SandboxEnvironmentConfigType | None,
         environments: dict[str, "SandboxEnvironment"],
         interrupted: bool,
     ) -> None:
@@ -104,7 +99,7 @@ class SandboxEnvironment(abc.ABC):
         Args:
           task_name (str): Name of task using the sandbox environment.
-          config (str): Implementation defined configuration file (optional).
+          config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
           environments (dict[str,SandboxEnvironment]): Sandbox environments created for this sample.
           interrupted (bool): Was the task interrupted by an error or cancellation
         """
@@ -112,13 +107,13 @@ class SandboxEnvironment(abc.ABC):
     @classmethod
     async def task_cleanup(
-        cls, task_name: str, config: str | None, cleanup: bool
+        cls, task_name: str, config: SandboxEnvironmentConfigType | None, cleanup: bool
     ) -> None:
         """Called at task exit as a last chance to cleanup resources.
         Args:
           task_name (str): Name of task using the sandbox environment.
-          config (str): Implementation defined configuration file (optional).
+          config (SandboxEnvironmentConfigType): Implementation defined configuration (optional).
           cleanup (bool): Whether to actually cleanup environment resources
             (False if `--no-sandbox-cleanup` was specified)
         """
@@ -227,6 +222,15 @@ class SandboxEnvironment(abc.ABC):
         ...
     async def connection(self) -> SandboxConnection:
+        """Information required to connect to sandbox environment.
+        Returns:
+           SandboxConnection: connection information
+        Raises:
+           NotImplementedError: For sandboxes that don't provide connections
+           ConnectionError: If sandbox is not currently running.
+        """
         raise NotImplementedError("connection not implemented")
@@ -248,8 +252,10 @@ class SandboxEnvironmentSpec(NamedTuple):
     """Specification of a SandboxEnvironment."""
     type: str
-    config: str | None = None
+    config: SandboxEnvironmentConfigType | None = None
+SandboxEnvironmentConfigType = BaseModel | str
 SandboxEnvironmentType = SandboxEnvironmentSpec | str | tuple[str, str]
 """SandboxEnvironmentSpec and str and tuple shorthands for it.

inspect_ai/util/_sandbox/local.py CHANGED Viewed

@@ -7,8 +7,15 @@ import aiofiles
 from typing_extensions import override
 from .._subprocess import ExecResult, subprocess
-from .environment import SandboxConnection, SandboxConnectionLocal, SandboxEnvironment
-from .limits import verify_exec_result_size, verify_read_file_size
+from .environment import (
+    SandboxEnvironment,
+    SandboxEnvironmentConfigType,
+)
+from .limits import (
+    SandboxEnvironmentLimits,
+    verify_exec_result_size,
+    verify_read_file_size,
+)
 from .registry import sandboxenv
@@ -17,7 +24,10 @@ class LocalSandboxEnvironment(SandboxEnvironment):
     @override
     @classmethod
     async def sample_init(
-        cls, task_name: str, config: str | None, metadata: dict[str, str]
+        cls,
+        task_name: str,
+        config: SandboxEnvironmentConfigType | None,
+        metadata: dict[str, str],
     ) -> dict[str, SandboxEnvironment]:
         return {"default": LocalSandboxEnvironment()}
@@ -26,7 +36,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
     async def sample_cleanup(
         cls,
         task_name: str,
-        config: str | None,
+        config: SandboxEnvironmentConfigType | None,
         environments: dict[str, SandboxEnvironment],
         interrupted: bool,
     ) -> None:
@@ -63,6 +73,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
             cwd=final_cwd,
             env=env,
             timeout=timeout,
+            output_limit=SandboxEnvironmentLimits.MAX_EXEC_OUTPUT_SIZE,
         )
         verify_exec_result_size(result)
         return result
@@ -97,12 +108,6 @@ class LocalSandboxEnvironment(SandboxEnvironment):
             async with aiofiles.open(file, "rb") as f:
                 return await f.read()
-    @override
-    async def connection(self) -> SandboxConnection:
-        return SandboxConnectionLocal(
-            command="/bin/bash --login", working_dir=self.directory.name
-        )
     def _resolve_file(self, file: str) -> str:
         path = Path(file)
         if path.is_absolute():

inspect_ai/util/_subprocess.py CHANGED Viewed

@@ -39,6 +39,7 @@ async def subprocess(
     cwd: str | Path | None = None,
     env: dict[str, str] = {},
     capture_output: bool = True,
+    output_limit: int | None = None,
     timeout: int | None = None,
 ) -> ExecResult[str]: ...
@@ -51,6 +52,7 @@ async def subprocess(
     cwd: str | Path | None = None,
     env: dict[str, str] = {},
     capture_output: bool = True,
+    output_limit: int | None = None,
     timeout: int | None = None,
 ) -> ExecResult[bytes]: ...
@@ -62,6 +64,7 @@ async def subprocess(
     cwd: str | Path | None = None,
     env: dict[str, str] = {},
     capture_output: bool = True,
+    output_limit: int | None = None,
     timeout: int | None = None,
 ) -> Union[ExecResult[str], ExecResult[bytes]]:
     """Execute and wait for a subprocess.
@@ -80,6 +83,8 @@ async def subprocess(
        env (dict[str, str]): Additional environment variables.
        capture_output (bool): Capture stderr and stdout into ExecResult
          (if False, then output is redirected to parent stderr/stdout)
+       output_limit (int | None): Stop reading output if it exceeds
+         the specified limit (in bytes).
        timeout (int | None): Timeout. If the timeout expires then
          a `TimeoutError` will be raised.
@@ -119,10 +124,45 @@ async def subprocess(
         # yield the proc
         yield proc
+        # write stdin if specified
+        if proc.stdin is not None:
+            if input is not None:
+                proc.stdin.write(input)
+                await proc.stdin.drain()
+            proc.stdin.close()
+            await proc.stdin.wait_closed()
+        # read streams incrementally so we can check output limits
+        async def read_stream(stream: asyncio.StreamReader | None) -> bytes:
+            # return early for no stream
+            if stream is None:
+                return bytes()
+            # read 8k at a time
+            output = bytearray()
+            while True:
+                # read chunk and terminate if we are done
+                chunk = await stream.read(8192)
+                if not chunk:
+                    break
+                # append to output
+                output.extend(chunk)
+                # stop if we have a limit and we have exceeded it
+                if output_limit is not None and len(output) > output_limit:
+                    proc.kill()
+                    break
+            # return stream output
+            return bytes(output)
         # wait for it to execute and yield result
-        stdout, stderr = await proc.communicate(input=input)
-        success = proc.returncode == 0
-        returncode = proc.returncode if proc.returncode is not None else 1
+        stdout, stderr = await asyncio.gather(
+            read_stream(proc.stdout), read_stream(proc.stderr)
+        )
+        returncode = await proc.wait()
+        success = returncode == 0
         if text:
             yield ExecResult[str](
                 success=success,

{inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: inspect_ai
-Version: 0.3.49
+Version: 0.3.51
 Summary: Framework for large language model evaluations
 Author: UK AI Safety Institute
 License: MIT License
@@ -68,7 +68,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
 Requires-Dist: pytest-dotenv; extra == "dev"
 Requires-Dist: pytest-xdist; extra == "dev"
-Requires-Dist: ruff==0.8.1; extra == "dev"
+Requires-Dist: ruff==0.8.2; extra == "dev"
 Requires-Dist: textual-dev>=0.86.2; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
 Requires-Dist: types-aiofiles; extra == "dev"

inspect-ai 0.3.49__py3-none-any.whl → 0.3.51__py3-none-any.whl

inspect-ai 0.3.49__py3-none-any.whl → 0.3.51__py3-none-any.whl