PyPI - inspect-ai - Versions diffs - 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl - Mend

inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

inspect_ai/__init__.py +2 -1
inspect_ai/_cli/common.py +4 -2
inspect_ai/_cli/eval.py +2 -0
inspect_ai/_cli/trace.py +21 -2
inspect_ai/_display/core/active.py +0 -2
inspect_ai/_display/core/panel.py +1 -1
inspect_ai/_display/rich/display.py +4 -4
inspect_ai/_display/textual/app.py +4 -1
inspect_ai/_display/textual/widgets/samples.py +41 -5
inspect_ai/_eval/eval.py +32 -20
inspect_ai/_eval/evalset.py +7 -5
inspect_ai/_eval/run.py +16 -11
inspect_ai/_eval/task/__init__.py +2 -2
inspect_ai/_eval/task/images.py +40 -25
inspect_ai/_eval/task/run.py +141 -119
inspect_ai/_eval/task/task.py +140 -25
inspect_ai/_util/constants.py +1 -0
inspect_ai/_util/content.py +23 -1
inspect_ai/_util/datetime.py +1 -1
inspect_ai/_util/deprecation.py +1 -1
inspect_ai/_util/images.py +20 -17
inspect_ai/_util/json.py +11 -1
inspect_ai/_util/kvstore.py +73 -0
inspect_ai/_util/logger.py +2 -1
inspect_ai/_util/notgiven.py +18 -0
inspect_ai/_util/thread.py +5 -0
inspect_ai/_util/trace.py +39 -3
inspect_ai/_util/transcript.py +36 -7
inspect_ai/_view/www/.prettierrc.js +12 -0
inspect_ai/_view/www/dist/assets/index.js +322 -226
inspect_ai/_view/www/log-schema.json +221 -138
inspect_ai/_view/www/src/App.mjs +18 -9
inspect_ai/_view/www/src/Types.mjs +0 -1
inspect_ai/_view/www/src/api/Types.mjs +15 -4
inspect_ai/_view/www/src/api/api-http.mjs +2 -0
inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
inspect_ai/_view/www/src/components/MessageContent.mjs +44 -2
inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
inspect_ai/_view/www/src/components/Tools.mjs +18 -3
inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +242 -178
inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
inspect_ai/_view/www/src/types/log.d.ts +53 -35
inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
inspect_ai/approval/_human/util.py +2 -2
inspect_ai/dataset/_sources/csv.py +2 -1
inspect_ai/dataset/_sources/json.py +2 -1
inspect_ai/dataset/_sources/util.py +15 -7
inspect_ai/log/_condense.py +11 -1
inspect_ai/log/_log.py +27 -5
inspect_ai/log/_recorders/eval.py +21 -8
inspect_ai/log/_samples.py +10 -5
inspect_ai/log/_transcript.py +28 -1
inspect_ai/model/__init__.py +10 -2
inspect_ai/model/_call_tools.py +82 -17
inspect_ai/model/_chat_message.py +2 -4
inspect_ai/model/{_trace.py → _conversation.py} +9 -8
inspect_ai/model/_model.py +2 -2
inspect_ai/model/_providers/anthropic.py +9 -7
inspect_ai/model/_providers/azureai.py +6 -4
inspect_ai/model/_providers/bedrock.py +6 -4
inspect_ai/model/_providers/google.py +103 -14
inspect_ai/model/_providers/groq.py +7 -5
inspect_ai/model/_providers/hf.py +11 -6
inspect_ai/model/_providers/mistral.py +6 -9
inspect_ai/model/_providers/openai.py +34 -8
inspect_ai/model/_providers/openai_o1.py +10 -12
inspect_ai/model/_providers/vertex.py +17 -4
inspect_ai/scorer/__init__.py +13 -2
inspect_ai/scorer/_metrics/__init__.py +2 -2
inspect_ai/scorer/_metrics/std.py +3 -3
inspect_ai/tool/__init__.py +9 -1
inspect_ai/tool/_tool.py +9 -2
inspect_ai/tool/_tool_info.py +2 -1
inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
inspect_ai/util/__init__.py +4 -3
inspect_ai/util/{_trace.py → _conversation.py} +3 -17
inspect_ai/util/_display.py +14 -4
inspect_ai/util/_sandbox/context.py +12 -13
inspect_ai/util/_sandbox/docker/compose.py +24 -13
inspect_ai/util/_sandbox/docker/docker.py +20 -13
inspect_ai/util/_sandbox/docker/util.py +2 -1
inspect_ai/util/_sandbox/environment.py +13 -1
inspect_ai/util/_sandbox/local.py +1 -0
inspect_ai/util/_sandbox/self_check.py +18 -18
inspect_ai/util/_store.py +2 -2
inspect_ai/util/_subprocess.py +3 -3
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA +3 -3
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/RECORD +107 -103
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/WHEEL +1 -1
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/top_level.txt +0 -0

inspect_ai/util/_sandbox/docker/docker.py CHANGED Viewed

@@ -138,28 +138,31 @@ class DockerSandboxEnvironment(SandboxEnvironment):
             # start the services
             await compose_up(project)
+            # check to ensure that the services are running
+            running_services = await compose_check_running(
+                list(services.keys()), project=project
+            )
             # note that the project is running
             project_startup(project)
-            # check to ensure that the services are running
-            await compose_check_running(list(services.keys()), project=project)
-            # create sandbox environments
+            # create sandbox environments for all running services
             default_service: str | None = None
             environments: dict[str, SandboxEnvironment] = {}
             for service, service_info in services.items():
-                # update the project w/ the working directory
-                working_dir = await container_working_dir(service, project)
+                if service in running_services:
+                    # update the project w/ the working directory
+                    working_dir = await container_working_dir(service, project)
-                # create the docker sandbox environemnt
-                docker_env = DockerSandboxEnvironment(service, project, working_dir)
+                    # create the docker sandbox environemnt
+                    docker_env = DockerSandboxEnvironment(service, project, working_dir)
-                # save reference to default service if requested
-                if service_info.get("x-default", False):
-                    default_service = service
+                    # save reference to default service if requested
+                    if service_info.get("x-default", False):
+                        default_service = service
-                # record service => environment
-                environments[service] = docker_env
+                    # record service => environment
+                    environments[service] = docker_env
             # confirm that we have a 'default' environemnt
             if environments.get("default", None) is None and default_service is None:
@@ -225,6 +228,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
         env: dict[str, str] = {},
         user: str | None = None,
         timeout: int | None = None,
+        timeout_retry: bool = True,
     ) -> ExecResult[str]:
         # additional args
         args = []
@@ -251,6 +255,7 @@ class DockerSandboxEnvironment(SandboxEnvironment):
             args + [self._service] + cmd,
             project=self._project,
             timeout=timeout,
+            timeout_retry=timeout_retry,
             input=input,
             output_limit=SandboxEnvironmentLimits.MAX_EXEC_OUTPUT_SIZE,
         )
@@ -428,11 +433,13 @@ class DockerSandboxEnvironment(SandboxEnvironment):
         # return container connection
         if container:
             return SandboxConnection(
+                type="docker",
                 command=f"docker exec -it {container} bash -l",
                 vscode_command=[
                     "remote-containers.attachToRunningContainer",
                     container,
                 ],
+                container=container,
             )
         # error (not currently running)
         else:

inspect_ai/util/_sandbox/docker/util.py CHANGED Viewed

@@ -84,7 +84,8 @@ def task_project_name(task: str) -> str:
     if len(task) == 0:
         task = "task"
-    return f"inspect-{task[:12]}-i{uuid().lower()[:6]}"
+    # _- breaks docker project name constraints so we strip trailing underscores.
+    return f"inspect-{task[:12].rstrip('_')}-i{uuid().lower()[:6]}"
 inspect_project_pattern = r"^inspect-[a-z\d\-_]*-i[a-z\d]{6,}$"

inspect_ai/util/_sandbox/environment.py CHANGED Viewed

@@ -31,12 +31,18 @@ SampleCleanup = Callable[
 class SandboxConnection(BaseModel):
     """Information required to connect to sandbox."""
+    type: str
+    """Sandbox type name (e.g. 'docker', 'local', etc.)"""
     command: str
     """Shell command to connect to sandbox."""
     vscode_command: list[Any] | None = Field(default=None)
     """Optional vscode command (+args) to connect to sandbox."""
+    container: str | None = Field(default=None)
+    """Optional container name (does not apply to all sandboxes)."""
 class SandboxEnvironment(abc.ABC):
     """Environment for executing arbitrary code from tools.
@@ -139,6 +145,7 @@ class SandboxEnvironment(abc.ABC):
         env: dict[str, str] = {},
         user: str | None = None,
         timeout: int | None = None,
+        timeout_retry: bool = True,
     ) -> ExecResult[str]:
         """Execute a command within a sandbox environment.
@@ -155,12 +162,17 @@ class SandboxEnvironment(abc.ABC):
           env (dict[str,str]): Environment variables for execution.
           user (str | None): Optional username or UID to run the command as.
           timeout (int | None): Optional execution timeout (seconds).
+          timeout_retry (bool): Retry the command in the case that it times out.
+            Commands will be retried up to twice, with a timeout of no greater
+            than 60 seconds for the first retry and 30 for the second.
         Returns:
           Execution result (status code, stderr/stdout, etc.)
         Raises:
-          TimeoutError: If the specified `timeout` expires.
+          TimeoutError: If the specified `timeout` expires
+            (and `timeout_retry` attempts also timeout).
           UnicodeDecodeError: If an error occurs while
             decoding the command output.
           PermissionError: If the user does not have

inspect_ai/util/_sandbox/local.py CHANGED Viewed

@@ -55,6 +55,7 @@ class LocalSandboxEnvironment(SandboxEnvironment):
         env: dict[str, str] = {},
         user: str | None = None,
         timeout: int | None = None,
+        timeout_retry: bool = True,
     ) -> ExecResult[str]:
         if user is not None:
             warnings.warn(

inspect_ai/util/_sandbox/self_check.py CHANGED Viewed

@@ -75,9 +75,9 @@ async def test_read_and_write_file_text(sandbox_env: SandboxEnvironment) -> None
     written_file_string = await sandbox_env.read_file(
         "test_read_and_write_file_text.file", text=True
     )
-    assert (
-        "great #content\nincluding newlines" == written_file_string
-    ), f"unexpected content: [{written_file_string}]"
+    assert "great #content\nincluding newlines" == written_file_string, (
+        f"unexpected content: [{written_file_string}]"
+    )
     await _cleanup_file(sandbox_env, "test_read_and_write_file_text.file")
@@ -219,9 +219,9 @@ async def test_exec_output(sandbox_env: SandboxEnvironment) -> None:
     exec_result = await sandbox_env.exec(["sh", "-c", "echo foo; echo bar"])
     expected = "foo\nbar\n"
     # in the assertion message, we show the actual bytes to help debug newline issues
-    assert (
-        exec_result.stdout == expected
-    ), f"Unexpected output:expected {expected.encode('UTF-8')!r}; got {exec_result.stdout.encode('UTF-8')!r}"
+    assert exec_result.stdout == expected, (
+        f"Unexpected output:expected {expected.encode('UTF-8')!r}; got {exec_result.stdout.encode('UTF-8')!r}"
+    )
 async def test_exec_timeout(sandbox_env: SandboxEnvironment) -> None:
@@ -248,13 +248,13 @@ async def test_exec_as_user(sandbox_env: SandboxEnvironment) -> None:
         # Test exec as different users
         root_result = await sandbox_env.exec(["whoami"], user="root")
-        assert (
-            root_result.stdout.strip() == "root"
-        ), f"Expected 'root', got '{root_result.stdout.strip()}'"
+        assert root_result.stdout.strip() == "root", (
+            f"Expected 'root', got '{root_result.stdout.strip()}'"
+        )
         myuser_result = await sandbox_env.exec(["whoami"], user=username)
-        assert (
-            myuser_result.stdout.strip() == username
-        ), f"Expected '{username}', got '{myuser_result.stdout.strip()}'"
+        assert myuser_result.stdout.strip() == username, (
+            f"Expected '{username}', got '{myuser_result.stdout.strip()}'"
+        )
     finally:
         # Clean up
         await sandbox_env.exec(["userdel", "-r", username], user="root")
@@ -266,9 +266,9 @@ async def test_exec_as_nonexistent_user(sandbox_env: SandboxEnvironment) -> None
     expected_error = (
         "unable to find user nonexistent: no matching entries in passwd file"
     )
-    assert (
-        expected_error in result.stdout
-    ), f"Error string '{expected_error}' not found in error output: '{result.stdout}'"
+    assert expected_error in result.stdout, (
+        f"Error string '{expected_error}' not found in error output: '{result.stdout}'"
+    )
 async def test_cwd_unspecified(sandbox_env: SandboxEnvironment) -> None:
@@ -291,9 +291,9 @@ async def test_cwd_relative(sandbox_env: SandboxEnvironment) -> None:
     file_path = cwd_subdirectory + "/" + file_name
     await sandbox_env.write_file(file_path, "ls me plz")
     current_dir_contents = (await sandbox_env.exec(["ls"], cwd=cwd_subdirectory)).stdout
-    assert (
-        file_name in current_dir_contents
-    ), f"{file_name} not found in {current_dir_contents}"
+    assert file_name in current_dir_contents, (
+        f"{file_name} not found in {current_dir_contents}"
+    )
     await _cleanup_file(sandbox_env, file_path)

inspect_ai/util/_store.py CHANGED Viewed

@@ -34,8 +34,8 @@ class Store:
     inheriting from Pydantic `BaseModel`)
     """
-    def __init__(self) -> None:
-        self._data: dict[str, Any] = {}
+    def __init__(self, data: dict[str, Any] | None = None) -> None:
+        self._data = deepcopy(data) if data else {}
     @overload
     def get(self, key: str, default: None = None) -> Any: ...

inspect_ai/util/_subprocess.py CHANGED Viewed

@@ -101,9 +101,9 @@ async def subprocess(
     input = input.encode() if isinstance(input, str) else input
     # function to run command (we may or may not run it w/ concurrency)
-    async def run_command() -> (
-        AsyncGenerator[Union[Process, ExecResult[str], ExecResult[bytes]], None]
-    ):
+    async def run_command() -> AsyncGenerator[
+        Union[Process, ExecResult[str], ExecResult[bytes]], None
+    ]:
         if isinstance(args, str):
             proc = await asyncio.create_subprocess_shell(
                 args,

{inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: inspect_ai
-Version: 0.3.56
+Version: 0.3.58
 Summary: Framework for large language model evaluations
 Author: UK AI Safety Institute
 License: MIT License
@@ -67,7 +67,7 @@ Requires-Dist: pytest-asyncio; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
 Requires-Dist: pytest-dotenv; extra == "dev"
 Requires-Dist: pytest-xdist; extra == "dev"
-Requires-Dist: ruff==0.8.4; extra == "dev"
+Requires-Dist: ruff==0.9.1; extra == "dev"
 Requires-Dist: textual-dev>=0.86.2; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
 Requires-Dist: types-beautifulsoup4; extra == "dev"

inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl

inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl