PyPI - inspect-ai - Versions diffs - 0.3.103__py3-none-any.whl → 0.3.105__py3-none-any.whl - Mend

inspect-ai 0.3.103__py3-none-any.whl → 0.3.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

inspect_ai/_cli/common.py +2 -1
inspect_ai/_cli/eval.py +2 -2
inspect_ai/_display/core/active.py +3 -0
inspect_ai/_display/core/config.py +1 -0
inspect_ai/_display/core/panel.py +21 -13
inspect_ai/_display/core/results.py +3 -7
inspect_ai/_display/core/rich.py +3 -5
inspect_ai/_display/log/__init__.py +0 -0
inspect_ai/_display/log/display.py +173 -0
inspect_ai/_display/plain/display.py +2 -2
inspect_ai/_display/rich/display.py +2 -4
inspect_ai/_display/textual/app.py +1 -6
inspect_ai/_display/textual/widgets/task_detail.py +3 -14
inspect_ai/_display/textual/widgets/tasks.py +1 -1
inspect_ai/_eval/eval.py +1 -1
inspect_ai/_eval/evalset.py +3 -3
inspect_ai/_eval/registry.py +6 -1
inspect_ai/_eval/run.py +5 -1
inspect_ai/_eval/task/constants.py +1 -0
inspect_ai/_eval/task/log.py +2 -0
inspect_ai/_eval/task/run.py +65 -39
inspect_ai/_util/citation.py +88 -0
inspect_ai/_util/content.py +24 -2
inspect_ai/_util/json.py +17 -2
inspect_ai/_util/registry.py +19 -4
inspect_ai/_view/schema.py +0 -6
inspect_ai/_view/server.py +17 -0
inspect_ai/_view/www/dist/assets/index.css +93 -31
inspect_ai/_view/www/dist/assets/index.js +10639 -10011
inspect_ai/_view/www/log-schema.json +418 -1
inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
inspect_ai/_view/www/node_modules/katex/src/fonts/generate_fonts.py +58 -0
inspect_ai/_view/www/node_modules/katex/src/metrics/extract_tfms.py +114 -0
inspect_ai/_view/www/node_modules/katex/src/metrics/extract_ttfs.py +122 -0
inspect_ai/_view/www/node_modules/katex/src/metrics/format_json.py +28 -0
inspect_ai/_view/www/node_modules/katex/src/metrics/parse_tfm.py +211 -0
inspect_ai/_view/www/package.json +2 -2
inspect_ai/_view/www/src/@types/log.d.ts +140 -39
inspect_ai/_view/www/src/app/content/RecordTree.tsx +13 -0
inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -1
inspect_ai/_view/www/src/app/routing/logNavigation.ts +31 -0
inspect_ai/_view/www/src/app/routing/{navigationHooks.ts → sampleNavigation.ts} +39 -86
inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +1 -1
inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +1 -1
inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +4 -0
inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -0
inspect_ai/_view/www/src/app/samples/chat/MessageCitations.module.css +16 -0
inspect_ai/_view/www/src/app/samples/chat/MessageCitations.tsx +63 -0
inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +6 -0
inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +174 -25
inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +21 -3
inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.module.css +7 -0
inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.tsx +111 -0
inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.module.css +10 -0
inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.tsx +14 -0
inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.module.css +19 -0
inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.tsx +49 -0
inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -1
inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +12 -2
inspect_ai/_view/www/src/app/samples/chat/types.ts +4 -0
inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +1 -1
inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +26 -0
inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +14 -3
inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +359 -7
inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/language.ts +6 -0
inspect_ai/_view/www/src/app/samples/sampleLimit.ts +2 -2
inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +4 -4
inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +1 -1
inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +1 -1
inspect_ai/_view/www/src/client/api/api-browser.ts +25 -0
inspect_ai/_view/www/src/client/api/api-http.ts +3 -0
inspect_ai/_view/www/src/client/api/api-vscode.ts +6 -0
inspect_ai/_view/www/src/client/api/client-api.ts +3 -0
inspect_ai/_view/www/src/client/api/jsonrpc.ts +1 -0
inspect_ai/_view/www/src/client/api/types.ts +3 -0
inspect_ai/_view/www/src/components/MarkdownDiv.tsx +15 -2
inspect_ai/_view/www/src/state/samplePolling.ts +17 -1
inspect_ai/_view/www/src/tests/README.md +2 -2
inspect_ai/_view/www/src/utils/git.ts +3 -1
inspect_ai/_view/www/src/utils/html.ts +6 -0
inspect_ai/agent/_handoff.py +8 -5
inspect_ai/agent/_react.py +5 -5
inspect_ai/dataset/_dataset.py +1 -1
inspect_ai/log/_condense.py +5 -0
inspect_ai/log/_file.py +4 -1
inspect_ai/log/_log.py +9 -4
inspect_ai/log/_recorders/json.py +4 -2
inspect_ai/log/_samples.py +5 -0
inspect_ai/log/_util.py +2 -0
inspect_ai/model/__init__.py +14 -0
inspect_ai/model/_call_tools.py +17 -8
inspect_ai/model/_chat_message.py +3 -0
inspect_ai/model/_openai_responses.py +80 -34
inspect_ai/model/_providers/_anthropic_citations.py +158 -0
inspect_ai/model/_providers/_google_citations.py +100 -0
inspect_ai/model/_providers/anthropic.py +219 -36
inspect_ai/model/_providers/google.py +98 -22
inspect_ai/model/_providers/mistral.py +20 -7
inspect_ai/model/_providers/openai.py +11 -10
inspect_ai/model/_providers/openai_compatible.py +3 -2
inspect_ai/model/_providers/openai_responses.py +2 -5
inspect_ai/model/_providers/perplexity.py +123 -0
inspect_ai/model/_providers/providers.py +13 -2
inspect_ai/model/_providers/vertex.py +3 -0
inspect_ai/model/_trim.py +5 -0
inspect_ai/tool/__init__.py +14 -0
inspect_ai/tool/_mcp/_mcp.py +5 -2
inspect_ai/tool/_mcp/sampling.py +19 -3
inspect_ai/tool/_mcp/server.py +1 -1
inspect_ai/tool/_tool.py +10 -1
inspect_ai/tool/_tools/_web_search/_base_http_provider.py +104 -0
inspect_ai/tool/_tools/_web_search/_exa.py +78 -0
inspect_ai/tool/_tools/_web_search/_google.py +22 -25
inspect_ai/tool/_tools/_web_search/_tavily.py +47 -65
inspect_ai/tool/_tools/_web_search/_web_search.py +83 -36
inspect_ai/tool/_tools/_web_search/_web_search_provider.py +7 -0
inspect_ai/util/__init__.py +8 -0
inspect_ai/util/_background.py +64 -0
inspect_ai/util/_display.py +11 -2
inspect_ai/util/_limit.py +72 -5
inspect_ai/util/_sandbox/__init__.py +2 -0
inspect_ai/util/_sandbox/docker/compose.py +2 -2
inspect_ai/util/_sandbox/service.py +28 -7
inspect_ai/util/_span.py +12 -1
inspect_ai/util/_subprocess.py +51 -38
{inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/METADATA +2 -2
{inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/RECORD +134 -109
/inspect_ai/model/{_openai_computer_use.py → _providers/_openai_computer_use.py} +0 -0
/inspect_ai/model/{_openai_web_search.py → _providers/_openai_web_search.py} +0 -0
{inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/top_level.txt +0 -0

inspect_ai/util/_limit.py CHANGED Viewed

@@ -4,6 +4,7 @@ import abc
 import logging
 from contextlib import ExitStack, contextmanager
 from contextvars import ContextVar
+from dataclasses import dataclass
 from types import TracebackType
 from typing import TYPE_CHECKING, Generic, Iterator, Literal, TypeVar
@@ -88,12 +89,31 @@ class Limit(abc.ABC):
     ) -> None:
         pass
+    @property
+    @abc.abstractmethod
+    def limit(self) -> float | None:
+        """The value of the limit being applied.
+        Can be None which represents no limit.
+        """
+        pass
     @property
     @abc.abstractmethod
     def usage(self) -> float:
         """The current usage of the resource being limited."""
         pass
+    @property
+    def remaining(self) -> float | None:
+        """The remaining "unused" amount of the resource being limited.
+        Returns None if the limit is None.
+        """
+        if self.limit is None:
+            return None
+        return self.limit - self.usage
     def _check_reuse(self) -> None:
         if self._entered:
             raise RuntimeError(
@@ -152,6 +172,46 @@ class LimitScope:
         self.limit_error: LimitExceededError | None = None
+@dataclass
+class SampleLimits:
+    """Data class to hold the limits applied to a Sample.
+    This is used to return the limits from `sample_limits()`.
+    """
+    token: Limit
+    """Token limit."""
+    message: Limit
+    """Message limit."""
+    working: Limit
+    """Working limit."""
+    time: Limit
+    """Time limit."""
+def sample_limits() -> SampleLimits:
+    """Get the top-level limits applied to the current `Sample`."""
+    def get_root_node(node: TNode | None, name: str) -> TNode:
+        if node is None:
+            raise RuntimeError(
+                f"No {name} limit node found. Is there a running sample?"
+            )
+        while node.parent is not None:
+            node = node.parent
+        return node
+    return SampleLimits(
+        token=get_root_node(token_limit_tree.get(), "token"),
+        message=get_root_node(message_limit_tree.get(), "message"),
+        working=get_root_node(working_limit_tree.get(), "working"),
+        time=get_root_node(time_limit_tree.get(), "time"),
+    )
 def token_limit(limit: int | None) -> _TokenLimit:
     """Limits the total number of tokens which can be used.
@@ -319,10 +379,9 @@ class _Tree(Generic[TNode]):
 token_limit_tree: _Tree[_TokenLimit] = _Tree("token_limit_tree")
-# Store the message limit leaf node so that we know which limit to check in
-# check_message_limit().
 message_limit_tree: _Tree[_MessageLimit] = _Tree("message_limit_tree")
 working_limit_tree: _Tree[_WorkingLimit] = _Tree("working_limit_tree")
+time_limit_tree: _Tree[_TimeLimit] = _Tree("time_limit_tree")
 class _Node:
@@ -497,7 +556,7 @@ class _MessageLimit(Limit, _Node):
             )
-class _TimeLimit(Limit):
+class _TimeLimit(Limit, _Node):
     def __init__(self, limit: float | None) -> None:
         super().__init__()
         _validate_time_limit("Time", limit)
@@ -507,8 +566,7 @@ class _TimeLimit(Limit):
     def __enter__(self) -> Limit:
         super()._check_reuse()
-        # Unlike the other limits, this one is not stored in a tree. Anyio handles all
-        # of the state.
+        time_limit_tree.push(self)
         self._cancel_scope = anyio.move_on_after(self._limit)
         self._cancel_scope.__enter__()
         self._start_time = anyio.current_time()
@@ -524,6 +582,7 @@ class _TimeLimit(Limit):
         self._cancel_scope.__exit__(exc_type, exc_val, exc_tb)
         self._end_time = anyio.current_time()
+        self._pop_and_check_identity(time_limit_tree)
         if self._cancel_scope.cancel_called and self._limit is not None:
             message = f"Time limit exceeded. limit: {self._limit} seconds"
             assert self._start_time is not None
@@ -541,6 +600,10 @@ class _TimeLimit(Limit):
                 source=self,
             ) from exc_val
+    @property
+    def limit(self) -> float | None:
+        return self._limit
     @property
     def usage(self) -> float:
         if self._start_time is None:
@@ -575,6 +638,10 @@ class _WorkingLimit(Limit, _Node):
         self._end_time = anyio.current_time()
         self._pop_and_check_identity(working_limit_tree)
+    @property
+    def limit(self) -> float | None:
+        return self._limit
     @property
     def usage(self) -> float:
         if self._start_time is None:

inspect_ai/util/_sandbox/__init__.py CHANGED Viewed

@@ -13,6 +13,7 @@ from .environment import (
 from .limits import OutputLimitExceededError, SandboxEnvironmentLimits
 from .local import LocalSandboxEnvironment  # noqa: F401
 from .registry import sandboxenv
+from .service import sandbox_service
 __all__ = [
     "OutputLimitExceededError",
@@ -27,4 +28,5 @@ __all__ = [
     "sandbox",
     "sandbox_with",
     "sandbox_default",
+    "sandbox_service",
 ]

inspect_ai/util/_sandbox/docker/compose.py CHANGED Viewed

@@ -11,7 +11,7 @@ from pydantic import BaseModel
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.trace import trace_message
 from inspect_ai.util._concurrency import concurrency
-from inspect_ai.util._display import display_type
+from inspect_ai.util._display import display_type, display_type_plain
 from inspect_ai.util._subprocess import ExecResult, subprocess
 from .prereqs import (
@@ -285,7 +285,7 @@ async def compose_command(
     env = project.env if (project.env and forward_env) else {}
     # ansi (apply global override)
-    if display_type() == "plain":
+    if display_type_plain():
         ansi = "never"
     if ansi:
         compose_command = compose_command + ["--ansi", ansi]

inspect_ai/util/_sandbox/service.py CHANGED Viewed

@@ -44,14 +44,35 @@ async def sandbox_service(
 ) -> None:
     """Run a service that is callable from within a sandbox.
+    The service makes available a set of methods to a sandbox
+    for calling back into the main Inspect process.
+    To use the service from within a sandbox, either add it to the sys path
+    or use importlib. For example, if the service is named 'foo':
+    ```python
+    import sys
+    sys.path.append("/var/tmp/sandbox-services/foo")
+    import foo
+    ```
+    Or:
+    ```python
+    import importlib.util
+    spec = importlib.util.spec_from_file_location(
+        "foo", "/var/tmp/sandbox-services/foo/foo.py"
+    )
+    foo = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(foo)
+    ```
     Args:
-        name (str): Service name
-        methods (dict[str, SandboxServiceMethod]): Service methods.
-        until (Callable[[], bool]): Function used to check whether
-          the service should stop.
-        sandbox (SandboxEnvironment): Sandbox to publish service to.
-        user (str | None): User to login as. Defaults to the sandbox environment's
-          default user.
+        name: Service name
+        methods: Service methods.
+        until: Function used to check whether the service should stop.
+        sandbox: Sandbox to publish service to.
+        user: User to login as. Defaults to the sandbox environment's default user.
     """
     # setup and start service
     service = SandboxService(name, sandbox, user)

inspect_ai/util/_span.py CHANGED Viewed

@@ -1,8 +1,12 @@
 import contextlib
+import inspect
 from contextvars import ContextVar
+from logging import getLogger
 from typing import AsyncIterator
 from uuid import uuid4
+logger = getLogger(__name__)
 @contextlib.asynccontextmanager
 async def span(name: str, *, type: str | None = None) -> AsyncIterator[None]:
@@ -22,6 +26,10 @@ async def span(name: str, *, type: str | None = None) -> AsyncIterator[None]:
     # span id
     id = uuid4().hex
+    # span caller context
+    frame = inspect.stack()[1]
+    caller = f"{frame.function}() [{frame.filename}:{frame.lineno}]"
     # capture parent id
     parent_id = _current_span_id.get()
@@ -48,7 +56,10 @@ async def span(name: str, *, type: str | None = None) -> AsyncIterator[None]:
         # send end event
         transcript()._event(SpanEndEvent(id=id))
-        _current_span_id.reset(token)
+        try:
+            _current_span_id.reset(token)
+        except ValueError:
+            logger.warning(f"Exiting span created in another context: {caller}")
 def current_span_id() -> str | None:

inspect_ai/util/_subprocess.py CHANGED Viewed

@@ -2,16 +2,15 @@ import functools
 import io
 import os
 import shlex
-from contextlib import aclosing
 from contextvars import ContextVar
 from dataclasses import dataclass
 from logging import getLogger
 from pathlib import Path
 from subprocess import DEVNULL, PIPE
-from typing import AsyncGenerator, Generic, Literal, TypeVar, Union, cast, overload
+from typing import Generic, Literal, TypeVar, Union, overload
 import anyio
-from anyio import open_process
+from anyio import ClosedResourceError, create_task_group, open_process
 from anyio.abc import ByteReceiveStream, Process
 from inspect_ai._util._async import tg_collect
@@ -114,9 +113,7 @@ async def subprocess(
         else None
     )
-    async def run_command() -> AsyncGenerator[
-        Union[Process, ExecResult[str], ExecResult[bytes]], None
-    ]:
+    async def run_command() -> Union[ExecResult[str], ExecResult[bytes]]:
         process = await open_process(
             args,
             stdin=PIPE if input else DEVNULL,
@@ -126,9 +123,6 @@ async def subprocess(
             env={**os.environ, **env},
         )
         try:
-            # yield the process so the caller has a handle to it
-            yield process
             # write to stdin (convert input to bytes)
             if process.stdin and input:
                 await process.stdin.send(input)
@@ -161,19 +155,23 @@ async def subprocess(
             returncode = await process.wait()
             success = returncode == 0
             if text:
-                yield ExecResult[str](
+                return ExecResult[str](
                     success=success,
                     returncode=returncode,
                     stdout=stdout.decode() if capture_output else "",
                     stderr=stderr.decode() if capture_output else "",
                 )
             else:
-                yield ExecResult[bytes](
+                return ExecResult[bytes](
                     success=success,
                     returncode=returncode,
                     stdout=stdout if capture_output else bytes(),
                     stderr=stderr if capture_output else bytes(),
                 )
+        # Handle cancellation before aclose() is called to avoid deadlock.
+        except anyio.get_cancelled_exc_class():
+            await gracefully_terminate_cancelled_subprocess(process)
+            raise
         finally:
             try:
                 await process.aclose()
@@ -186,33 +184,13 @@ async def subprocess(
     # wrapper for run command that implements timeout
     async def run_command_timeout() -> Union[ExecResult[str], ExecResult[bytes]]:
-        # run the command and capture the process handle
-        async with aclosing(run_command()) as rc:
-            proc = cast(Process, await anext(rc))
-            # await result wrapped in timeout handler if requested
-            if timeout is not None:
-                try:
-                    with anyio.fail_after(timeout):
-                        result = await anext(rc)
-                        return cast(Union[ExecResult[str], ExecResult[bytes]], result)
-                except TimeoutError:
-                    # terminate timed out process -- try for graceful termination
-                    # then be more forceful if requied
-                    with anyio.CancelScope(shield=True):
-                        try:
-                            proc.terminate()
-                            await anyio.sleep(2)
-                            if proc.returncode is None:
-                                proc.kill()
-                        except Exception:
-                            pass
-                    raise
-            # await result without timeout
-            else:
-                result = await anext(rc)
-                return cast(Union[ExecResult[str], ExecResult[bytes]], result)
+        # wrap in timeout handler if requested
+        if timeout is not None:
+            with anyio.fail_after(timeout):
+                # run_command() handles terminating the process if it is cancelled.
+                return await run_command()
+        else:
+            return await run_command()
     # run command
     async with concurrency("subprocesses", max_subprocesses_context_var.get()):
@@ -233,6 +211,41 @@ def default_max_subprocesses() -> int:
     return cpus if cpus else 1
+async def gracefully_terminate_cancelled_subprocess(process: Process) -> None:
+    with anyio.CancelScope(shield=True):
+        try:
+            # Terminate timed out process -- try for graceful termination then kill if
+            # required.
+            process.terminate()
+            await anyio.sleep(2)
+            if process.returncode is None:
+                process.kill()
+            # With anyio's asyncio backend, process.aclose() calls process.wait() which
+            # can deadlock if the process generates so much output that it blocks
+            # waiting for the OS pipe buffer to accept more data. See
+            # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.subprocess.Process.wait
+            # Therefore, we need to ensure that the process's stdout and stderr streams
+            # are drained before we call process.wait() in aclose().
+            async with create_task_group() as tg:
+                tg.start_soon(drain_stream, process.stdout)
+                tg.start_soon(drain_stream, process.stderr)
+            # Wait for the process to exit. Will be called again by aclose().
+            await process.wait()
+        # The process may have already exited, in which case we can ignore the error.
+        except ProcessLookupError:
+            pass
+async def drain_stream(stream: ByteReceiveStream | None) -> None:
+    if stream is None:
+        return
+    try:
+        async for _ in stream:
+            pass
+    except ClosedResourceError:
+        pass
 max_subprocesses_context_var = ContextVar[int](
     "max_subprocesses", default=default_max_subprocesses()
 )

{inspect_ai-0.3.103.dist-info → inspect_ai-0.3.105.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inspect_ai
-Version: 0.3.103
+Version: 0.3.105
 Summary: Framework for large language model evaluations
 Author: UK AI Security Institute
 License: MIT License
@@ -63,7 +63,7 @@ Requires-Dist: groq; extra == "dev"
 Requires-Dist: ipython; extra == "dev"
 Requires-Dist: jsonpath-ng; extra == "dev"
 Requires-Dist: markdown; extra == "dev"
-Requires-Dist: mcp; extra == "dev"
+Requires-Dist: mcp>=1.9.4; extra == "dev"
 Requires-Dist: mistralai; extra == "dev"
 Requires-Dist: moto[server]; extra == "dev"
 Requires-Dist: mypy>=1.16.0; extra == "dev"

inspect-ai 0.3.103__py3-none-any.whl → 0.3.105__py3-none-any.whl

inspect-ai 0.3.103__py3-none-any.whl → 0.3.105__py3-none-any.whl