inspect-ai 0.3.104__py3-none-any.whl → 0.3.105__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_eval/evalset.py +1 -1
- inspect_ai/_eval/task/run.py +64 -38
- inspect_ai/_view/server.py +17 -0
- inspect_ai/_view/www/dist/assets/index.css +33 -29
- inspect_ai/_view/www/dist/assets/index.js +559 -247
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.module.css +4 -0
- inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +17 -0
- inspect_ai/_view/www/src/app/samples/sample-tools/filters.ts +26 -0
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/SampleFilter.tsx +14 -3
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/completions.ts +359 -7
- inspect_ai/_view/www/src/app/samples/sample-tools/sample-filter/language.ts +6 -0
- inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +1 -1
- inspect_ai/_view/www/src/client/api/api-browser.ts +25 -0
- inspect_ai/_view/www/src/client/api/api-http.ts +3 -0
- inspect_ai/_view/www/src/client/api/api-vscode.ts +6 -0
- inspect_ai/_view/www/src/client/api/client-api.ts +3 -0
- inspect_ai/_view/www/src/client/api/jsonrpc.ts +1 -0
- inspect_ai/_view/www/src/client/api/types.ts +3 -0
- inspect_ai/_view/www/src/state/samplePolling.ts +17 -1
- inspect_ai/agent/_handoff.py +5 -2
- inspect_ai/agent/_react.py +5 -5
- inspect_ai/dataset/_dataset.py +1 -1
- inspect_ai/log/_samples.py +5 -0
- inspect_ai/model/_call_tools.py +4 -4
- inspect_ai/model/_providers/anthropic.py +23 -2
- inspect_ai/model/_providers/google.py +5 -1
- inspect_ai/util/__init__.py +8 -0
- inspect_ai/util/_background.py +64 -0
- inspect_ai/util/_limit.py +72 -5
- inspect_ai/util/_sandbox/__init__.py +2 -0
- inspect_ai/util/_sandbox/service.py +28 -7
- inspect_ai/util/_subprocess.py +51 -38
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/RECORD +38 -37
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.104.dist-info → inspect_ai-0.3.105.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/src/client/api/client-api.ts
CHANGED
@@ -335,6 +335,9 @@ export const clientApi = (api: LogViewAPI, log_file?: string): ClientAPI => {
     ) => {
       return api.download_file(download_file, file_contents);
     },
+    log_message: (log_file: string, message: string) => {
+      return api.log_message(log_file, message);
+    },
     get_log_pending_samples: api.eval_pending_samples
       ? get_log_pending_samples
       : undefined,
inspect_ai/_view/www/src/client/api/jsonrpc.ts
CHANGED
@@ -41,6 +41,7 @@ export const kMethodEvalLogBytes = "eval_log_bytes";
 export const kMethodEvalLogHeaders = "eval_log_headers";
 export const kMethodPendingSamples = "eval_log_pending_samples";
 export const kMethodSampleData = "eval_log_sample_data";
+export const kMethodLogMessage = "log_message";

 export const kJsonRpcParseError = -32700;
 export const kJsonRpcInvalidRequest = -32600;
inspect_ai/_view/www/src/client/api/types.ts
CHANGED
@@ -115,6 +115,7 @@ export interface SampleSummary {
   scores: Scores1;
   error?: string;
   limit?: string;
+  metadata?: Record<string, any>;
   completed?: boolean;
   retries?: number;
 }
@@ -149,6 +150,7 @@ export interface LogViewAPI {
     end: number,
   ) => Promise<Uint8Array>;
   eval_log_headers: (log_files: string[]) => Promise<EvalLog[]>;
+  log_message: (log_file: string, message: string) => Promise<void>;
   download_file: (
     filename: string,
     filecontents: string | Blob | ArrayBuffer | ArrayBufferView,
@@ -177,6 +179,7 @@ export interface ClientAPI {
     id: string | number,
     epoch: number,
   ) => Promise<EvalSample | undefined>;
+  log_message?: (log_file: string, message: string) => Promise<void>;
   download_file: (
     file_name: string,
     file_contents: string | Blob | ArrayBuffer | ArrayBufferView,
inspect_ai/_view/www/src/state/samplePolling.ts
CHANGED
@@ -1,6 +1,7 @@
 import { Event } from "../app/types";
 import {
   AttachmentData,
+  ClientAPI,
   EventData,
   SampleData,
   SampleSummary,
@@ -183,6 +184,8 @@ export function createSamplePolling(
       const processedEvents = processEvents(
         sampleDataResponse.sampleData,
         pollingState,
+        api,
+        logFile,
       );

       // update max attachment id
@@ -268,7 +271,12 @@ function processAttachments(
   });
 }

-function processEvents(sampleData: SampleData, pollingState: PollingState) {
+function processEvents(
+  sampleData: SampleData,
+  pollingState: PollingState,
+  api: ClientAPI,
+  log_file: string,
+) {
   // Go through each event and resolve it, either appending or replacing
   log.debug(`Processing ${sampleData.events.length} events`);
   if (sampleData.events.length === 0) {
@@ -289,6 +297,14 @@ function processEvents(sampleData: SampleData, pollingState: PollingState) {
         attachmentId,
         available_attachments: Object.keys(pollingState.attachments),
       };
+
+      if (api.log_message) {
+        api.log_message(
+          log_file,
+          `Unable to resolve attachment ${attachmentId}\n` +
+            JSON.stringify(snapshot),
+        );
+      }
       console.warn(`Unable to resolve attachment ${attachmentId}`, snapshot);
     },
   );
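Together, the four files above wire a new `log_message` JSON-RPC method through the viewer: `jsonrpc.ts` names the method, `types.ts` declares it on `LogViewAPI` (required) and `ClientAPI` (optional), `client-api.ts` forwards it, and `samplePolling.ts` uses it to report attachments it cannot resolve. A minimal sketch of what a request for this method might look like on the wire, assuming standard JSON-RPC 2.0 framing (the `params` shape and file path are illustrative assumptions, not taken from the diff):

```python
import json

# Hypothetical JSON-RPC 2.0 request for the new "log_message" method.
request = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "log_message",  # kMethodLogMessage in jsonrpc.ts
    "params": {
        "log_file": "logs/example.eval",  # illustrative path
        "message": "Unable to resolve attachment attachment-123",
    },
}
print(json.dumps(request, indent=2))
```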
inspect_ai/agent/_handoff.py
CHANGED
@@ -6,7 +6,7 @@ from inspect_ai._util.registry import (
     registry_unqualified_name,
     set_registry_info,
 )
-from inspect_ai.tool._tool import Tool, ToolResult, ToolSource
+from inspect_ai.tool._tool import TOOL_PARALLEL, Tool, ToolResult, ToolSource
 from inspect_ai.tool._tool_def import ToolDef
 from inspect_ai.tool._tool_description import ToolDescription, set_tool_description
 from inspect_ai.util._limit import Limit
@@ -61,7 +61,10 @@ def handoff(
         agent, tool_info.name, input_filter, output_filter, limits, **agent_kwargs
     )
     tool_name = tool_name or f"transfer_to_{tool_info.name}"
-    set_registry_info(agent_tool, RegistryInfo(type="tool", name=tool_name))
+    set_registry_info(
+        agent_tool,
+        RegistryInfo(type="tool", name=tool_name, metadata={TOOL_PARALLEL: False}),
+    )
     set_tool_description(
         agent_tool,
         ToolDescription(
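The `metadata={TOOL_PARALLEL: False}` registration marks handoff tools as non-parallel, so the model is not offered the chance to call a handoff alongside other tool calls in the same turn. A usage sketch (the sub-agent definition is illustrative; `handoff` and `react` are the package's own APIs):

```python
from inspect_ai.agent import handoff, react

# illustrative sub-agent
researcher = react(
    name="researcher",
    description="Researches questions in depth.",
    prompt="You are a careful research assistant.",
)

# wraps the agent as a "transfer_to_researcher" tool which, after this
# change, is registered with TOOL_PARALLEL: False (no parallel calls)
researcher_tool = handoff(researcher)
```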
inspect_ai/agent/_react.py
CHANGED
@@ -361,13 +361,13 @@ def _prompt_to_system_message(
             and ("{submit}" not in prompt.assistant_prompt)
             and prompt.submit_prompt
         ):
-            assistant_prompt = f"{prompt.assistant_prompt}\n{prompt.submit_prompt}"
+            assistant_prompt = f"{prompt.assistant_prompt}\n{prompt.submit_prompt.format(submit=submit_tool)}"
         else:
-            assistant_prompt = prompt.assistant_prompt
+            assistant_prompt = prompt.assistant_prompt.format(
+                submit=submit_tool or "submit"
+            )
         prompt_lines.append(assistant_prompt)
-        prompt_content = "\n\n".join(prompt_lines).format(
-            submit=submit_tool or "submit"
-        )
+        prompt_content = "\n\n".join(prompt_lines)
         system_message: ChatMessage | None = ChatMessageSystem(content=prompt_content)
     else:
         system_message = None
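The fix moves `{submit}` substitution from the joined `prompt_content` onto the assistant/submit prompts themselves, so the placeholder is reliably replaced with the actual submit tool name. A toy illustration of the string behavior:

```python
assistant_prompt = "When you are finished, call the {submit}() tool."

# with a custom submit tool name
print(assistant_prompt.format(submit="finish"))
# -> When you are finished, call the finish() tool.

# falling back to the default name when no submit tool is configured
submit_tool = None
print(assistant_prompt.format(submit=submit_tool or "submit"))
# -> When you are finished, call the submit() tool.
```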
inspect_ai/dataset/_dataset.py
CHANGED
inspect_ai/log/_samples.py
CHANGED
@@ -3,6 +3,7 @@ from contextvars import ContextVar
 from datetime import datetime
 from typing import AsyncGenerator, Iterator, Literal

+from anyio.abc import TaskGroup
 from shortuuid import uuid

 from inspect_ai.dataset._dataset import Sample
@@ -28,6 +29,7 @@ class ActiveSample:
         fails_on_error: bool,
         transcript: Transcript,
         sandboxes: dict[str, SandboxConnection],
+        tg: TaskGroup,
     ) -> None:
         self.id = uuid()
         self.started: float | None = None
@@ -47,6 +49,7 @@ class ActiveSample:
         self.transcript = transcript
         self.sandboxes = sandboxes
         self._interrupt_action: Literal["score", "error"] | None = None
+        self.tg = tg

     @property
     def running_time(self) -> float:
@@ -86,6 +89,7 @@ async def active_sample(
     working_limit: int | None,
     fails_on_error: bool,
     transcript: Transcript,
+    tg: TaskGroup,
 ) -> AsyncGenerator[ActiveSample, None]:
     # create the sample
     active = ActiveSample(
@@ -101,6 +105,7 @@ async def active_sample(
         sandboxes=await sandbox_connections(),
         fails_on_error=fails_on_error,
         transcript=transcript,
+        tg=tg,
     )

     _active_samples.append(active)
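Each `ActiveSample` now carries the anyio `TaskGroup` it executes under; this is the plumbing that lets the new `background()` function (see `inspect_ai/util/_background.py` below) schedule work that lives exactly as long as the sample. A minimal sketch of the underlying anyio pattern:

```python
import anyio

async def worker(name: str) -> None:
    # runs until it completes or the enclosing task group is cancelled
    await anyio.sleep(1)
    print(f"{name} done")

async def main() -> None:
    async with anyio.create_task_group() as tg:
        # start_soon() is the same method background() calls on sample.tg
        tg.start_soon(worker, "sample-worker")

anyio.run(main)
```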
inspect_ai/model/_call_tools.py
CHANGED
@@ -534,11 +534,11 @@ def prepend_agent_name(
         content = copy(message.content)
         for i in range(0, len(content)):
             if isinstance(content[i], ContentText):
-
-
-
+                text = cast(ContentText, content[i]).text
+                if text:
+                    content[i] = content[i].model_copy(
+                        update=dict(text=f"[{agent_name}] {text}")
                     )
-                )
                 break
     return message.model_copy(update=dict(content=content))
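The rewritten branch only prefixes the agent name when the first text part is non-empty, avoiding a dangling `[agent] ` prefix on empty content. A self-contained sketch of the behavior, using a pydantic stand-in for `ContentText`:

```python
from pydantic import BaseModel

class ContentText(BaseModel):  # stand-in for inspect_ai's ContentText
    text: str

def prepend_agent_name(content: list[ContentText], agent_name: str) -> list[ContentText]:
    content = list(content)
    for i in range(len(content)):
        if isinstance(content[i], ContentText):
            text = content[i].text
            if text:  # the fix: skip empty text parts
                content[i] = content[i].model_copy(
                    update=dict(text=f"[{agent_name}] {text}")
                )
            break
    return content

print(prepend_agent_name([ContentText(text="hello")], "researcher"))
# [ContentText(text='[researcher] hello')]
```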
inspect_ai/model/_providers/anthropic.py
CHANGED
@@ -41,6 +41,7 @@ from anthropic.types import (
 from anthropic.types.beta import (
     BetaToolComputerUse20250124Param,
     BetaToolTextEditor20241022Param,
+    BetaToolTextEditor20250429Param,
 )
 from pydantic import JsonValue
 from typing_extensions import override
@@ -397,6 +398,9 @@ class AnthropicAPI(ModelAPI):
     def is_claude_3_7(self) -> bool:
         return "claude-3-7-" in self.service_model_name()

+    def is_claude_4(self) -> bool:
+        return re.search(r"claude-4-[a-zA-Z]", self.service_model_name()) is not None
+
     @override
     def connection_key(self) -> str:
         return str(self.api_key)
@@ -627,7 +631,17 @@ class AnthropicAPI(ModelAPI):

     def text_editor_tool_param(
         self, tool: ToolInfo
-    ) ->
+    ) -> (
+        ToolTextEditor20250124Param
+        | BetaToolTextEditor20241022Param
+        | BetaToolTextEditor20250429Param
+        | None
+    ):
+        # See: https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/text-editor-tool#before-using-the-text-editor-tool
+        # TODO: It would be great to enhance our `is_claude_xxx` functions to help here.
+        if self.model_name.startswith(("claude-3-5-haiku", "claude-3-opus")):
+            return None
+
         # check for compatible 'text editor' tool
         if tool.name == "text_editor" and (
             sorted(tool.parameters.properties.keys())
@@ -644,7 +658,11 @@ class AnthropicAPI(ModelAPI):
             )
         ):
             return (
-                BetaToolTextEditor20241022Param(
+                BetaToolTextEditor20250429Param(
+                    type="text_editor_20250429", name="str_replace_based_edit_tool"
+                )
+                if self.is_claude_4()
+                else BetaToolTextEditor20241022Param(
                     type="text_editor_20241022", name="str_replace_editor"
                 )
                 if self.is_claude_3_5()
@@ -706,6 +724,7 @@ ToolParamDef = (
     | BetaToolComputerUse20250124Param
     | ToolTextEditor20250124Param
     | BetaToolTextEditor20241022Param
+    | BetaToolTextEditor20250429Param
     | WebSearchTool20250305Param
 )

@@ -716,6 +735,7 @@ def add_cache_control(
     | BetaToolComputerUse20250124Param
     | ToolTextEditor20250124Param
     | BetaToolTextEditor20241022Param
+    | BetaToolTextEditor20250429Param
     | WebSearchTool20250305Param
     | dict[str, Any],
 ) -> None:
@@ -1008,6 +1028,7 @@ def _names_for_tool_call(
     (INTERNAL_COMPUTER_TOOL_NAME, "computer_20250124", "computer"),
     ("str_replace_editor", "text_editor_20241022", "text_editor"),
     ("str_replace_editor", "text_editor_20250124", "text_editor"),
+    ("str_replace_based_edit_tool", "text_editor_20250429", "text_editor"),
     ("bash", "bash_20250124", "bash_session"),
 )
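The net effect is a model-to-text-editor mapping: Claude 4 models get the new `text_editor_20250429` tool (named `str_replace_based_edit_tool`), Claude 3.5 keeps `text_editor_20241022`, other models keep `text_editor_20250124`, and 3.5 Haiku / 3 Opus get no text editor at all. A summary sketch of that decision logic (the Claude 3.5 check here is an assumption; the Claude 4 regex is the one the diff adds):

```python
import re

def is_claude_4(model: str) -> bool:
    # same regex the diff adds to AnthropicAPI.is_claude_4()
    return re.search(r"claude-4-[a-zA-Z]", model) is not None

def text_editor_for(model: str) -> tuple[str, str] | None:
    """Map a model name to (tool type, tool name), or None if unsupported."""
    if model.startswith(("claude-3-5-haiku", "claude-3-opus")):
        return None
    if is_claude_4(model):
        return ("text_editor_20250429", "str_replace_based_edit_tool")
    if "claude-3-5-" in model:  # assumed shape of is_claude_3_5()
        return ("text_editor_20241022", "str_replace_editor")
    return ("text_editor_20250124", "str_replace_editor")

print(text_editor_for("claude-4-sonnet"))  # ('text_editor_20250429', 'str_replace_based_edit_tool')
print(text_editor_for("claude-3-opus"))    # None
```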
inspect_ai/model/_providers/google.py
CHANGED
@@ -991,6 +991,10 @@ def _combine_text_parts(acc: list[Part], part: Part) -> list[Part]:
     """Combine adjacent text parts into a single part."""
     return (
         acc + [part]
-        if part.text is None
+        if part.text is None
+        or part.thought is True
+        or len(acc) == 0
+        or acc[-1].text is None
+        or acc[-1].thought is True
         else acc[:-1] + [Part(text=acc[-1].text + part.text)]
     )
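`_combine_text_parts` is a reduce step that merges adjacent plain-text parts; the fix stops it from merging across thought parts (and from indexing into an empty accumulator). A runnable sketch with a dataclass stand-in for the SDK's `Part`:

```python
from dataclasses import dataclass
from functools import reduce

@dataclass
class Part:  # stand-in for the google-genai Part type
    text: str | None = None
    thought: bool | None = None

def combine_text_parts(acc: list[Part], part: Part) -> list[Part]:
    """Combine adjacent non-thought text parts into a single part."""
    return (
        acc + [part]
        if part.text is None
        or part.thought is True
        or len(acc) == 0
        or acc[-1].text is None
        or acc[-1].thought is True
        else acc[:-1] + [Part(text=acc[-1].text + part.text)]
    )

parts = [Part(text="planning...", thought=True), Part(text="Hello, "), Part(text="world")]
print(reduce(combine_text_parts, parts, []))
# the thought part stays separate; the two plain text parts merge into one
```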
inspect_ai/util/__init__.py
CHANGED
@@ -4,13 +4,16 @@ from inspect_ai.util._limit import (
     Limit,
     LimitExceededError,
     LimitScope,
+    SampleLimits,
     apply_limits,
     message_limit,
+    sample_limits,
     time_limit,
     token_limit,
     working_limit,
 )

+from ._background import background
 from ._collect import collect
 from ._concurrency import concurrency
 from ._console import input_screen
@@ -29,6 +32,7 @@ from ._sandbox import (
     SandboxEnvironmentType,
     sandbox,
     sandbox_default,
+    sandbox_service,
     sandbox_with,
     sandboxenv,
 )
@@ -44,6 +48,8 @@ from ._throttle import throttle

 __all__ = [
     "apply_limits",
+    "sample_limits",
+    "SampleLimits",
     "ExecResult",
     "concurrency",
     "DisplayType",
@@ -73,6 +79,7 @@ __all__ = [
     "sandbox",
     "sandbox_with",
     "sandbox_default",
+    "sandbox_service",
     "Store",
     "store",
     "StoreModel",
@@ -82,6 +89,7 @@ __all__ = [
     "Subtask",
     "subtask",
     "throttle",
+    "background",
     "token_limit",
     "time_limit",
     "working_limit",
inspect_ai/util/_background.py
ADDED
@@ -0,0 +1,64 @@
+import sys
+from logging import getLogger
+from typing import Any, Awaitable, Callable
+
+if sys.version_info >= (3, 11):
+    from typing import TypeVarTuple
+else:
+    from typing_extensions import TypeVarTuple
+
+
+from typing_extensions import Unpack
+
+logger = getLogger(__name__)
+
+
+PosArgsT = TypeVarTuple("PosArgsT")
+
+
+def background(
+    func: Callable[[Unpack[PosArgsT]], Awaitable[Any]],
+    *args: Unpack[PosArgsT],
+) -> None:
+    """Run an async function in the background of the current sample.
+
+    Background functions must be run from an executing sample.
+    The function will run as long as the current sample is running.
+
+    When the sample terminates, an anyio cancelled error will be
+    raised in the background function. To catch this error and
+    cleanup:
+
+    ```python
+    import anyio
+
+    async def run():
+        try:
+            # background code
+        except anyio.get_cancelled_exc_class():
+            ...
+    ```
+
+    Args:
+       func: Async function to run
+       *args: Optional function arguments.
+    """
+    from inspect_ai.log._samples import sample_active
+
+    # get the active sample
+    sample = sample_active()
+    if sample is None:
+        raise RuntimeError(
+            "background() function must be called from a running sample."
+        )
+
+    # handle and log background exceptions
+    async def run() -> None:
+        try:
+            await func(*args)
+        except Exception as ex:
+            logger.error(f"Background worker error: {ex}")
+            raise
+
+    # kick it off
+    sample.tg.start_soon(run)
inspect_ai/util/_limit.py
CHANGED
@@ -4,6 +4,7 @@ import abc
 import logging
 from contextlib import ExitStack, contextmanager
 from contextvars import ContextVar
+from dataclasses import dataclass
 from types import TracebackType
 from typing import TYPE_CHECKING, Generic, Iterator, Literal, TypeVar

@@ -88,12 +89,31 @@ class Limit(abc.ABC):
     ) -> None:
         pass

+    @property
+    @abc.abstractmethod
+    def limit(self) -> float | None:
+        """The value of the limit being applied.
+
+        Can be None which represents no limit.
+        """
+        pass
+
     @property
     @abc.abstractmethod
     def usage(self) -> float:
         """The current usage of the resource being limited."""
         pass

+    @property
+    def remaining(self) -> float | None:
+        """The remaining "unused" amount of the resource being limited.
+
+        Returns None if the limit is None.
+        """
+        if self.limit is None:
+            return None
+        return self.limit - self.usage
+
     def _check_reuse(self) -> None:
         if self._entered:
             raise RuntimeError(
@@ -152,6 +172,46 @@ class LimitScope:
         self.limit_error: LimitExceededError | None = None


+@dataclass
+class SampleLimits:
+    """Data class to hold the limits applied to a Sample.
+
+    This is used to return the limits from `sample_limits()`.
+    """
+
+    token: Limit
+    """Token limit."""
+
+    message: Limit
+    """Message limit."""
+
+    working: Limit
+    """Working limit."""
+
+    time: Limit
+    """Time limit."""
+
+
+def sample_limits() -> SampleLimits:
+    """Get the top-level limits applied to the current `Sample`."""
+
+    def get_root_node(node: TNode | None, name: str) -> TNode:
+        if node is None:
+            raise RuntimeError(
+                f"No {name} limit node found. Is there a running sample?"
+            )
+        while node.parent is not None:
+            node = node.parent
+        return node
+
+    return SampleLimits(
+        token=get_root_node(token_limit_tree.get(), "token"),
+        message=get_root_node(message_limit_tree.get(), "message"),
+        working=get_root_node(working_limit_tree.get(), "working"),
+        time=get_root_node(time_limit_tree.get(), "time"),
+    )
+
+
 def token_limit(limit: int | None) -> _TokenLimit:
     """Limits the total number of tokens which can be used.

@@ -319,10 +379,9 @@ class _Tree(Generic[TNode]):


 token_limit_tree: _Tree[_TokenLimit] = _Tree("token_limit_tree")
-# Store the message limit leaf node so that we know which limit to check in
-# check_message_limit().
 message_limit_tree: _Tree[_MessageLimit] = _Tree("message_limit_tree")
 working_limit_tree: _Tree[_WorkingLimit] = _Tree("working_limit_tree")
+time_limit_tree: _Tree[_TimeLimit] = _Tree("time_limit_tree")


 class _Node:
@@ -497,7 +556,7 @@ class _MessageLimit(Limit, _Node):
     )


-class _TimeLimit(Limit):
+class _TimeLimit(Limit, _Node):
     def __init__(self, limit: float | None) -> None:
         super().__init__()
         _validate_time_limit("Time", limit)
@@ -507,8 +566,7 @@ class _TimeLimit(Limit):

     def __enter__(self) -> Limit:
         super()._check_reuse()
-
-        # of the state.
+        time_limit_tree.push(self)
         self._cancel_scope = anyio.move_on_after(self._limit)
         self._cancel_scope.__enter__()
         self._start_time = anyio.current_time()
@@ -524,6 +582,7 @@ class _TimeLimit(Limit):

         self._cancel_scope.__exit__(exc_type, exc_val, exc_tb)
         self._end_time = anyio.current_time()
+        self._pop_and_check_identity(time_limit_tree)
         if self._cancel_scope.cancel_called and self._limit is not None:
             message = f"Time limit exceeded. limit: {self._limit} seconds"
             assert self._start_time is not None
@@ -541,6 +600,10 @@ class _TimeLimit(Limit):
             source=self,
         ) from exc_val

+    @property
+    def limit(self) -> float | None:
+        return self._limit
+
     @property
     def usage(self) -> float:
         if self._start_time is None:
@@ -575,6 +638,10 @@ class _WorkingLimit(Limit, _Node):
         self._end_time = anyio.current_time()
         self._pop_and_check_identity(working_limit_tree)

+    @property
+    def limit(self) -> float | None:
+        return self._limit
+
     @property
     def usage(self) -> float:
         if self._start_time is None:
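With `limit` now part of the `Limit` interface (and `remaining` derived from it), the new `sample_limits()` accessor exposes a sample's top-level limits for inspection. A sketch, callable from inside a running sample (printed values are illustrative):

```python
from inspect_ai.util import sample_limits

limits = sample_limits()
print(limits.token.limit)      # e.g. 500000, or None if no token limit set
print(limits.token.usage)      # tokens consumed so far
print(limits.token.remaining)  # None when limit is None, else limit - usage
print(limits.time.remaining)   # seconds left under the time limit
```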
inspect_ai/util/_sandbox/__init__.py
CHANGED
@@ -13,6 +13,7 @@ from .environment import (
 from .limits import OutputLimitExceededError, SandboxEnvironmentLimits
 from .local import LocalSandboxEnvironment  # noqa: F401
 from .registry import sandboxenv
+from .service import sandbox_service

 __all__ = [
     "OutputLimitExceededError",
@@ -27,4 +28,5 @@ __all__ = [
     "sandbox",
     "sandbox_with",
     "sandbox_default",
+    "sandbox_service",
 ]
inspect_ai/util/_sandbox/service.py
CHANGED
@@ -44,14 +44,35 @@ async def sandbox_service(
 ) -> None:
     """Run a service that is callable from within a sandbox.

+    The service makes available a set of methods to a sandbox
+    for calling back into the main Inspect process.
+
+    To use the service from within a sandbox, either add it to the sys path
+    or use importlib. For example, if the service is named 'foo':
+
+    ```python
+    import sys
+    sys.path.append("/var/tmp/sandbox-services/foo")
+    import foo
+    ```
+
+    Or:
+
+    ```python
+    import importlib.util
+    spec = importlib.util.spec_from_file_location(
+        "foo", "/var/tmp/sandbox-services/foo/foo.py"
+    )
+    foo = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(foo)
+    ```
+
     Args:
-      name
-      methods
-      until
-
-
-      user (str | None): User to login as. Defaults to the sandbox environment's
-        default user.
+      name: Service name
+      methods: Service methods.
+      until: Function used to check whether the service should stop.
+      sandbox: Sandbox to publish service to.
+      user: User to login as. Defaults to the sandbox environment's default user.
     """
     # setup and start service
     service = SandboxService(name, sandbox, user)