PyPI - inspect-ai - Versions diffs - 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl - Mend

inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118) hide show

inspect_ai/_cli/eval.py +31 -0
inspect_ai/_eval/eval.py +19 -2
inspect_ai/_eval/evalset.py +4 -1
inspect_ai/_eval/run.py +41 -0
inspect_ai/_eval/task/generate.py +38 -44
inspect_ai/_eval/task/log.py +26 -28
inspect_ai/_eval/task/run.py +13 -20
inspect_ai/_util/local_server.py +368 -0
inspect_ai/_util/working.py +10 -4
inspect_ai/_view/www/dist/assets/index.css +159 -146
inspect_ai/_view/www/dist/assets/index.js +1020 -1061
inspect_ai/_view/www/log-schema.json +4 -3
inspect_ai/_view/www/package.json +1 -1
inspect_ai/_view/www/src/@types/log.d.ts +3 -2
inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
inspect_ai/_view/www/src/components/Card.css +0 -1
inspect_ai/_view/www/src/constants.ts +2 -0
inspect_ai/_view/www/src/utils/numeric.ts +17 -0
inspect_ai/agent/_agent.py +3 -3
inspect_ai/agent/_as_solver.py +20 -12
inspect_ai/agent/_as_tool.py +15 -3
inspect_ai/agent/_handoff.py +8 -1
inspect_ai/agent/_run.py +11 -3
inspect_ai/log/__init__.py +4 -0
inspect_ai/log/_file.py +56 -0
inspect_ai/log/_log.py +99 -0
inspect_ai/log/_recorders/__init__.py +2 -0
inspect_ai/log/_recorders/buffer/database.py +12 -11
inspect_ai/log/_recorders/buffer/filestore.py +2 -2
inspect_ai/log/_recorders/buffer/types.py +2 -2
inspect_ai/log/_recorders/eval.py +20 -65
inspect_ai/log/_recorders/file.py +28 -6
inspect_ai/log/_recorders/recorder.py +7 -0
inspect_ai/log/_recorders/types.py +1 -23
inspect_ai/log/_samples.py +0 -8
inspect_ai/log/_transcript.py +7 -1
inspect_ai/log/_util.py +52 -0
inspect_ai/model/__init__.py +5 -1
inspect_ai/model/_call_tools.py +32 -12
inspect_ai/model/_generate_config.py +14 -8
inspect_ai/model/_model.py +21 -48
inspect_ai/model/_model_output.py +25 -0
inspect_ai/model/_openai.py +2 -0
inspect_ai/model/_openai_responses.py +13 -1
inspect_ai/model/_providers/anthropic.py +13 -23
inspect_ai/model/_providers/openai_o1.py +8 -2
inspect_ai/model/_providers/providers.py +18 -4
inspect_ai/model/_providers/sglang.py +241 -0
inspect_ai/model/_providers/vllm.py +207 -400
inspect_ai/solver/__init__.py +7 -2
inspect_ai/solver/_basic_agent.py +3 -10
inspect_ai/solver/_task_state.py +26 -88
inspect_ai/tool/_json_rpc_helpers.py +45 -17
inspect_ai/tool/_mcp/_mcp.py +2 -0
inspect_ai/tool/_mcp/_sandbox.py +8 -2
inspect_ai/tool/_mcp/server.py +3 -1
inspect_ai/tool/_tool_call.py +4 -1
inspect_ai/tool/_tool_support_helpers.py +51 -12
inspect_ai/tool/_tools/_bash_session.py +190 -68
inspect_ai/tool/_tools/_computer/_computer.py +25 -1
inspect_ai/tool/_tools/_text_editor.py +4 -3
inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
inspect_ai/util/__init__.py +12 -0
inspect_ai/util/_limit.py +393 -0
inspect_ai/util/_limited_conversation.py +57 -0
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/METADATA +1 -1
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/RECORD +90 -109
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/WHEEL +1 -1
inspect_ai/solver/_limit.py +0 -39
inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
inspect_ai/tool/_tools/_computer/test_args.py +0 -151
/inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/top_level.txt +0 -0

inspect_ai/util/_limit.py ADDED Viewed

@@ -0,0 +1,393 @@
+from __future__ import annotations
+import abc
+import logging
+from contextlib import ExitStack, contextmanager
+from contextvars import ContextVar
+from types import TracebackType
+from typing import TYPE_CHECKING, Iterator, Literal
+from inspect_ai._util.logger import warn_once
+if TYPE_CHECKING:
+    # These imports are used as type hints only - prevent circular imports.
+    from inspect_ai.model._model_output import ModelUsage
+    from inspect_ai.solver._task_state import TaskState
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# Stores the current execution context's leaf _TokenLimitNode (None when no token
+# limit is open).
+# The resulting data structure is a tree of _TokenLimitNode nodes which each
+# have a pointer to their parent node. Each additional context manager inserts a new
+# child node into the tree. The fact that there can be multiple execution contexts is
+# what makes this a tree rather than a stack.
+token_limit_leaf_node: ContextVar[_TokenLimitNode | None] = ContextVar(
+    "token_limit_leaf_node", default=None
+)
+# Stores the current execution context's leaf _MessageLimitNode (None when no
+# message limit is open). Each _MessageLimit context manager that is entered adds a
+# child node which points at the previous leaf as its parent.
+message_limit_leaf_node: ContextVar[_MessageLimitNode | None] = ContextVar(
+    "message_limit_leaf_node", default=None
+)
+class LimitExceededError(Exception):
+    """Exception raised when a limit is exceeded.
+    In some scenarios this error may be raised when `value >= limit` to
+    prevent another operation which is guaranteed to exceed the limit from being
+    wastefully performed.
+    Args:
+       type: Type of limit exceeded.
+       value: Value compared to.
+       limit: Limit applied.
+       message (str | None): Optional. Human readable message.
+    """
+    def __init__(
+        self,
+        type: Literal["message", "time", "working", "token", "operator", "custom"],
+        *,
+        value: int,
+        limit: int,
+        message: str | None = None,
+    ) -> None:
+        self.type = type
+        self.value = value
+        self.limit = limit
+        self.message = f"Exceeded {type} limit: {limit:,}"
+        super().__init__(message)
+    def with_state(self, state: TaskState) -> LimitExceededError:
+        warn_once(
+            logger,
+            "LimitExceededError.with_state() is deprecated (no longer required).",
+        )
+        return self
+class Limit(abc.ABC):
+    """Base class for all limits.
+    Declares the context manager protocol (`__enter__` / `__exit__`) which every
+    concrete limit must implement, so that limits can be used in `with` statements.
+    """
+    @abc.abstractmethod
+    def __enter__(self) -> Limit:
+        """Start applying the limit. Must be implemented by subclasses."""
+        pass
+    @abc.abstractmethod
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        """Stop applying the limit. Must be implemented by subclasses.
+        Args:
+          exc_type: Type of the exception raised inside the `with` block, if any.
+          exc_val: The exception instance raised inside the `with` block, if any.
+          exc_tb: Traceback of that exception, if any.
+        """
+        pass
+@contextmanager
+def apply_limits(limits: list[Limit]) -> Iterator[None]:
+    """
+    Apply a list of limits within a context manager.
+    Args:
+      limits: List of limits to apply while the context manager is open. Should a
+        limit be exceeded, a LimitExceededError is raised.
+    """
+    with ExitStack() as stack:
+        for limit in limits:
+            stack.enter_context(limit)
+        yield
+def token_limit(limit: int | None) -> _TokenLimit:
+    """Limits the total number of tokens which can be used.
+    The counter starts when the context manager is opened and ends when it is closed.
+    The context manager can be opened multiple times, even in different execution
+    contexts.
+    These limits can be stacked.
+    This relies on "cooperative" checking - consumers must call check_token_limit()
+    themselves whenever tokens are consumed.
+    When a limit is exceeded, a LimitExceededError is raised.
+    Args:
+      limit: The maximum number of tokens that can be used while the context manager is
+        open. Tokens used before the context manager was opened are not counted. A value
+        of None means unlimited tokens.
+    Returns:
+      A context manager which applies the token limit while it is open.
+    Raises:
+      ValueError: If `limit` is negative.
+    """
+    return _TokenLimit(limit)
+def record_model_usage(usage: ModelUsage) -> None:
+    """Record model usage against any active token limits.
+    Does not check if the limit has been exceeded.
+    """
+    node = token_limit_leaf_node.get()
+    if node is None:
+        return
+    node.record(usage)
+def check_token_limit() -> None:
+    """Check if the current token usage exceeds _any_ of the token limits.
+    Within the current execution context (e.g. async task) and its parent contexts only.
+    Note that all active token limits are checked, not just the most recent one.
+    """
+    node = token_limit_leaf_node.get()
+    if node is None:
+        return
+    node.check()
+def message_limit(limit: int | None) -> _MessageLimit:
+    """Limits the number of messages in a conversation.
+    The total number of messages in the conversation are compared to the limit (not just
+    "new" messages). The context manager can be opened multiple times, even in different
+    execution contexts.
+    These limits can be stacked.
+    This relies on "cooperative" checking - consumers must call check_message_limit()
+    themselves whenever the message count is updated.
+    When a limit is exceeded, a LimitExceededError is raised.
+    Args:
+      limit: The maximum conversation length (number of messages) allowed while the
+        context manager is open. A value of None means unlimited messages.
+    Returns:
+      A context manager which applies the message limit while it is open.
+    Raises:
+      ValueError: If `limit` is negative.
+    """
+    return _MessageLimit(limit)
+def check_message_limit(count: int, raise_for_equal: bool) -> None:
+    """Check if the current message count exceeds the active message limit.
+    Only the most recent message limit is checked. Ancestors are not checked.
+    Args:
+      count: The number of messages in the conversation.
+      raise_for_equal: If True, raise an error if the message count is equal to the
+        limit, otherwise, only raise an error if the message count is greater than the
+        limit.
+    """
+    node = message_limit_leaf_node.get()
+    if node is None:
+        return
+    node.check(count, raise_for_equal)
+class _LimitValueWrapper:
+    """Container/wrapper type for the limit value.
+    This facilitates updating the limit value, which may have been passed to many
+    _TokenLimitNode or _MessageLimitNode instances: every node holds a reference to
+    the same wrapper, so assigning to `value` changes the limit seen by all of them.
+    """
+    def __init__(self, value: int | None) -> None:
+        # The current limit; None means that no limit is enforced.
+        self.value = value
+class _TokenLimit(Limit):
+    def __init__(self, limit: int | None) -> None:
+        self._validate_token_limit(limit)
+        self._limit_value_wrapper = _LimitValueWrapper(limit)
+    def __enter__(self) -> Limit:
+        current_node = token_limit_leaf_node.get()
+        new_node = _TokenLimitNode(self._limit_value_wrapper, current_node)
+        # Note that we don't store new_node as an instance variable, because the context
+        # manager may be used across multiple execution contexts, or opened multiple
+        # times.
+        token_limit_leaf_node.set(new_node)
+        return self
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        current_node = token_limit_leaf_node.get()
+        assert current_node is not None, (
+            "Token limit node should not be None when exiting context manager."
+        )
+        token_limit_leaf_node.set(current_node.parent)
+    @property
+    def limit(self) -> int | None:
+        """Get the configured token limit value."""
+        return self._limit_value_wrapper.value
+    @limit.setter
+    def limit(self, value: int | None) -> None:
+        """Update the token limit value.
+        This will affect the limit for all active token limit nodes derived from this
+        context manager.
+        This does not trigger a check of the token limit (which could now have been
+        exceeded).
+        """
+        self._validate_token_limit(value)
+        self._limit_value_wrapper.value = value
+    def _validate_token_limit(self, value: int | None) -> None:
+        if value is not None and value < 0:
+            raise ValueError("Token limit value must be a non-negative integer.")
+class _TokenLimitNode:
+    def __init__(
+        self,
+        limit: _LimitValueWrapper,
+        parent: _TokenLimitNode | None,
+    ) -> None:
+        """
+        Initialize a token limit node.
+        Forms part of a tree structure. Each node has a pointer to its parent, or None
+        if it is the root node.
+        Tracks the token usage for this node and its parent nodes and checks if the
+        usage has exceeded a (variable) limit.
+        Args:
+          limit: The maximum number of tokens that can be used while the context
+            manager is open.
+          parent: The parent node in the tree.
+        """
+        from inspect_ai.model._model_output import ModelUsage
+        self._limit = limit
+        self.parent = parent
+        self._usage = ModelUsage()
+    def record(self, usage: ModelUsage) -> None:
+        """Record model usage for this node and its parent nodes."""
+        if self.parent is not None:
+            self.parent.record(usage)
+        self._usage += usage
+    def check(self) -> None:
+        """Check if this token limit or any parent limits have been exceeded."""
+        self._check_self()
+        if self.parent is not None:
+            self.parent.check()
+    def _check_self(self) -> None:
+        from inspect_ai.log._transcript import SampleLimitEvent, transcript
+        if self._limit.value is None:
+            return
+        total = self._usage.total_tokens
+        if total > self._limit.value:
+            message = (
+                f"Token limit exceeded. value: {total:,}; limit: {self._limit.value:,}"
+            )
+            transcript()._event(
+                SampleLimitEvent(type="token", limit=self._limit.value, message=message)
+            )
+            raise LimitExceededError(
+                "token", value=total, limit=self._limit.value, message=message
+            )
+class _MessageLimit(Limit):
+    def __init__(self, limit: int | None) -> None:
+        self._validate_message_limit(limit)
+        self._limit_value_wrapper = _LimitValueWrapper(limit)
+    def __enter__(self) -> Limit:
+        current_node = message_limit_leaf_node.get()
+        new_node = _MessageLimitNode(self._limit_value_wrapper, current_node)
+        # Note that we don't store new_node as an instance variable, because the context
+        # manager may be used across multiple execution contexts, or opened multiple
+        # times.
+        message_limit_leaf_node.set(new_node)
+        return self
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> None:
+        current_node = message_limit_leaf_node.get()
+        assert current_node is not None, (
+            "Message limit node should not be None when exiting context manager."
+        )
+        message_limit_leaf_node.set(current_node.parent)
+    @property
+    def limit(self) -> int | None:
+        """Get the configured message limit value."""
+        return self._limit_value_wrapper.value
+    @limit.setter
+    def limit(self, value: int | None) -> None:
+        """Update the message limit value.
+        This will affect the limit for all active message limit nodes derived from this
+        context manager.
+        This does not trigger a check of the message limit (which could now have been
+        exceeded).
+        """
+        self._validate_message_limit(value)
+        self._limit_value_wrapper.value = value
+    def _validate_message_limit(self, value: int | None) -> None:
+        if value is not None and value < 0:
+            raise ValueError("Message limit value must be a non-negative integer.")
+class _MessageLimitNode:
+    def __init__(
+        self,
+        limit: _LimitValueWrapper,
+        parent: _MessageLimitNode | None,
+    ) -> None:
+        """
+        Initialize a message limit node.
+        Forms part of a tree structure. Each node has a pointer to its parent, or None
+        if it is the root node.
+        Checks if the message count for this node has exceeded a (variable) limit.
+        Args:
+          limit: The maximum conversation length (number of messages) allowed while this
+            node is the lead node of the current execution context.
+          parent: The parent node in the tree.
+        """
+        self._limit = limit
+        self.parent = parent
+    def check(self, count: int, raise_for_equal: bool) -> None:
+        """Check if this message limit has been exceeded.
+        Does not check parents.
+        """
+        from inspect_ai.log._transcript import SampleLimitEvent, transcript
+        if self._limit.value is None:
+            return
+        limit = self._limit.value
+        if count > limit or (raise_for_equal and count == limit):
+            reached_or_exceeded = "reached" if count == limit else "exceeded"
+            message = (
+                f"Message limit {reached_or_exceeded}. count: {count:,}; "
+                f"limit: {limit:,}"
+            )
+            transcript()._event(
+                SampleLimitEvent(type="message", limit=limit, message=message)
+            )
+            raise LimitExceededError(
+                "message", value=count, limit=limit, message=message
+            )

inspect_ai/util/_limited_conversation.py ADDED Viewed

@@ -0,0 +1,57 @@
+from itertools import tee
+from typing import Iterable, SupportsIndex, overload
+from inspect_ai.model._chat_message import ChatMessage, ChatMessageBase
+from inspect_ai.util._limit import check_message_limit
+class ChatMessageList(list[ChatMessage]):
+    """A limited list of ChatMessage items.
+    Raises an exception if an operation would exceed the active message limit.
+    """
+    def __init__(self, iterable: Iterable[ChatMessage]):
+        items, length = self._iterable_length(iterable)
+        self._check_size(length)
+        super().__init__(items)
+    def _check_size(self, additional_items: int) -> None:
+        check_message_limit(len(self) + additional_items, raise_for_equal=False)
+    def append(self, item: ChatMessage) -> None:
+        self._check_size(1)
+        super().append(item)
+    def extend(self, items: Iterable[ChatMessage]) -> None:
+        items, length = self._iterable_length(items)
+        self._check_size(length)
+        super().extend(items)
+    def insert(self, index: SupportsIndex, item: ChatMessage) -> None:
+        self._check_size(1)
+        super().insert(index, item)
+    @overload
+    def __setitem__(self, index: SupportsIndex, item: ChatMessage) -> None: ...
+    @overload
+    def __setitem__(self, index: slice, item: Iterable[ChatMessage]) -> None: ...
+    def __setitem__(
+        self, index: SupportsIndex | slice, item: ChatMessage | Iterable[ChatMessage]
+    ) -> None:
+        if isinstance(index, slice) and not isinstance(item, ChatMessageBase):
+            item, length = self._iterable_length(item)
+            size_change = length - len(self[index])
+            if size_change > 0:
+                self._check_size(size_change)
+        super().__setitem__(index, item)  # type: ignore[assignment,index]
+    def _iterable_length(
+        self, items: Iterable[ChatMessage]
+    ) -> tuple[Iterable[ChatMessage], int]:
+        items, counter = tee(items)
+        length = sum(1 for _ in counter)
+        return items, length

{inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: inspect_ai
-Version: 0.3.91
+Version: 0.3.93
 Summary: Framework for large language model evaluations
 Author: UK AI Security Institute
 License: MIT License

inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl

inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl