PyPI - inspect-ai - Versions diffs - 0.3.73__py3-none-any.whl → 0.3.75__py3-none-any.whl - Mend

inspect-ai 0.3.73__py3-none-any.whl → 0.3.75__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

inspect_ai/__init__.py +3 -2
inspect_ai/_cli/cache.py +1 -1
inspect_ai/_cli/common.py +15 -0
inspect_ai/_cli/eval.py +4 -5
inspect_ai/_cli/log.py +1 -1
inspect_ai/_cli/sandbox.py +1 -1
inspect_ai/_cli/trace.py +1 -1
inspect_ai/_cli/view.py +1 -1
inspect_ai/_display/core/config.py +3 -1
inspect_ai/_eval/eval.py +55 -61
inspect_ai/_eval/evalset.py +63 -154
inspect_ai/_eval/loader.py +27 -54
inspect_ai/_eval/registry.py +1 -10
inspect_ai/_eval/run.py +3 -4
inspect_ai/_eval/task/__init__.py +8 -2
inspect_ai/_eval/task/log.py +9 -1
inspect_ai/_eval/task/resolved.py +35 -0
inspect_ai/_eval/task/task.py +50 -69
inspect_ai/_eval/task/tasks.py +30 -0
inspect_ai/_util/constants.py +3 -0
inspect_ai/_util/dotenv.py +17 -0
inspect_ai/_util/registry.py +43 -2
inspect_ai/_view/server.py +28 -10
inspect_ai/_view/www/dist/assets/index.css +4 -3
inspect_ai/_view/www/dist/assets/index.js +13030 -25523
inspect_ai/_view/www/package.json +2 -2
inspect_ai/_view/www/src/appearance/styles.ts +6 -5
inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
inspect_ai/_view/www/src/constants.ts +3 -0
inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
inspect_ai/_view/www/yarn.lock +12 -5
inspect_ai/log/_log.py +10 -1
inspect_ai/log/_recorders/eval.py +27 -8
inspect_ai/log/_recorders/json.py +2 -2
inspect_ai/model/_cache.py +3 -1
inspect_ai/model/_chat_message.py +12 -1
inspect_ai/model/_model.py +25 -11
inspect_ai/model/_providers/anthropic.py +34 -2
inspect_ai/model/_providers/google.py +6 -2
inspect_ai/model/_providers/none.py +31 -0
inspect_ai/model/_providers/providers.py +7 -0
inspect_ai/solver/_bridge/bridge.py +1 -1
inspect_ai/solver/_chain.py +7 -6
inspect_ai/tool/_tools/_computer/_computer.py +1 -1
inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
inspect_ai/tool/_tools/_web_search.py +2 -2
inspect_ai/util/_sandbox/context.py +2 -1
inspect_ai/util/_sandbox/environment.py +17 -2
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/METADATA +4 -4
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/RECORD +63 -60
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/WHEEL +1 -1
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/top_level.txt +0 -0

inspect_ai/model/_chat_message.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import Any, Literal, Type, Union
 from pydantic import BaseModel, Field, model_validator
 from shortuuid import uuid
+from inspect_ai._util.constants import DESERIALIZING
 from inspect_ai._util.content import Content, ContentReasoning, ContentText
 from inspect_ai.tool import ToolCall
 from inspect_ai.tool._tool_call import ToolCallError
@@ -16,7 +17,7 @@ logger = getLogger(__name__)
 class ChatMessageBase(BaseModel):
     """Base class for chat messages."""
-    id: str = Field(default_factory=uuid)
+    id: str | None = Field(default=None)
     """Unique identifer for message."""
     content: str | list[Content]
@@ -25,6 +26,16 @@ class ChatMessageBase(BaseModel):
     source: Literal["input", "generate"] | None = Field(default=None)
     """Source of message."""
+    def model_post_init(self, __context: Any) -> None:
+        # check if deserializing
+        is_deserializing = isinstance(__context, dict) and __context.get(
+            DESERIALIZING, False
+        )
+        # Generate ID if needed and not deserializing
+        if self.id is None and not is_deserializing:
+            self.id = uuid()
     @property
     def text(self) -> str:
         """Get the text content of this message.

inspect_ai/model/_model.py CHANGED Viewed

@@ -33,6 +33,7 @@ from inspect_ai._util.content import (
 from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
 from inspect_ai._util.interrupt import check_sample_interrupt
 from inspect_ai._util.logger import warn_once
+from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
 from inspect_ai._util.platform import platform_init
 from inspect_ai._util.registry import (
     RegistryInfo,
@@ -77,7 +78,7 @@ class ModelAPI(abc.ABC):
     by the user. You can then pass these on to the approriate place in
     your model initialisation code (for example, here is what many
     of the built-in providers do with the `model_args` passed to them:
-    https://inspect.ai-safety-institute.org.uk/models.html#model-args)
+    https://inspect.aisi.org.uk/models.html#model-args)
     """
     def __init__(
@@ -232,15 +233,19 @@ class Model:
     config: GenerateConfig
     """Generation config."""
-    def __init__(self, api: ModelAPI, config: GenerateConfig) -> None:
+    def __init__(
+        self, api: ModelAPI, config: GenerateConfig, model_args: dict[str, Any] = {}
+    ) -> None:
         """Create a model.
         Args:
            api: Model API provider.
            config: Model configuration.
+           model_args: Optional model args
         """
         self.api = api
         self.config = config
+        self.model_args = model_args
         # state indicating whether our lifetime is bound by a context manager
         self._context_bound = False
@@ -773,6 +778,10 @@ def get_model(
     if isinstance(model, Model):
         return model
+    # next see if this is the special "none" model
+    if model == "none":
+        model = "none/none"
     # now try finding an 'ambient' model (active or env var)
     if model is None:
         # return active_model if there is one
@@ -835,7 +844,7 @@ def get_model(
             config=config,
             **model_args,
         )
-        m = Model(modelapi_instance, config)
+        m = Model(modelapi_instance, config, model_args)
         if memoize:
             _models[model_cache_key] = m
         return m
@@ -860,17 +869,25 @@ def cached_model(key: str) -> Model | None:
 def resolve_models(
-    model: str | Model | list[str] | list[Model] | None,
+    model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] = dict(),
     config: GenerateConfig = GenerateConfig(),
 ) -> list[Model]:
+    # resolve NotGiven to current INSPECT_EVAL_MODEL
+    if isinstance(model, NotGiven):
+        model = os.getenv("INSPECT_EVAL_MODEL", None)
+    # resolve None to NoModel
+    if model is None:
+        return [get_model("none")]
     # reflect back a plain model
     if isinstance(model, Model):
         return [model]
     # helper to resolve model of various types
-    def resolve_model(m: str | Model | None) -> Model:
+    def resolve_model(m: str | Model) -> Model:
         return get_model(
             model=m,
             base_url=model_base_url,
@@ -878,11 +895,8 @@ def resolve_models(
             **model_args,
         )
-    # resolve None and str to list
-    if model is None or isinstance(model, str):
-        model = model or os.getenv("INSPECT_EVAL_MODEL", None)
-        if model is None:
-            raise ValueError("No model specified (and no INSPECT_EVAL_MODEL defined)")
+    # str to list
+    if isinstance(model, str):
         model = [m.strip() for m in model.split(",")]
     # resolve models
@@ -1236,7 +1250,7 @@ def active_model() -> Model | None:
 # shared contexts for asyncio tasks
-active_model_context_var: ContextVar[Model] = ContextVar("active_model")
+active_model_context_var: ContextVar[Model | None] = ContextVar("active_model")
 def handle_sample_message_limit(input: str | list[ChatMessage]) -> None:

inspect_ai/model/_providers/anthropic.py CHANGED Viewed

@@ -240,7 +240,9 @@ class AnthropicAPI(ModelAPI):
             response = message.model_dump()
             # extract output
-            output = model_output_from_message(message, tools)
+            output = await model_output_from_message(
+                self.client, self.model_name, message, tools
+            )
             # return output and call
             return output, model_call()
@@ -724,9 +726,15 @@ async def message_param(message: ChatMessage) -> MessageParam:
         )
-def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelOutput:
+async def model_output_from_message(
+    client: AsyncAnthropic | AsyncAnthropicBedrock | AsyncAnthropicVertex,
+    model: str,
+    message: Message,
+    tools: list[ToolInfo],
+) -> ModelOutput:
     # extract content and tool calls
     content: list[Content] = []
+    reasoning_tokens = 0
     tool_calls: list[ToolCall] | None = None
     for content_block in message.content:
@@ -754,6 +762,9 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
                 ContentReasoning(reasoning=content_block.data, redacted=True)
             )
         elif isinstance(content_block, ThinkingBlock):
+            reasoning_tokens += await count_tokens(
+                client, model, content_block.thinking
+            )
             content.append(
                 ContentReasoning(
                     reasoning=content_block.thinking, signature=content_block.signature
@@ -787,6 +798,7 @@ def model_output_from_message(message: Message, tools: list[ToolInfo]) -> ModelO
             total_tokens=total_tokens,
             input_tokens_cache_write=input_tokens_cache_write,
             input_tokens_cache_read=input_tokens_cache_read,
+            reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
         ),
     )
@@ -852,6 +864,26 @@ async def message_param_content(
         )
+async def count_tokens(
+    client: AsyncAnthropic | AsyncAnthropicBedrock | AsyncAnthropicVertex,
+    model: str,
+    text: str,
+) -> int:
+    try:
+        response = await client.messages.count_tokens(
+            model=model,
+            messages=[{"role": "user", "content": text}],
+        )
+        return response.input_tokens
+    except Exception as e:
+        logger.warning(
+            f"Error counting tokens (falling back to estimated tokens): {str(e)}"
+        )
+        words = text.split()
+        estimated_tokens = int(len(words) * 1.3)
+        return estimated_tokens
 def model_call_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
     # remove base64 encoded images
     if (

inspect_ai/model/_providers/google.py CHANGED Viewed

@@ -267,8 +267,12 @@ class GoogleGenAIAPI(ModelAPI):
         import requests  # type: ignore
         # standard http errors
-        if isinstance(ex, APIError):
-            return is_retryable_http_status(ex.status)
+        if (
+            isinstance(ex, APIError)
+            and isinstance(ex.status, str)
+            and ex.status.isdigit()
+        ):
+            return is_retryable_http_status(int(ex.status))
         # low-level requests exceptions
         elif isinstance(ex, requests.exceptions.RequestException):

inspect_ai/model/_providers/none.py ADDED Viewed

@@ -0,0 +1,31 @@
+from inspect_ai._util.error import PrerequisiteError
+from inspect_ai.tool import ToolChoice, ToolInfo
+from .._chat_message import ChatMessage
+from .._generate_config import GenerateConfig
+from .._model import ModelAPI
+from .._model_output import ModelOutput
+class NoModel(ModelAPI):
+    """A sentinel model type indicating there is no model specified."""
+    def __init__(
+        self,
+        model_name: str = "none",
+        base_url: str | None = None,
+        api_key: str | None = None,
+        config: GenerateConfig = GenerateConfig(),
+    ) -> None:
+        super().__init__(model_name, base_url, api_key, [], config)
+    async def generate(
+        self,
+        input: list[ChatMessage],
+        tools: list[ToolInfo],
+        tool_choice: ToolChoice,
+        config: GenerateConfig,
+    ) -> ModelOutput:
+        raise PrerequisiteError(
+            "No model specified (and no INSPECT_EVAL_MODEL defined)"
+        )

inspect_ai/model/_providers/providers.py CHANGED Viewed

@@ -250,6 +250,13 @@ def mockllm() -> type[ModelAPI]:
     return MockLLM
+@modelapi(name="none")
+def none() -> type[ModelAPI]:
+    from .none import NoModel
+    return NoModel
 @modelapi("goodfire")
 def goodfire() -> type[ModelAPI]:
     """Get the Goodfire API provider."""

inspect_ai/solver/_bridge/bridge.py CHANGED Viewed

@@ -17,7 +17,7 @@ from .._task_state import TaskState
 def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver:
     """Bridge an external agent into an Inspect Solver.
-    See documentation at <https://inspect.ai-safety-institute.org.uk/agent-bridge.html>
+    See documentation at <https://inspect.aisi.org.uk/agent-bridge.html>
     Args:
       agent: Callable which takes a sample `dict` and returns a result `dict`.

inspect_ai/solver/_chain.py CHANGED Viewed

@@ -2,10 +2,11 @@ from typing import Sequence, overload
 from typing_extensions import override
-from ._solver import Generate, Solver
+from ._solver import Generate, Solver, solver
 from ._task_state import TaskState
+@solver
 def chain(*solvers: Solver | list[Solver]) -> Solver:
     """Compose a solver from multiple other solvers.
@@ -22,8 +23,8 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
     """
     # flatten lists and chains
     all_solvers: list[Solver] = []
-    for solver in solvers:
-        all_solvers.extend(unroll(solver))
+    for s in solvers:
+        all_solvers.extend(unroll(s))
     return Chain(all_solvers)
@@ -72,9 +73,9 @@ class Chain(Sequence[Solver], Solver):
     ) -> TaskState:
         from ._transcript import solver_transcript
-        for solver in self._solvers:
-            with solver_transcript(solver, state) as st:
-                state = await solver(state, generate)
+        for slv in self._solvers:
+            with solver_transcript(slv, state) as st:
+                state = await slv(state, generate)
                 st.complete(state)
             if state.completed:
                 break

inspect_ai/tool/_tools/_computer/_computer.py CHANGED Viewed

@@ -15,7 +15,7 @@ ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]]
 def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool:
     """Desktop computer tool.
-    See documentation at <https://inspect.ai-safety-institute.org.uk/tools.html#sec-computer>.
+    See documentation at <https://inspect.aisi.org.uk/tools.html#sec-computer>.
     Args:
       max_screenshots: The maximum number of screenshots to play

inspect_ai/tool/_tools/_web_browser/_web_browser.py CHANGED Viewed

@@ -17,7 +17,7 @@ from inspect_ai.util._store_model import StoreModel, store_as
 def web_browser(interactive: bool = True) -> list[Tool]:
     """Tools used for web browser navigation.
-     See documentation at <https://inspect.ai-safety-institute.org.uk/tools.html#sec-web-browser>.
+     See documentation at <https://inspect.aisi.org.uk/tools.html#sec-web-browser>.
     Args:
        interactive: Provide interactive tools (enable

inspect_ai/tool/_tools/_web_search.py CHANGED Viewed

@@ -52,7 +52,7 @@ def web_search(
     A web search is conducted using the specified provider, the results are parsed for relevance
     using the specified model, and the top 'num_results' relevant pages are returned.
-    See further documentation at <https://inspect.ai-safety-institute.org.uk/tools.html#sec-web-search>.
+    See further documentation at <https://inspect.aisi.org.uk/tools.html#sec-web-search>.
     Args:
       provider: Search provider (defaults to "google", currently
@@ -190,7 +190,7 @@ def google_search_provider(client: httpx.AsyncClient) -> SearchProvider:
     google_cse_id = os.environ.get("GOOGLE_CSE_ID", None)
     if not google_api_key or not google_cse_id:
         raise PrerequisiteError(
-            "GOOGLE_CSE_ID and/or GOOGLE_CSE_API_KEY not set in the environment. Please ensure these variables are defined to use Google Custom Search with the web_search tool.\n\nLearn more about the Google web search provider at https://inspect.ai-safety-institute.org.uk/tools.html#google-provider"
+            "GOOGLE_CSE_ID and/or GOOGLE_CSE_API_KEY not set in the environment. Please ensure these variables are defined to use Google Custom Search with the web_search tool.\n\nLearn more about the Google web search provider at https://inspect.aisi.org.uk/tools.html#google-provider"
         )
     async def search(query: str, start_idx: int) -> list[SearchLink]:

inspect_ai/util/_sandbox/context.py CHANGED Viewed

@@ -192,7 +192,8 @@ async def copy_sandbox_environment_files(
             target_env = environments.get(envname, None)
             if not target_env:
                 raise RuntimeError(
-                    f"Environment referenced in sample file not found: '{envname}:{file}'"
+                    f"Environment referenced in sample file not found: '{envname}:{file}'. "
+                    + "Note that ':' can be optionally used to specify an explicit environment name for sample files (e.g. 'envname:file') so cannot be used as a character within filenames."
                 )
         else:
             target_env = default_environment

inspect_ai/util/_sandbox/environment.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import abc
+import logging
 from dataclasses import dataclass, field
 from typing import (
     Annotated,
@@ -17,8 +18,12 @@ from typing import (
 from pydantic import BaseModel, Field, model_validator
+from inspect_ai._util.logger import warn_once
 from .._subprocess import ExecResult
+logger = logging.getLogger(__name__)
 ST = TypeVar("ST", bound="SandboxEnvironment")
 TaskInit = Callable[[str, Union["SandboxEnvironmentConfigType", None]], Awaitable[None]]
@@ -381,11 +386,21 @@ def resolve_sandbox_environment(
         return None
-def deserialize_sandbox_specific_config(type: str, config: dict[str, Any]) -> BaseModel:
+def deserialize_sandbox_specific_config(
+    type: str, config: dict[str, Any]
+) -> BaseModel | dict[str, Any]:
     # Avoid circular import
     from inspect_ai.util._sandbox.registry import registry_find_sandboxenv
-    sandboxenv_type = registry_find_sandboxenv(type)
+    try:
+        sandboxenv_type = registry_find_sandboxenv(type)
+    except ValueError:
+        warn_once(
+            logger,
+            f"Could not find sandbox environment plugin for type '{type}'. "
+            "Ensure the plugin is installed in your environment.",
+        )
+        return config
     config_deserialize = cast(
         ConfigDeserialize, getattr(sandboxenv_type, "config_deserialize")
     )

{inspect_ai-0.3.73.dist-info → inspect_ai-0.3.75.dist-info}/METADATA RENAMED Viewed

@@ -1,10 +1,10 @@
 Metadata-Version: 2.2
 Name: inspect_ai
-Version: 0.3.73
+Version: 0.3.75
 Summary: Framework for large language model evaluations
 Author: UK AI Security Institute
 License: MIT License
-Project-URL: Documentation, https://inspect.ai-safety-institute.org.uk/
+Project-URL: Documentation, https://inspect.aisi.org.uk/
 Project-URL: Source Code, https://github.com/UKGovernmentBEIS/inspect_ai
 Project-URL: Issue Tracker, https://github.com/UKGovernmentBEIS/inspect_ai/issues
 Classifier: Development Status :: 4 - Beta
@@ -97,13 +97,13 @@ Provides-Extra: dist
 Requires-Dist: twine; extra == "dist"
 Requires-Dist: build; extra == "dist"
-[<img width="295" src="https://inspect.ai-safety-institute.org.uk/images/aisi-logo.svg" />](https://aisi.gov.uk/)
+[<img width="295" src="https://inspect.aisi.org.uk/images/aisi-logo.svg" />](https://aisi.gov.uk/)
 Welcome to Inspect, a framework for large language model evaluations created by the [UK AI Security Institute](https://aisi.gov.uk/).
 Inspect provides many built-in components, including facilities for prompt engineering, tool usage, multi-turn dialog, and model graded evaluations. Extensions to Inspect (e.g. to support new elicitation and scoring techniques) can be provided by other Python packages.
-To get started with Inspect, please see the documentation at <https://inspect.ai-safety-institute.org.uk/>.
+To get started with Inspect, please see the documentation at <https://inspect.aisi.org.uk/>.
 ***

inspect-ai 0.3.73__py3-none-any.whl → 0.3.75__py3-none-any.whl

inspect-ai 0.3.73__py3-none-any.whl → 0.3.75__py3-none-any.whl