PyPI - inspect-ai - Versions diffs - 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl - Mend

inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131) hide show

inspect_ai/__init__.py +1 -0
inspect_ai/_cli/common.py +1 -1
inspect_ai/_cli/trace.py +33 -20
inspect_ai/_display/core/active.py +1 -1
inspect_ai/_display/core/display.py +1 -1
inspect_ai/_display/core/footer.py +1 -1
inspect_ai/_display/core/panel.py +1 -1
inspect_ai/_display/core/progress.py +0 -6
inspect_ai/_display/core/rich.py +1 -1
inspect_ai/_display/rich/display.py +2 -2
inspect_ai/_display/textual/app.py +15 -17
inspect_ai/_display/textual/widgets/clock.py +3 -3
inspect_ai/_display/textual/widgets/samples.py +6 -13
inspect_ai/_eval/context.py +9 -1
inspect_ai/_eval/run.py +16 -11
inspect_ai/_eval/score.py +4 -10
inspect_ai/_eval/task/results.py +5 -4
inspect_ai/_eval/task/run.py +6 -12
inspect_ai/_eval/task/task.py +10 -0
inspect_ai/_util/ansi.py +31 -0
inspect_ai/_util/datetime.py +1 -1
inspect_ai/_util/deprecation.py +1 -1
inspect_ai/_util/format.py +7 -0
inspect_ai/_util/json.py +11 -1
inspect_ai/_util/logger.py +14 -13
inspect_ai/_util/throttle.py +10 -1
inspect_ai/_util/trace.py +79 -47
inspect_ai/_util/transcript.py +37 -4
inspect_ai/_util/vscode.py +51 -0
inspect_ai/_view/notify.py +2 -1
inspect_ai/_view/www/.prettierrc.js +12 -0
inspect_ai/_view/www/App.css +22 -1
inspect_ai/_view/www/dist/assets/index.css +2374 -2
inspect_ai/_view/www/dist/assets/index.js +29752 -24492
inspect_ai/_view/www/log-schema.json +262 -215
inspect_ai/_view/www/package.json +1 -0
inspect_ai/_view/www/src/App.mjs +19 -9
inspect_ai/_view/www/src/Types.mjs +0 -1
inspect_ai/_view/www/src/api/Types.mjs +15 -4
inspect_ai/_view/www/src/api/api-http.mjs +2 -0
inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
inspect_ai/_view/www/src/components/MessageContent.mjs +1 -1
inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
inspect_ai/_view/www/src/components/Tools.mjs +28 -5
inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +238 -178
inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
inspect_ai/_view/www/src/types/log.d.ts +28 -20
inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
inspect_ai/_view/www/yarn.lock +44 -0
inspect_ai/approval/_apply.py +4 -0
inspect_ai/approval/_human/panel.py +5 -8
inspect_ai/dataset/_dataset.py +51 -10
inspect_ai/dataset/_util.py +31 -3
inspect_ai/log/__init__.py +2 -0
inspect_ai/log/_log.py +30 -2
inspect_ai/log/_recorders/eval.py +2 -0
inspect_ai/model/_call_tools.py +31 -7
inspect_ai/model/_chat_message.py +3 -0
inspect_ai/model/_model.py +42 -1
inspect_ai/model/_providers/anthropic.py +4 -0
inspect_ai/model/_providers/google.py +24 -6
inspect_ai/model/_providers/openai.py +17 -3
inspect_ai/model/_providers/openai_o1.py +10 -12
inspect_ai/model/_render.py +9 -2
inspect_ai/scorer/_metric.py +12 -1
inspect_ai/solver/__init__.py +2 -0
inspect_ai/solver/_human_agent/agent.py +83 -0
inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
inspect_ai/solver/_human_agent/commands/clock.py +70 -0
inspect_ai/solver/_human_agent/commands/command.py +59 -0
inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
inspect_ai/solver/_human_agent/commands/note.py +42 -0
inspect_ai/solver/_human_agent/commands/score.py +80 -0
inspect_ai/solver/_human_agent/commands/status.py +62 -0
inspect_ai/solver/_human_agent/commands/submit.py +151 -0
inspect_ai/solver/_human_agent/install.py +222 -0
inspect_ai/solver/_human_agent/panel.py +252 -0
inspect_ai/solver/_human_agent/service.py +45 -0
inspect_ai/solver/_human_agent/state.py +55 -0
inspect_ai/solver/_human_agent/view.py +24 -0
inspect_ai/solver/_task_state.py +28 -2
inspect_ai/tool/_tool.py +10 -2
inspect_ai/tool/_tool_info.py +2 -1
inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
inspect_ai/tool/_tools/_web_browser/_web_browser.py +16 -13
inspect_ai/util/__init__.py +12 -4
inspect_ai/{_util/display.py → util/_display.py} +6 -0
inspect_ai/util/_panel.py +31 -9
inspect_ai/util/_sandbox/__init__.py +0 -3
inspect_ai/util/_sandbox/context.py +5 -1
inspect_ai/util/_sandbox/docker/compose.py +17 -13
inspect_ai/util/_sandbox/docker/docker.py +9 -6
inspect_ai/util/_sandbox/docker/internal.py +1 -1
inspect_ai/util/_sandbox/docker/util.py +3 -2
inspect_ai/util/_sandbox/environment.py +6 -5
inspect_ai/util/_sandbox/local.py +1 -1
inspect_ai/util/_sandbox/self_check.py +18 -18
inspect_ai/util/_sandbox/service.py +22 -7
inspect_ai/util/_store.py +7 -8
inspect_ai/util/_store_model.py +110 -0
inspect_ai/util/_subprocess.py +3 -3
inspect_ai/util/_throttle.py +32 -0
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/METADATA +3 -3
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/RECORD +131 -108
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/WHEEL +1 -1
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/top_level.txt +0 -0

inspect_ai/log/__init__.py CHANGED Viewed

@@ -23,6 +23,7 @@ from ._log import (
     EvalRevision,
     EvalSample,
     EvalSampleReductions,
+    EvalSampleScore,
     EvalScore,
     EvalSpec,
     EvalStats,
@@ -60,6 +61,7 @@ __all__ = [
     "EvalResults",
     "EvalRevision",
     "EvalSample",
+    "EvalSampleScore",
     "EvalSampleReductions",
     "EvalScore",
     "EvalSpec",

inspect_ai/log/_log.py CHANGED Viewed

@@ -16,6 +16,7 @@ from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, PKG_NAME
 from inspect_ai._util.error import EvalError, exception_message
 from inspect_ai._util.logger import warn_once
 from inspect_ai.approval._policy import ApprovalPolicyConfig
+from inspect_ai.dataset._dataset import MT, metadata_as
 from inspect_ai.model import (
     ChatMessage,
     GenerateConfig,
@@ -23,8 +24,9 @@ from inspect_ai.model import (
     ModelUsage,
 )
 from inspect_ai.scorer import Score
-from inspect_ai.scorer._metric import SampleScore
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
+from inspect_ai.util._store import Store
+from inspect_ai.util._store_model import SMT
 from ._transcript import Event
@@ -159,9 +161,31 @@ class EvalSample(BaseModel):
     metadata: dict[str, Any]
     """Additional sample metadata."""
+    def metadata_as(self, metadata_cls: Type[MT]) -> MT:
+        """Pydantic model interface to metadata.
+        Args:
+          metadata_cls: Pydantic model type
+        Returns:
+          BaseModel: Instance of metadata_cls bound to sample metadata.
+        """
+        return metadata_as(self.metadata, metadata_cls)
     store: dict[str, Any] = Field(default_factory=dict)
     """State at end of sample execution."""
+    def store_as(self, model_cls: Type[SMT]) -> SMT:
+        """Pydantic model interface to the store.
+        Args:
+          model_cls: Pydantic model type (must derive from StoreModel)
+        Returns:
+          StoreModel: Instance of model_cls bound to sample store data.
+        """
+        return model_cls(store=Store(self.store))
     events: list[Event] = Field(default_factory=list)
     """Events that occurred during sample execution."""
@@ -301,6 +325,10 @@ class EvalScore(BaseModel):
     """Additional scorer metadata."""
+class EvalSampleScore(Score):
+    sample_id: str | int | None = Field(default=None)
 class EvalSampleReductions(BaseModel):
     scorer: str
     """Name the of scorer"""
@@ -308,7 +336,7 @@ class EvalSampleReductions(BaseModel):
     reducer: str | None = Field(default=None)
     """Name the of reducer"""
-    samples: list[SampleScore]
+    samples: list[EvalSampleScore]
     """List of reduced scores"""

inspect_ai/log/_recorders/eval.py CHANGED Viewed

@@ -252,6 +252,8 @@ def text_inputs(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
                     filtered_content.append(ContentText(text="(Image)"))
                 message.content = filtered_content
                 input.append(message)
+            else:
+                input.append(message)
         return input
     else:

inspect_ai/model/_call_tools.py CHANGED Viewed

@@ -1,15 +1,20 @@
 import asyncio
 import inspect
+import types
 from dataclasses import is_dataclass
 from logging import getLogger
 from textwrap import dedent
+from types import UnionType
 from typing import (
     Any,
     Callable,
     Dict,
     List,
     NamedTuple,
+    Optional,
+    Tuple,
     Type,
+    Union,
     get_args,
     get_origin,
     get_type_hints,
@@ -25,10 +30,7 @@ from inspect_ai._util.text import truncate_string_to_bytes
 from inspect_ai._util.trace import trace_action
 from inspect_ai.model._trace import trace_tool_mesage
 from inspect_ai.tool import Tool, ToolCall, ToolError, ToolInfo
-from inspect_ai.tool._tool import (
-    ToolApprovalError,
-    ToolParsingError,
-)
+from inspect_ai.tool._tool import ToolApprovalError, ToolParsingError
 from inspect_ai.tool._tool_call import ToolCallContent, ToolCallError
 from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.tool._tool_info import parse_docstring
@@ -118,10 +120,12 @@ async def call_tools(
             # massage result, leave list[Content] alone, convert all other
             # types to string as that is what the model APIs accept
             truncated: tuple[int, int] | None = None
-            if isinstance(result, list) and (
+            if isinstance(result, ContentText | ContentImage):
+                content: str | list[Content] = [result]
+            elif isinstance(result, list) and (
                 isinstance(result[0], ContentText | ContentImage)
             ):
-                content: str | list[Content] = result
+                content = result
             else:
                 content = str(result)
@@ -266,6 +270,16 @@ def disable_parallel_tools(
     return False
+def type_hint_includes_none(type_hint: Type[Any] | None) -> bool:
+    origin = get_origin(type_hint)
+    if origin in {Union, UnionType}:
+        return type(None) in get_args(type_hint)
+    elif origin is Optional:
+        return True
+    return False
 def tool_params(input: dict[str, Any], func: Callable[..., Any]) -> dict[str, Any]:
     # parse function typeinfo
     signature = inspect.signature(func)
@@ -294,7 +308,7 @@ def tool_params(input: dict[str, Any], func: Callable[..., Any]) -> dict[str, An
         # yield parameter (fail if not passed and there is no default)
         if param_name in input:
             params[param_name] = tool_param(type_hint, input.get(param_name))
-        elif param.default is not None:
+        elif param.default is not None or type_hint_includes_none(type_hint):
             params[param_name] = param.default
         else:
             raise ToolParsingError(
@@ -337,11 +351,21 @@ def tool_param(type_hint: Type[Any], input: Any) -> Any:
             return [tool_param(args[0], x) for x in input]
         else:
             return input
+    elif origin is tuple or origin is Tuple:
+        if args:
+            return tuple([tool_param(args[0], x) for x in input])
+        else:
+            return tuple(input)
     elif origin is dict or origin is Dict:
         if args and len(args) > 1:
             return {k: tool_param(args[1], v) for k, v in input}
         else:
             return input
+    elif origin is Union or origin is types.UnionType:
+        if args[1] is type(None):
+            return tool_param(args[0], input)
+        else:
+            return input
     else:
         return input

inspect_ai/model/_chat_message.py CHANGED Viewed

@@ -74,6 +74,9 @@ class ChatMessageUser(ChatMessageBase):
     role: Literal["user"] = Field(default="user")
     """Conversation role."""
+    tool_call_id: str | None = Field(default=None)
+    """ID of tool call this message has the content payload for."""
 class ChatMessageAssistant(ChatMessageBase):
     role: Literal["assistant"] = Field(default="assistant")

inspect_ai/model/_model.py CHANGED Viewed

@@ -19,7 +19,7 @@ from tenacity import (
 )
 from inspect_ai._util.constants import DEFAULT_MAX_CONNECTIONS
-from inspect_ai._util.content import ContentText
+from inspect_ai._util.content import Content, ContentImage, ContentText
 from inspect_ai._util.hooks import init_hooks, override_api_key, send_telemetry
 from inspect_ai._util.platform import platform_init
 from inspect_ai._util.registry import (
@@ -40,6 +40,7 @@ from ._chat_message import (
     ChatMessage,
     ChatMessageAssistant,
     ChatMessageSystem,
+    ChatMessageTool,
     ChatMessageUser,
 )
 from ._generate_config import (
@@ -163,6 +164,10 @@ class ModelAPI(abc.ABC):
         """Any tool use in a message stream means that tools must be passed."""
         return False
+    def tool_result_images(self) -> bool:
+        """Tool results can containe images"""
+        return False
 class Model:
     """Model interface."""
@@ -291,6 +296,11 @@ class Model:
                 tools = []
             tool_choice = "none"
+        # break tool image content out into user messages if the model doesn't
+        # support tools returning images
+        if not self.api.tool_result_images():
+            input = tool_result_images_as_user_message(input)
         # optionally collapse *consecutive* messages into one -
         # (some apis e.g. anthropic require this)
         if self.api.collapse_user_messages():
@@ -693,6 +703,37 @@ def simple_input_messages(
     return messages
+def tool_result_images_as_user_message(
+    messages: list[ChatMessage],
+) -> list[ChatMessage]:
+    return functools.reduce(tool_result_images_reducer, messages, [])
+def tool_result_images_reducer(
+    messages: list[ChatMessage],
+    message: ChatMessage,
+) -> list[ChatMessage]:
+    # append the message
+    messages.append(message)
+    # if there are tool result images, pull them out into a ChatUserMessage
+    if isinstance(message, ChatMessageTool) and isinstance(message.content, list):
+        user_content: list[Content] = []
+        for i in range(0, len(message.content)):
+            if isinstance(message.content[i], ContentImage):
+                user_content.append(message.content[i])
+                message.content[i] = ContentText(
+                    text="Image content is in the message below."
+                )
+        if len(user_content) > 0:
+            messages.append(
+                ChatMessageUser(content=user_content, tool_call_id=message.tool_call_id)
+            )
+    # return messages
+    return messages
 # Functions to reduce consecutive user messages to a single user message -> required for some models
 def collapse_consecutive_user_messages(
     messages: list[ChatMessage],

inspect_ai/model/_providers/anthropic.py CHANGED Viewed

@@ -229,6 +229,10 @@ class AnthropicAPI(ModelAPI):
     def tools_required(self) -> bool:
         return True
+    @override
+    def tool_result_images(self) -> bool:
+        return True
     # convert some common BadRequestError states into 'refusal' model output
     def handle_bad_request(self, ex: BadRequestError) -> ModelOutput | None:
         error = exception_message(ex).lower()

inspect_ai/model/_providers/google.py CHANGED Viewed

@@ -194,7 +194,9 @@ class GoogleAPI(ModelAPI):
                 model=self.model_name, content=ex.message, stop_reason="model_length"
             )
         else:
-            raise ex
+            return ModelOutput.from_content(
+                model=self.model_name, content=ex.message, stop_reason="unknown"
+            )
     @override
     def is_rate_limit(self, ex: BaseException) -> bool:
@@ -408,25 +410,34 @@ def chat_tools(tools: list[ToolInfo]) -> list[Tool]:
 # https://ai.google.dev/gemini-api/tutorials/extract_structured_data#define_the_schema
-def schema_from_param(param: ToolParam | ToolParams) -> Schema:
+def schema_from_param(param: ToolParam | ToolParams, nullable: bool = False) -> Schema:
     if isinstance(param, ToolParams):
         param = ToolParam(
             type=param.type, properties=param.properties, required=param.required
         )
     if param.type == "number":
-        return Schema(type=Type.NUMBER, description=param.description)
+        return Schema(
+            type=Type.NUMBER, description=param.description, nullable=nullable
+        )
     elif param.type == "integer":
-        return Schema(type=Type.INTEGER, description=param.description)
+        return Schema(
+            type=Type.INTEGER, description=param.description, nullable=nullable
+        )
     elif param.type == "boolean":
-        return Schema(type=Type.BOOLEAN, description=param.description)
+        return Schema(
+            type=Type.BOOLEAN, description=param.description, nullable=nullable
+        )
     elif param.type == "string":
-        return Schema(type=Type.STRING, description=param.description)
+        return Schema(
+            type=Type.STRING, description=param.description, nullable=nullable
+        )
     elif param.type == "array":
         return Schema(
             type=Type.ARRAY,
             description=param.description,
             items=schema_from_param(param.items) if param.items else None,
+            nullable=nullable,
         )
     elif param.type == "object":
         return Schema(
@@ -436,7 +447,14 @@ def schema_from_param(param: ToolParam | ToolParams) -> Schema:
             if param.properties is not None
             else None,
             required=param.required,
+            nullable=nullable,
         )
+    # convert unions to optional params if the second type is 'null'
+    elif param.anyOf:
+        if len(param.anyOf) == 2 and param.anyOf[1].type == "null":
+            return schema_from_param(param.anyOf[0], nullable=True)
+        else:
+            return Schema(type=Type.TYPE_UNSPECIFIED)
     else:
         return Schema(type=Type.TYPE_UNSPECIFIED)

inspect_ai/model/_providers/openai.py CHANGED Viewed

@@ -51,6 +51,7 @@ from .._model_output import (
     Logprobs,
     ModelOutput,
     ModelUsage,
+    StopReason,
 )
 from .openai_o1 import generate_o1
 from .util import (
@@ -262,7 +263,10 @@ class OpenAIAPI(ModelAPI):
             model=self.model_name,
         )
         if config.max_tokens is not None:
-            params["max_tokens"] = config.max_tokens
+            if self.is_o1():
+                params["max_completion_tokens"] = config.max_tokens
+            else:
+                params["max_tokens"] = config.max_tokens
         if config.frequency_penalty is not None:
             params["frequency_penalty"] = config.frequency_penalty
         if config.stop_seqs is not None:
@@ -303,13 +307,23 @@ class OpenAIAPI(ModelAPI):
     # convert some well known bad request errors into ModelOutput
     def handle_bad_request(self, e: BadRequestError) -> ModelOutput:
-        if e.status_code == 400 and e.code == "context_length_exceeded":
+        if e.status_code == 400:
+            # extract message
             if isinstance(e.body, dict) and "message" in e.body.keys():
                 content = str(e.body.get("message"))
             else:
                 content = e.message
+            # narrow stop_reason
+            if e.code == "context_length_exceeded":
+                stop_reason: StopReason = "model_length"
+            elif e.code == "invalid_prompt":
+                stop_reason = "content_filter"
+            else:
+                stop_reason = "unknown"
             return ModelOutput.from_content(
-                model=self.model_name, content=content, stop_reason="model_length"
+                model=self.model_name, content=content, stop_reason=stop_reason
             )
         else:
             raise e

inspect_ai/model/_providers/openai_o1.py CHANGED Viewed

@@ -25,7 +25,7 @@ from inspect_ai.model import (
 from inspect_ai.tool import ToolCall, ToolInfo
 from .._model_call import ModelCall
-from .._model_output import ModelUsage
+from .._model_output import ModelUsage, StopReason
 from .._providers.util import (
     ChatAPIHandler,
     ChatAPIMessage,
@@ -48,12 +48,6 @@ async def generate_o1(
     # create chatapi handler
     handler = O1PreviewChatAPIHandler()
-    # map max_tokens => max_completion_tokens
-    max_tokens = params.get("max_tokens", None)
-    if max_tokens:
-        params["max_completion_tokens"] = max_tokens
-        del params["max_tokens"]
     # call model
     request = dict(
         model=model,
@@ -89,12 +83,16 @@ async def generate_o1(
 def handle_bad_request(model: str, ex: BadRequestError) -> ModelOutput:
-    if ex.code == "invalid_prompt":
-        return ModelOutput.from_content(
-            model=model, content=str(ex), stop_reason="content_filter"
-        )
+    if ex.code == "context_length_exceeded":
+        stop_reason: StopReason = "model_length"
+    elif ex.code == "invalid_prompt":
+        stop_reason = "content_filter"
     else:
-        raise ex
+        stop_reason = "unknown"
+    return ModelOutput.from_content(
+        model=model, content=str(ex), stop_reason=stop_reason
+    )
 def chat_messages(

inspect_ai/model/_render.py CHANGED Viewed

@@ -3,13 +3,20 @@ from rich.console import RenderableType
 from inspect_ai.tool._tool_call import ToolCall
 from inspect_ai.tool._tool_transcript import transcript_tool_call
-from ._chat_message import ChatMessage, ChatMessageAssistant, ChatMessageTool
+from ._chat_message import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ChatMessageTool,
+    ChatMessageUser,
+)
 def messages_preceding_assistant(messages: list[ChatMessage]) -> list[ChatMessage]:
     preceding: list[ChatMessage] = []
     for m in reversed(messages):
-        if not isinstance(m, ChatMessageTool | ChatMessageAssistant):
+        if not isinstance(m, ChatMessageTool | ChatMessageAssistant) and not (
+            isinstance(m, ChatMessageUser) and m.tool_call_id
+        ):
             preceding.append(m)
         else:
             break

inspect_ai/scorer/_metric.py CHANGED Viewed

@@ -90,6 +90,13 @@ class Score(BaseModel):
         """Read the score as a boolean."""
         return bool(self._as_scalar())
+    def as_list(self) -> list[str | int | float | bool]:
+        """Read the score as a list."""
+        if isinstance(self.value, list):
+            return self.value
+        else:
+            raise ValueError("This score is not a list")
     def as_dict(self) -> dict[str, str | int | float | bool | None]:
         """Read the score as a dictionary."""
         if isinstance(self.value, dict):
@@ -104,13 +111,17 @@ class Score(BaseModel):
             raise ValueError("This score is not a scalar")
-class SampleScore(Score):
+class SampleScore(BaseModel):
     """Score for a Sample
     Args:
+       score: Score
        sample_id: (str | int | None) Unique id of a sample
     """
+    score: Score
+    """A score"""
     sample_id: str | int | None = Field(default=None)
     """A sample id"""

inspect_ai/solver/__init__.py CHANGED Viewed

@@ -4,6 +4,7 @@ from ._basic_agent import basic_agent
 from ._chain import chain
 from ._critique import self_critique
 from ._fork import fork
+from ._human_agent.agent import human_agent
 from ._multiple_choice import MultipleChoiceTemplate, multiple_choice
 from ._plan import Plan, plan
 from ._prompt import (
@@ -17,6 +18,7 @@ from ._use_tools import use_tools
 __all__ = [
     "basic_agent",
+    "human_agent",
     "chain",
     "fork",
     "generate",

inspect_ai/solver/_human_agent/agent.py ADDED Viewed

@@ -0,0 +1,83 @@
+import asyncio
+from inspect_ai.util import display_type, input_panel, sandbox
+from .._solver import Generate, Solver, solver
+from .._task_state import TaskState
+from .commands import human_agent_commands
+from .install import install_human_agent
+from .panel import HumanAgentPanel
+from .service import run_human_agent_service
+from .view import ConsoleView, HumanAgentView
+@solver
+def human_agent(
+    answer: bool | str = True,
+    intermediate_scoring: bool = False,
+    record_session: bool = True,
+) -> Solver:
+    """Human solver for agentic tasks that run in a Linux environment.
+    The Human agent solver installs agent task tools in the default
+    sandbox and presents the user with both task instructions and
+    documentation for the various tools (e.g. `task submit`,
+    `task start`, `task stop` `task instructions`, etc.). A human agent panel
+    is displayed with instructions for logging in to the sandbox.
+    If the user is running in VS Code with the Inspect extension,
+    they will also be presented with links to login to the sandbox
+    using a VS Code Window or Terminal.
+    Args:
+       answer (bool | str): Is an explicit answer required for this
+          task or is it scored based on files in the container? Pass a
+          `str` with a regex to validate that the answer matches
+          the expected format.
+       intermediate_scoring (bool): Allow the human agent to
+          check their score while working.
+       record_session (bool): Record all user commands and outputs in
+          the sandbox bash session.
+    Returns:
+       Solver: Human agent solver.
+    """
+    # we can only run one human agent interaction at a time (use lock to enforce)
+    agent_lock = asyncio.Lock()
+    async def solve(state: TaskState, generate: Generate) -> TaskState:
+        async with agent_lock:
+            # ensure that we have a sandbox to work with
+            try:
+                connection = await sandbox().connection()
+            except ProcessLookupError:
+                raise RuntimeError("Human agent must run in a task with a sandbox.")
+            except NotImplementedError:
+                raise RuntimeError(
+                    "Human agent must run with a sandbox that supports connections."
+                )
+            # helper function to run the agent (called for fullscreen vs. fallback below)
+            async def run_human_agent(view: HumanAgentView) -> TaskState:
+                # create agent commands
+                commands = human_agent_commands(
+                    state, answer, intermediate_scoring, record_session
+                )
+                # install agent tools
+                await install_human_agent(state, commands, record_session)
+                # hookup the view ui
+                view.connect(connection)
+                # run sandbox service
+                return await run_human_agent_service(state, commands, view)
+            # support both fullscreen ui and fallback
+            if display_type() == "full":
+                async with await input_panel(HumanAgentPanel) as panel:
+                    return await run_human_agent(panel)
+            else:
+                return await run_human_agent(ConsoleView())
+    return solve

inspect_ai/solver/_human_agent/commands/__init__.py ADDED Viewed

@@ -0,0 +1,36 @@
+from inspect_ai.solver._task_state import TaskState
+from .clock import StartCommand, StopCommand
+from .command import HumanAgentCommand
+from .instructions import InstructionsCommand
+from .note import NoteCommand
+from .score import ScoreCommand
+from .status import StatusCommand
+from .submit import SubmitCommand, ValidateCommand
+def human_agent_commands(
+    state: TaskState,
+    answer: bool | str,
+    intermediate_scoring: bool,
+    record_session: bool,
+) -> list[HumanAgentCommand]:
+    # base submit and validate
+    commands = [SubmitCommand(record_session), ValidateCommand(answer)]
+    # optional intermediate scoring
+    if intermediate_scoring:
+        commands.append(ScoreCommand(state))
+    # remaining commands
+    commands.extend(
+        [
+            NoteCommand(),
+            StatusCommand(),
+            StartCommand(),
+            StopCommand(),
+        ]
+    )
+    # with instructions (letting it see the other commands)
+    return commands + [InstructionsCommand(commands)]

inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl

inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl