inspect-ai 0.3.51__py3-none-any.whl → 0.3.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +44 -2
- inspect_ai/_display/core/config.py +4 -0
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/core/progress.py +9 -3
- inspect_ai/_display/core/results.py +8 -4
- inspect_ai/_display/textual/widgets/task_detail.py +45 -13
- inspect_ai/_display/textual/widgets/tasks.py +86 -5
- inspect_ai/_display/textual/widgets/transcript.py +4 -17
- inspect_ai/_eval/eval.py +29 -1
- inspect_ai/_eval/evalset.py +7 -0
- inspect_ai/_eval/registry.py +2 -2
- inspect_ai/_eval/task/log.py +6 -1
- inspect_ai/_eval/task/results.py +22 -4
- inspect_ai/_eval/task/run.py +18 -12
- inspect_ai/_eval/task/sandbox.py +72 -43
- inspect_ai/_eval/task/task.py +4 -0
- inspect_ai/_eval/task/util.py +17 -6
- inspect_ai/_util/logger.py +10 -2
- inspect_ai/_util/samples.py +7 -0
- inspect_ai/_util/transcript.py +8 -0
- inspect_ai/_view/www/App.css +13 -0
- inspect_ai/_view/www/dist/assets/index.css +13 -0
- inspect_ai/_view/www/dist/assets/index.js +105 -55
- inspect_ai/_view/www/src/App.mjs +31 -6
- inspect_ai/_view/www/src/Types.mjs +6 -0
- inspect_ai/_view/www/src/components/JsonPanel.mjs +11 -17
- inspect_ai/_view/www/src/components/MessageContent.mjs +9 -2
- inspect_ai/_view/www/src/components/Tools.mjs +46 -18
- inspect_ai/_view/www/src/navbar/Navbar.mjs +12 -0
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +18 -5
- inspect_ai/_view/www/src/samples/SampleList.mjs +2 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +2 -2
- inspect_ai/log/_log.py +6 -0
- inspect_ai/log/_recorders/eval.py +8 -7
- inspect_ai/model/_call_tools.py +2 -6
- inspect_ai/model/_generate_config.py +6 -0
- inspect_ai/model/_model.py +18 -4
- inspect_ai/model/_providers/azureai.py +22 -2
- inspect_ai/model/_providers/bedrock.py +17 -1
- inspect_ai/model/_providers/hf.py +1 -1
- inspect_ai/model/_providers/openai.py +32 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/vllm.py +1 -1
- inspect_ai/model/_render.py +7 -6
- inspect_ai/model/_trace.py +1 -1
- inspect_ai/solver/_basic_agent.py +8 -1
- inspect_ai/tool/_tool_transcript.py +28 -0
- inspect_ai/util/_sandbox/context.py +1 -2
- inspect_ai/util/_sandbox/docker/config.py +8 -10
- inspect_ai/util/_sandbox/docker/docker.py +9 -5
- inspect_ai/util/_sandbox/docker/util.py +3 -3
- inspect_ai/util/_sandbox/environment.py +7 -2
- inspect_ai/util/_sandbox/limits.py +1 -1
- inspect_ai/util/_sandbox/local.py +8 -9
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/METADATA +2 -4
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/RECORD +60 -59
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.51.dist-info → inspect_ai-0.3.53.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/src/samples/SampleDisplay.mjs
CHANGED
@@ -350,7 +350,17 @@ const metadataViewsForSample = (id, sample) => {
   return sampleMetadatas;
 };
 
-const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
+/**
+ * Component to display a sample with relevant context and visibility control.
+ *
+ * @param {Object} props - The properties passed to the component.
+ * @param {string} props.parent_id - The id of the parent com
+ * @param {import("../types/log").EvalSample} [props.sample] - the sample
+ * @param {Object} [props.style] - Inline styles for the table element.
+ * @param {import("../samples/SamplesDescriptor.mjs").SamplesDescriptor} props.sampleDescriptor - the sample descriptor
+ * @returns {import("preact").JSX.Element} The TranscriptView component.
+ */
+const SampleSummary = ({ parent_id, sample, style, sampleDescriptor }) => {
   const input =
     sampleDescriptor?.messageShape.normalized.input > 0
       ? Math.max(0.15, sampleDescriptor.messageShape.normalized.input)
@@ -386,7 +396,7 @@ const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
   const columns = [];
   columns.push({
     label: "Id",
-    value: id,
+    value: sample.id,
     size: `${idSize}em`,
   });
 
@@ -412,7 +422,8 @@ const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
 
   const fullAnswer =
     sample && sampleDescriptor
-      ?
+      ? // @ts-ignore
+        sampleDescriptor.selectedScorer(sample).answer()
       : undefined;
   if (fullAnswer) {
     columns.push({
@@ -445,14 +456,16 @@ const SampleSummary = ({ id, sample, style, sampleDescriptor }) => {
           message=${sample.error.message}
           style=${{ marginTop: "0.4rem" }}
         />`
-      :
+      : // TODO: Cleanup once the PR lands which makes sample / sample summary share common interface
+        // @ts-ignore
+        sampleDescriptor?.selectedScore(sample).render(),
     size: "minmax(2em, auto)",
     center: true,
   });
 
   return html`
     <div
-      id=${`sample-heading-${
+      id=${`sample-heading-${parent_id}`}
      style=${{
        display: "grid",
        gridTemplateColumns: `${columns
inspect_ai/_view/www/src/samples/SampleList.mjs
CHANGED
@@ -145,7 +145,7 @@ export const SampleList = (props) => {
   );
 
   const listStyle = { ...style, flex: "1", overflowY: "auto", outline: "none" };
-  const { limit, answer } = gridColumns(sampleDescriptor);
+  const { limit, answer, target } = gridColumns(sampleDescriptor);
 
   const headerRow = html`<div
     style=${{
@@ -161,7 +161,7 @@ export const SampleList = (props) => {
   >
     <div>Id</div>
     <div>Input</div>
-    <div
+    <div>${target !== "0" ? "Target" : ""}</div>
     <div>${answer !== "0" ? "Answer" : ""}</div>
     <div>${limit !== "0" ? "Limit" : ""}</div>
     <div style=${{ justifySelf: "center" }}>Score</div>
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs
CHANGED
@@ -29,10 +29,10 @@ export const ToolEventView = ({ id, event, style, depth }) => {
     return e.event === "approval";
   });
 
-  const title = `Tool: ${event.function}`;
+  const title = `Tool: ${event.view?.title || event.function}`;
   return html`
  <${EventPanel} id=${id} title="${title}" subTitle=${formatDateTime(new Date(event.timestamp))} icon=${ApplicationIcons.solvers.use_tools} style=${style}>
-    <div name="Summary" style=${{ margin: "0.5em 0" }}>
+    <div name="Summary" style=${{ margin: "0.5em 0", width: "100%" }}>
      <${ToolCallView}
        functionCall=${functionCall}
        input=${input}
inspect_ai/log/_log.py
CHANGED
@@ -37,6 +37,9 @@ class EvalConfig(BaseModel):
     limit: int | tuple[int, int] | None = Field(default=None)
     """Sample limit (number of samples or range of samples)."""
 
+    sample_id: str | int | list[str | int] | None = Field(default=None)
+    """Evaluate specific sample(s)."""
+
     epochs: int | None = Field(default=None)
     """Number of epochs to run samples over."""
 
@@ -76,6 +79,9 @@ class EvalConfig(BaseModel):
     max_subprocesses: int | None = Field(default=None)
     """Maximum number of subprocesses to run concurrently."""
 
+    max_sandboxes: int | None = Field(default=None)
+    """Maximum number of sandboxes to run concurrently."""
+
     sandbox_cleanup: bool | None = Field(default=None)
     """Cleanup sandbox environments after task completes."""
 
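The two new `EvalConfig` fields record the `sample_id` and `max_sandboxes` options threaded through `_cli/eval.py`, `_eval/eval.py`, and `_eval/evalset.py` in this release. A minimal sketch of how they would be used, assuming `eval()` accepts matching keyword arguments; the task file and sample ids are hypothetical:

```python
from inspect_ai import eval

# run just two named samples and cap concurrently running sandboxes at 10
logs = eval(
    "ctf_challenge.py",                  # hypothetical task file
    model="openai/gpt-4o",
    sample_id=["intro-01", "intro-02"],  # str | int | list[str | int]
    max_sandboxes=10,
)
```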
inspect_ai/log/_recorders/eval.py
CHANGED
@@ -362,13 +362,14 @@ class ZipLogFile:
                     f"Error occurred during async write to {self._file}: {ex}. Falling back to sync write."
                 )
 
-
-
-
-
-
-
-
+            try:
+                # write sync if we need to
+                if not written:
+                    with file(self._file, "wb") as f:
+                        f.write(log_bytes)
+            finally:
+                # re-open zip file w/ self.temp_file pointer at end
+                self._open()
 
     async def close(self) -> EvalLog:
         async with self._lock:
inspect_ai/model/_call_tools.py
CHANGED
@@ -68,10 +68,6 @@ async def call_tools(
         # create a transript for this call
         init_transcript(Transcript(name=call.function))
 
-        # Amend the tool call with a custom view
-        view = tool_call_view(call, tdefs)
-        call.view = view
-
         result: Any = ""
         tool_error: ToolCallError | None = None
         try:
@@ -142,7 +138,7 @@ async def call_tools(
             arguments=call.arguments,
             result=content,
             truncated=truncated,
-            view=view,
+            view=call.view,
             error=tool_error,
             events=list(transcript().events),
         )
@@ -163,7 +159,7 @@ async def call_tools(
             id=call.id,
             function=call.function,
             arguments=call.arguments,
-            view=
+            view=call.view,
             pending=True,
         )
         transcript()._event(event)
inspect_ai/model/_generate_config.py
CHANGED
@@ -72,6 +72,9 @@ class GenerateConfigArgs(TypedDict, total=False):
     cache_prompt: Literal["auto"] | bool | None
     """Whether to cache the prompt prefix. Defaults to "auto", which will enable caching for requests with tools. Anthropic only."""
 
+    reasoning_effort: Literal["low", "medium", "high"] | None
+    """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
+
 
 class GenerateConfig(BaseModel):
     """Base class for model generation configs."""
@@ -139,6 +142,9 @@ class GenerateConfig(BaseModel):
     cache_prompt: Literal["auto"] | bool | None = Field(default=None)
     """Whether to cache the prompt prefix. Defaults to "auto", which will enable caching for requests with tools. Anthropic only."""
 
+    reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
+    """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
+
     def merge(
         self, other: Union["GenerateConfig", GenerateConfigArgs]
     ) -> "GenerateConfig":
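`reasoning_effort` joins the other generation options and, per its docstring, is only honored for OpenAI o1 models (see the `openai.py` changes below). A small sketch, assuming the usual `get_model()` / `GenerateConfig` pattern:

```python
from inspect_ai.model import GenerateConfig, get_model

# request more deliberate reasoning from a full o1 model; other
# providers simply ignore the field
model = get_model("openai/o1", config=GenerateConfig(reasoning_effort="high"))
```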
inspect_ai/model/_model.py
CHANGED
@@ -31,11 +31,11 @@ from inspect_ai._util.registry import (
 )
 from inspect_ai._util.retry import log_rate_limit_retry
 from inspect_ai.tool import Tool, ToolChoice, ToolFunction, ToolInfo
-from inspect_ai.tool._tool_def import ToolDef
+from inspect_ai.tool._tool_def import ToolDef, tool_defs
 from inspect_ai.util import concurrency
 
 from ._cache import CacheEntry, CachePolicy, cache_fetch, cache_store
-from ._call_tools import disable_parallel_tools, tools_info
+from ._call_tools import disable_parallel_tools, tool_call_view, tools_info
 from ._chat_message import (
     ChatMessage,
     ChatMessageAssistant,
@@ -248,7 +248,7 @@ class Model:
         async with self._connection_concurrency(config):
             return await self._generate(
                 input=input,
-                tools=
+                tools=tools,
                 tool_choice=tool_choice,
                 config=config,
                 cache=cache,
@@ -257,7 +257,10 @@ class Model:
     async def _generate(
         self,
         input: list[ChatMessage],
-        tools: list[
+        tools: list[Tool]
+        | list[ToolDef]
+        | list[ToolInfo]
+        | list[Tool | ToolDef | ToolInfo],
         tool_choice: ToolChoice | None,
         config: GenerateConfig,
         cache: bool | CachePolicy = False,
@@ -265,6 +268,12 @@ class Model:
         # default to 'auto' for tool_choice (same as underlying model apis)
         tool_choice = tool_choice if tool_choice else "auto"
 
+        # extract tool defs if we can
+        tdefs = tool_defs([tool for tool in tools if not isinstance(tool, ToolInfo)])
+
+        # resolve all tools into tool_info
+        tools = tools_info(tools)
+
         # if we have a specific tool selected then filter out the others
         if isinstance(tool_choice, ToolFunction):
             tools = [tool for tool in tools if tool.name == tool_choice.name]
@@ -374,6 +383,11 @@ class Model:
             # update output with time elapsed
             output.time = time_elapsed
 
+            # add views to tool calls
+            for choice in output.choices:
+                for tool_call in choice.message.tool_calls or []:
+                    tool_call.view = tool_call_view(tool_call, tdefs)
+
             # complete the transcript event
             complete(output, call)
 
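The net effect of the `_model.py` and `_call_tools.py` changes is that tool-call views are resolved once, from the `ToolDef`s available at generate time, and stored on each `ToolCall` rather than recomputed inside `call_tools()`. A rough sketch of what callers can now observe; the tool and prompt are made up:

```python
from inspect_ai.model import get_model
from inspect_ai.tool import tool

@tool
def add():
    async def execute(x: int, y: int):
        """Add two numbers.

        Args:
            x: First number.
            y: Second number.
        """
        return x + y

    return execute

async def demo():
    model = get_model("openai/gpt-4o")
    output = await model.generate("Use the add tool to compute 2 + 2.", tools=[add()])
    for call in output.choices[0].message.tool_calls or []:
        # view is now populated during generate() (None if the tool has no custom viewer)
        print(call.function, call.arguments, call.view)
```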
inspect_ai/model/_providers/azureai.py
CHANGED
@@ -89,6 +89,19 @@ class AzureAIAPI(ModelAPI):
             config=config,
         )
 
+        # collect known model_args (then delete them so we can pass the rest on)
+        def collect_model_arg(name: str) -> Any | None:
+            nonlocal model_args
+            value = model_args.get(name, None)
+            if value is not None:
+                model_args.pop(name)
+            return value
+
+        emulate_tools = collect_model_arg("emulate_tools")
+        self.emulate_tools = (
+            not not emulate_tools if emulate_tools is not None else None
+        )
+
         # resolve api_key
         if not self.api_key:
             self.api_key = os.environ.get(
@@ -118,8 +131,15 @@ class AzureAIAPI(ModelAPI):
         tool_choice: ToolChoice,
         config: GenerateConfig,
     ) -> ModelOutput | tuple[ModelOutput, ModelCall]:
-        #
-
+        # emulate tools (auto for llama, opt-in for others)
+        if self.emulate_tools is None and self.is_llama():
+            handler: ChatAPIHandler | None = Llama31Handler()
+        elif self.emulate_tools:
+            handler = Llama31Handler()
+        else:
+            handler = None
+
+        # resolve input
         if handler:
             input = handler.input_with_tools(input, tools)
 
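`emulate_tools` is read from the provider's model args: Llama deployments get the `Llama31Handler` automatically, and other Azure AI deployments can opt in. A sketch, assuming model args pass straight through `get_model()`; the deployment name is a placeholder:

```python
from inspect_ai.model import get_model

# opt a non-Llama Azure AI deployment into emulated tool calling
model = get_model("azureai/my-mistral-deployment", emulate_tools=True)
```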
inspect_ai/model/_providers/bedrock.py
CHANGED
@@ -236,15 +236,21 @@ class BedrockAPI(ModelAPI):
         self,
         model_name: str,
         base_url: str | None,
+        api_key: str | None = None,
         config: GenerateConfig = GenerateConfig(),
         **model_args: Any,
     ):
         super().__init__(
             model_name=model_name,
             base_url=model_base_url(base_url, "BEDROCK_BASE_URL"),
+            api_key=api_key,
+            api_key_vars=[],
             config=config,
         )
 
+        # save model_args
+        self.model_args = model_args
+
         # import aioboto3 on demand
         try:
             import aioboto3
@@ -263,6 +269,9 @@ class BedrockAPI(ModelAPI):
 
     @override
     def max_tokens(self) -> int | None:
+        if "llama3-70" in self.model_name or "llama3-8" in self.model_name:
+            return 2048
+
         if "llama3" in self.model_name or "claude3" in self.model_name:
             return 4096
 
@@ -316,6 +325,7 @@ class BedrockAPI(ModelAPI):
                     mode="adaptive",
                 ),
             ),
+            **self.model_args,
         ) as client:
             # Process the tools
             resolved_tools = converse_tools(tools)
@@ -658,6 +668,8 @@ def converse_image_type(type: str) -> ConverseImageFormat:
             return "png"
         case "image/webp":
             return "webp"
+        case "image/jpeg":
+            return "jpeg"
         case _:
             raise ValueError(
                 f"Image mime type {type} is not supported for Bedrock Converse models."
@@ -673,7 +685,11 @@ def converse_tools(tools: list[ToolInfo]) -> list[ConverseTool] | None:
         tool_spec = ConverseToolSpec(
             name=tool.name,
             description=tool.description,
-            inputSchema={
+            inputSchema={
+                "json": tool.parameters.model_dump(
+                    exclude_none=True, exclude={"additionalProperties"}
+                )
+            },
         )
         result.append(ConverseTool(toolSpec=tool_spec))
     return result
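Bedrock now accepts an explicit `api_key` and forwards any remaining model args to the `aioboto3` client it creates, alongside the new jpeg image support and Llama 3 token limits. A sketch, assuming extra keyword args to `get_model()` become model args; the region value is a placeholder:

```python
from inspect_ai.model import get_model

# region_name is not consumed by inspect itself; it is stored in
# self.model_args and passed through to the aioboto3 client constructor
model = get_model(
    "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
    region_name="us-east-1",
)
```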
inspect_ai/model/_providers/openai.py
CHANGED
@@ -18,6 +18,7 @@ from openai.types.chat import (
     ChatCompletionContentPartImageParam,
     ChatCompletionContentPartParam,
     ChatCompletionContentPartTextParam,
+    ChatCompletionDeveloperMessageParam,
     ChatCompletionMessage,
     ChatCompletionMessageParam,
     ChatCompletionMessageToolCallParam,
@@ -141,6 +142,18 @@ class OpenAIAPI(ModelAPI):
             **model_args,
         )
 
+    def is_o1(self) -> bool:
+        return self.model_name.startswith("o1")
+
+    def is_o1_full(self) -> bool:
+        return self.is_o1() and not self.is_o1_mini() and not self.is_o1_preview()
+
+    def is_o1_mini(self) -> bool:
+        return self.model_name.startswith("o1-mini")
+
+    def is_o1_preview(self) -> bool:
+        return self.model_name.startswith("o1-preview")
+
     async def generate(
         self,
         input: list[ChatMessage],
@@ -148,8 +161,8 @@ class OpenAIAPI(ModelAPI):
         tool_choice: ToolChoice,
         config: GenerateConfig,
     ) -> ModelOutput | tuple[ModelOutput, ModelCall]:
-        # short-circuit to call o1-
-        if self.
+        # short-circuit to call o1- models that are text only
+        if self.is_o1_preview() or self.is_o1_mini():
            return await generate_o1(
                client=self.client,
                input=input,
@@ -179,7 +192,7 @@ class OpenAIAPI(ModelAPI):
 
         # prepare request (we do this so we can log the ModelCall)
         request = dict(
-            messages=await as_openai_chat_messages(input),
+            messages=await as_openai_chat_messages(input, self.is_o1_full()),
             tools=chat_tools(tools) if len(tools) > 0 else NOT_GIVEN,
             tool_choice=chat_tool_choice(tool_choice) if len(tools) > 0 else NOT_GIVEN,
             **self.completion_params(config, len(tools) > 0),
@@ -271,8 +284,10 @@ class OpenAIAPI(ModelAPI):
             params["logprobs"] = config.logprobs
         if config.top_logprobs is not None:
             params["top_logprobs"] = config.top_logprobs
-        if tools and config.parallel_tool_calls is not None:
+        if tools and config.parallel_tool_calls is not None and not self.is_o1():
             params["parallel_tool_calls"] = config.parallel_tool_calls
+        if config.reasoning_effort is not None and self.is_o1_full():
+            params["reasoning_effort"] = config.reasoning_effort
 
         return params
 
@@ -291,14 +306,23 @@ class OpenAIAPI(ModelAPI):
 
 
 async def as_openai_chat_messages(
-    messages: list[ChatMessage],
+    messages: list[ChatMessage], o1_full: bool
 ) -> list[ChatCompletionMessageParam]:
-    return [await openai_chat_message(message) for message in messages]
+    return [await openai_chat_message(message, o1_full) for message in messages]
 
 
-async def openai_chat_message(
+async def openai_chat_message(
+    message: ChatMessage, o1_full: bool
+) -> ChatCompletionMessageParam:
     if message.role == "system":
-
+        if o1_full:
+            return ChatCompletionDeveloperMessageParam(
+                role="developer", content=message.text
+            )
+        else:
+            return ChatCompletionSystemMessageParam(
+                role=message.role, content=message.text
+            )
     elif message.role == "user":
         return ChatCompletionUserMessageParam(
             role=message.role,
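The o1 handling boils down to a role mapping: full o1 models receive the system prompt as a `developer` message, while o1-preview / o1-mini still short-circuit to the text-only `generate_o1()` path. A distilled, standalone sketch of that mapping using the same openai types:

```python
from openai.types.chat import (
    ChatCompletionDeveloperMessageParam,
    ChatCompletionSystemMessageParam,
)

def system_message_param(text: str, o1_full: bool):
    # full o1 models take instructions via the "developer" role;
    # everything else keeps the classic "system" role
    if o1_full:
        return ChatCompletionDeveloperMessageParam(role="developer", content=text)
    return ChatCompletionSystemMessageParam(role="system", content=text)

print(system_message_param("Answer concisely.", o1_full=True))
```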
inspect_ai/model/_render.py
CHANGED
@@ -1,8 +1,7 @@
 from rich.console import RenderableType
 
-from inspect_ai._util.format import format_function_call
-from inspect_ai._util.transcript import transcript_markdown
 from inspect_ai.tool._tool_call import ToolCall
+from inspect_ai.tool._tool_transcript import transcript_tool_call
 
 from ._chat_message import ChatMessage, ChatMessageAssistant, ChatMessageTool
 
@@ -17,8 +16,10 @@ def messages_preceding_assistant(messages: list[ChatMessage]) -> list[ChatMessag
     return list(reversed(preceding))
 
 
-def render_tool_calls(tool_calls: list[ToolCall]) -> RenderableType:
-    formatted_calls: list[
+def render_tool_calls(tool_calls: list[ToolCall]) -> list[RenderableType]:
+    formatted_calls: list[RenderableType] = []
+
     for call in tool_calls:
-        formatted_calls.
-
+        formatted_calls.extend(transcript_tool_call(call))
+
+    return formatted_calls
inspect_ai/model/_trace.py
CHANGED
@@ -42,7 +42,7 @@ def trace_assistant_message(
     # print tool calls
     if message.tool_calls:
         content.append(Text())
-        content.
+        content.extend(render_tool_calls(message.tool_calls))
 
     # print the assistant message
     trace_panel(title="Assistant", content=content)
inspect_ai/solver/_basic_agent.py
CHANGED
@@ -54,6 +54,7 @@ def basic_agent(
     max_attempts: int = 1,
     message_limit: int | None = None,
     token_limit: int | None = None,
+    max_tool_output: int | None = None,
     score_value: ValueToFloat | None = None,
     incorrect_message: str
     | Callable[[TaskState, list[Score]], str] = DEFAULT_INCORRECT_MESSAGE,
@@ -87,6 +88,8 @@ def basic_agent(
         If not specified, will use limit_messages defined for the task. If there is none
         defined for the task, 50 will be used as a default.
       token_limit (int | None): Limit on tokens used in sample before terminating agent.
+      max_tool_output (int | None): Maximum output length (in bytes).
+        Defaults to max_tool_output from active GenerateConfig.
       score_value (ValueToFloat): Function used to extract float from scores (defaults
         to standard value_to_float())
       incorrect_message (str | Callable[[TaskState, list[Score]], str]): User message reply for an
@@ -182,7 +185,9 @@ def basic_agent(
             # resolve tools calls (if any)
             if state.output.message.tool_calls:
                 # call tool functions
-                tool_results = await call_tools(
+                tool_results = await call_tools(
+                    state.output.message, state.tools, max_output=max_tool_output
+                )
                 state.messages.extend(tool_results)
 
                 # was an answer submitted?
@@ -194,11 +199,13 @@ def basic_agent(
                     # exit if we are at max_attempts
                     attempts += 1
                     if attempts >= max_attempts:
+                        state.completed = True
                         break
 
                     # exit if the submission is successful
                     answer_scores = await score(state)
                     if score_value_fn(answer_scores[0].value) == 1.0:
+                        state.completed = True
                        break
 
                    # otherwise notify the model that it was incorrect and continue
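Besides marking the task state completed when it stops, `basic_agent()` now lets callers cap tool output independently of the generation config. A sketch of passing the new parameter; the system prompt and tool selection are illustrative:

```python
from inspect_ai.solver import basic_agent, system_message
from inspect_ai.tool import bash

# truncate each tool result at ~16KB rather than the GenerateConfig default
solver = basic_agent(
    init=system_message("Solve the task using the bash tool."),
    tools=[bash(timeout=180)],
    max_attempts=3,
    max_tool_output=16 * 1024,
)
```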
inspect_ai/tool/_tool_transcript.py
ADDED
@@ -0,0 +1,28 @@
+from pydantic import JsonValue
+from rich.console import RenderableType
+from rich.text import Text
+from typing_extensions import Protocol
+
+from inspect_ai._util.transcript import transcript_function, transcript_markdown
+
+from ._tool_call import ToolCallContent
+
+
+class TranscriptToolCall(Protocol):
+    function: str
+    arguments: dict[str, JsonValue]
+    view: ToolCallContent | None
+
+
+def transcript_tool_call(call: TranscriptToolCall) -> list[RenderableType]:
+    content: list[RenderableType] = []
+    if call.view:
+        if call.view.title:
+            content.append(Text.from_markup(f"[bold]{call.view.title}[/bold]\n"))
+        if call.view.format == "markdown":
+            content.append(transcript_markdown(call.view.content))
+        else:
+            content.append(call.view.content)
+    else:
+        content.append(transcript_function(call.function, call.arguments))
+    return content
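`transcript_tool_call()` is what `_render.py` and `_trace.py` above now delegate to: it renders a tool's custom view when present, otherwise a plain function-call signature. A usage sketch with a hand-built call; the `ToolCall` / `ToolCallContent` constructor arguments shown here are an assumption based on the fields referenced in this diff:

```python
from rich.console import Console

from inspect_ai.tool._tool_call import ToolCall, ToolCallContent
from inspect_ai.tool._tool_transcript import transcript_tool_call

# hand-built tool call with a markdown view (field names assumed from the diff)
call = ToolCall(
    id="call_1",
    function="bash",
    arguments={"cmd": "ls -la"},
    type="function",
    view=ToolCallContent(format="markdown", content="**bash**: `ls -la`"),
)

console = Console()
for renderable in transcript_tool_call(call):
    console.print(renderable)
```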
inspect_ai/util/_sandbox/context.py
CHANGED
@@ -109,7 +109,7 @@ def raise_no_sandbox() -> NoReturn:
 
 
 async def init_sandbox_environments_sample(
-
+    sandboxenv_type: type[SandboxEnvironment],
     task_name: str,
     config: SandboxEnvironmentConfigType | None,
     files: dict[str, bytes],
@@ -117,7 +117,6 @@ async def init_sandbox_environments_sample(
     metadata: dict[str, Any],
 ) -> dict[str, SandboxEnvironment]:
     # get setup and cleanup functions
-    sandboxenv_type = registry_find_sandboxenv(type)
     sample_init = cast(SampleInit, getattr(sandboxenv_type, "sample_init"))
     sample_cleanup = cast(SampleCleanup, getattr(sandboxenv_type, "sample_cleanup"))
 
inspect_ai/util/_sandbox/docker/config.py
CHANGED
@@ -2,8 +2,6 @@ import os
 from logging import getLogger
 from pathlib import Path
 
-import aiofiles
-
 logger = getLogger(__name__)
 
 
@@ -17,7 +15,7 @@ CONFIG_FILES = [
 DOCKERFILE = "Dockerfile"
 
 
-async def resolve_compose_file(parent: str = "") -> str:
+def resolve_compose_file(parent: str = "") -> str:
     # existing compose file provides all the config we need
     compose = find_compose_file(parent)
     if compose is not None:
@@ -29,11 +27,11 @@ async def resolve_compose_file(parent: str = "") -> str:
 
     # dockerfile just needs a compose.yaml synthesized
     elif has_dockerfile(parent):
-        return
+        return auto_compose_file(COMPOSE_DOCKERFILE_YAML, parent)
 
     # otherwise provide a generic python container
     else:
-        return
+        return auto_compose_file(COMPOSE_GENERIC_YAML, parent)
 
 
 def find_compose_file(parent: str = "") -> str | None:
@@ -59,9 +57,9 @@ def is_auto_compose_file(file: str) -> bool:
     return os.path.basename(file) == AUTO_COMPOSE_YAML
 
 
-
+def ensure_auto_compose_file(file: str | None) -> None:
     if file is not None and is_auto_compose_file(file) and not os.path.exists(file):
-
+        resolve_compose_file(os.path.dirname(file))
 
 
 def safe_cleanup_auto_compose(file: str | None) -> None:
@@ -100,8 +98,8 @@ services:
 """
 
 
-
+def auto_compose_file(contents: str, parent: str = "") -> str:
     path = os.path.join(parent, AUTO_COMPOSE_YAML)
-
-
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(contents)
     return Path(path).resolve().as_posix()