PyPI - inspect-ai - Versions diffs - 0.3.75__py3-none-any.whl → 0.3.77__py3-none-any.whl - Mend

inspect-ai 0.3.75__py3-none-any.whl → 0.3.77__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

inspect_ai/_cli/eval.py +16 -0
inspect_ai/_display/core/results.py +6 -1
inspect_ai/_eval/eval.py +8 -1
inspect_ai/_eval/evalset.py +6 -2
inspect_ai/_eval/registry.py +3 -5
inspect_ai/_eval/run.py +7 -2
inspect_ai/_eval/task/run.py +4 -0
inspect_ai/_util/content.py +3 -0
inspect_ai/_util/logger.py +3 -0
inspect_ai/_view/www/dist/assets/index.css +28 -16
inspect_ai/_view/www/dist/assets/index.js +4811 -4609
inspect_ai/_view/www/log-schema.json +79 -9
inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +22 -4
inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +1 -1
inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
inspect_ai/_view/www/src/types/log.d.ts +11 -5
inspect_ai/log/_recorders/json.py +8 -0
inspect_ai/log/_transcript.py +13 -4
inspect_ai/model/_call_tools.py +13 -4
inspect_ai/model/_chat_message.py +3 -0
inspect_ai/model/_model.py +5 -1
inspect_ai/model/_model_output.py +6 -1
inspect_ai/model/_openai.py +78 -10
inspect_ai/model/_openai_responses.py +277 -0
inspect_ai/model/_providers/anthropic.py +134 -75
inspect_ai/model/_providers/azureai.py +2 -2
inspect_ai/model/_providers/mistral.py +29 -13
inspect_ai/model/_providers/openai.py +64 -57
inspect_ai/model/_providers/openai_responses.py +177 -0
inspect_ai/model/_providers/openrouter.py +52 -2
inspect_ai/model/_providers/providers.py +1 -1
inspect_ai/model/_providers/vertex.py +5 -2
inspect_ai/tool/__init__.py +6 -0
inspect_ai/tool/_tool.py +23 -3
inspect_ai/tool/_tool_call.py +5 -2
inspect_ai/tool/_tool_support_helpers.py +200 -0
inspect_ai/tool/_tools/_bash_session.py +119 -0
inspect_ai/tool/_tools/_computer/_computer.py +1 -1
inspect_ai/tool/_tools/_text_editor.py +121 -0
inspect_ai/tool/_tools/_think.py +48 -0
inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
inspect_ai/tool/_tools/_web_search.py +1 -1
inspect_ai/util/_json.py +28 -0
inspect_ai/util/_sandbox/context.py +16 -7
inspect_ai/util/_sandbox/docker/config.py +1 -1
inspect_ai/util/_sandbox/docker/internal.py +3 -3
{inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/METADATA +5 -2
{inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/RECORD +56 -80
{inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/WHEEL +1 -1
inspect_ai/model/_image.py +0 -15
inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
{inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info/licenses}/LICENSE +0 -0
{inspect_ai-0.3.75.dist-info → inspect_ai-0.3.77.dist-info}/top_level.txt +0 -0

inspect_ai/_view/www/log-schema.json CHANGED Viewed

@@ -200,8 +200,16 @@
       "description": "Assistant chat message.",
       "properties": {
         "id": {
-          "title": "Id",
-          "type": "string"
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Id"
         },
         "content": {
           "anyOf": [
@@ -286,8 +294,16 @@
       "description": "System chat message.",
       "properties": {
         "id": {
-          "title": "Id",
-          "type": "string"
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Id"
         },
         "content": {
           "anyOf": [
@@ -356,8 +372,16 @@
       "description": "Tool chat message.",
       "properties": {
         "id": {
-          "title": "Id",
-          "type": "string"
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Id"
         },
         "content": {
           "anyOf": [
@@ -435,6 +459,18 @@
           "default": null,
           "title": "Function"
         },
+        "internal_name": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Internal Name"
+        },
         "error": {
           "anyOf": [
             {
@@ -454,6 +490,7 @@
         "role",
         "tool_call_id",
         "function",
+        "internal_name",
         "error"
       ],
       "title": "ChatMessageTool",
@@ -464,8 +501,16 @@
       "description": "User chat message.",
       "properties": {
         "id": {
-          "title": "Id",
-          "type": "string"
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Id"
         },
         "content": {
           "anyOf": [
@@ -4431,10 +4476,21 @@
           "type": "object"
         },
         "type": {
-          "const": "function",
           "title": "Type",
           "type": "string"
         },
+        "internal_name": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Internal Name"
+        },
         "parse_error": {
           "anyOf": [
             {
@@ -4464,6 +4520,7 @@
         "function",
         "arguments",
         "type",
+        "internal_name",
         "parse_error",
         "view"
       ],
@@ -4623,6 +4680,18 @@
           "title": "Arguments",
           "type": "object"
         },
+        "internal_name": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "title": "Internal Name"
+        },
         "view": {
           "anyOf": [
             {
@@ -4809,6 +4878,7 @@
         "id",
         "function",
         "arguments",
+        "internal_name",
         "view",
         "result",
         "truncated",

inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx CHANGED Viewed

@@ -83,8 +83,24 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
     : !isContentImage(output);
   const normalizedContent = useMemo(() => normalizeContent(output), [output]);
-  const contents = mode !== "compact" ? input : input || functionCall;
+  const hasContent = normalizedContent.find((c) => {
+    if (c.type === "tool") {
+      for (const t of c.content) {
+        if (t.type === "text") {
+          if (t.text) {
+            return true;
+          }
+        } else {
+          return true;
+        }
+      }
+      return false;
+    } else {
+      return true;
+    }
+  });
+  const contents = mode !== "compact" ? input : input || functionCall;
   return (
     <div>
       {mode !== "compact" && (!view || view.title) ? (
@@ -99,9 +115,11 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
             contents={contents}
             toolCallView={view}
           />
-          <ExpandablePanel collapse={collapse} border={true} lines={15}>
-            <MessageContent contents={normalizedContent} />
-          </ExpandablePanel>
+          {hasContent ? (
+            <ExpandablePanel collapse={collapse} border={true} lines={15}>
+              <MessageContent contents={normalizedContent} />
+            </ExpandablePanel>
+          ) : undefined}
         </div>
       </div>
     </div>

inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx CHANGED Viewed

@@ -64,7 +64,7 @@ export const ToolInput: FC<ToolInputProps> = memo((props) => {
         className={clsx(
           "source-code",
           "sourceCode",
-          `language-${highlightLanguage}`,
+          highlightLanguage ? `language-${highlightLanguage}` : undefined,
           styles.outputCode,
         )}
       >

inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx CHANGED Viewed

@@ -9,7 +9,7 @@ export const categoricalScoreDescriptor = (
     scoreType: kScoreTypeCategorical,
     categories: values,
     compare: (a, b) => {
-      return String(a).localeCompare(String(b));
+      return String(a.value).localeCompare(String(b.value));
     },
     render: (score) => {
       return String(score);

inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx CHANGED Viewed

@@ -13,8 +13,8 @@ export const numericScoreDescriptor = (values: Value2[]): ScoreDescriptor => {
     min: Math.min(...onlyNumeric),
     max: Math.max(...onlyNumeric),
     compare: (a, b) => {
-      if (typeof a === "number" && typeof b === "number") {
-        return a - b;
+      if (typeof a.value === "number" && typeof b.value === "number") {
+        return a.value - b.value;
       } else {
         console.warn("Comparing non-numerics using a numeric score descriptor");
         return 0;

inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx CHANGED Viewed

@@ -165,8 +165,8 @@ export const sortSamples = (
         }
         return samplesDescriptor.selectedScoreDescriptor.compare(
-          aScore,
           bScore,
+          aScore,
         );
       }
       default:

inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css CHANGED Viewed

@@ -31,8 +31,8 @@
 }
 .code {
-  white-space: pre-wrap;
-  word-wrap: anywhere;
+  white-space: pre-wrap !important;
+  word-wrap: anywhere !important;
 }
 .toolConfig {

inspect_ai/_view/www/src/types/log.d.ts CHANGED Viewed

@@ -148,7 +148,7 @@ export type Input =
       | ChatMessageAssistant
       | ChatMessageTool
     )[];
-export type Id1 = string;
+export type Id1 = string | null;
 export type Content =
   | string
   | (
@@ -175,7 +175,7 @@ export type Video = string;
 export type Format1 = "mp4" | "mpeg" | "mov";
 export type Source = ("input" | "generate") | null;
 export type Role = "system";
-export type Id2 = string;
+export type Id2 = string | null;
 export type Content1 =
   | string
   | (
@@ -188,7 +188,7 @@ export type Content1 =
 export type Source1 = ("input" | "generate") | null;
 export type Role1 = "user";
 export type ToolCallId = string[] | null;
-export type Id3 = string;
+export type Id3 = string | null;
 export type Content2 =
   | string
   | (
@@ -203,12 +203,13 @@ export type Role2 = "assistant";
 export type ToolCalls = ToolCall[] | null;
 export type Id4 = string;
 export type Function = string;
-export type Type8 = "function";
+export type Type8 = string;
+export type InternalName = string | null;
 export type ParseError = string | null;
 export type Title = string | null;
 export type Format2 = "text" | "markdown";
 export type Content3 = string;
-export type Id5 = string;
+export type Id5 = string | null;
 export type Content4 =
   | string
   | (
@@ -222,6 +223,7 @@ export type Source3 = ("input" | "generate") | null;
 export type Role3 = "tool";
 export type ToolCallId1 = string | null;
 export type Function1 = string | null;
+export type InternalName1 = string | null;
 export type Type9 =
   | "parsing"
   | "timeout"
@@ -369,6 +371,7 @@ export type Event6 = "tool";
 export type Type12 = "function";
 export type Id7 = string;
 export type Function2 = string;
+export type InternalName2 = string | null;
 export type Result1 =
   | string
   | number
@@ -911,6 +914,7 @@ export interface ToolCall {
   function: Function;
   arguments: Arguments;
   type: Type8;
+  internal_name: InternalName;
   parse_error: ParseError;
   view: ToolCallContent | null;
 }
@@ -933,6 +937,7 @@ export interface ChatMessageTool {
   role: Role3;
   tool_call_id: ToolCallId1;
   function: Function1;
+  internal_name: InternalName1;
   error: ToolCallError | null;
 }
 export interface ToolCallError {
@@ -1201,6 +1206,7 @@ export interface ToolEvent {
   id: Id7;
   function: Function2;
   arguments: Arguments1;
+  internal_name: InternalName2;
   view: ToolCallContent | null;
   result: Result1;
   truncated: Truncated;

inspect_ai/log/_recorders/json.py CHANGED Viewed

@@ -217,6 +217,11 @@ def _read_header_streaming(log_file: str) -> EvalLog:
         # Parse the log file, stopping before parsing samples
         status: Literal["started", "success", "cancelled", "error"] | None = None
+        eval: EvalSpec | None = None
+        plan: EvalPlan | None = None
+        results: EvalResults | None = None
+        stats: EvalStats | None = None
+        error: EvalError | None = None
         for k, v in ijson.kvitems(f, ""):
             if k == "status":
                 assert v in get_args(
@@ -239,6 +244,9 @@ def _read_header_streaming(log_file: str) -> EvalLog:
                 break
     assert status, "Must encounter a 'status'"
+    assert eval, "Must encounter a 'eval'"
+    assert plan, "Must encounter a 'plan'"
+    assert stats, "Must encounter a 'stats'"
     return EvalLog(
         eval=eval,

inspect_ai/log/_transcript.py CHANGED Viewed

@@ -146,7 +146,7 @@ class ModelEvent(BaseEvent):
     """working time for model call that succeeded (i.e. was not retried)."""
     @field_serializer("completed")
-    def serialize_completed(self, dt: datetime) -> str:
+    def serialize_completed(self, dt: datetime | None) -> str | None:
         if dt is None:
             return None
         return dt.astimezone().isoformat()
@@ -170,6 +170,9 @@ class ToolEvent(BaseEvent):
     arguments: dict[str, JsonValue]
     """Arguments to function."""
+    internal_name: str | None = Field(default=None)
+    """Internal name for tool (if any)."""
     view: ToolCallContent | None = Field(default=None)
     """Custom view of tool call input."""
@@ -235,7 +238,9 @@ class ToolEvent(BaseEvent):
     """Required so that we can include '_cancel_fn' as a member."""
     @field_serializer("completed")
-    def serialize_completed(self, dt: datetime) -> str:
+    def serialize_completed(self, dt: datetime | None) -> str | None:
+        if dt is None:
+            return None
         return dt.astimezone().isoformat()
@@ -270,7 +275,9 @@ class SandboxEvent(BaseEvent):
     """Time that sandbox action completed (see `timestamp` for started)"""
     @field_serializer("completed")
-    def serialize_completed(self, dt: datetime) -> str:
+    def serialize_completed(self, dt: datetime | None) -> str | None:
+        if dt is None:
+            return None
         return dt.astimezone().isoformat()
@@ -412,7 +419,9 @@ class SubtaskEvent(BaseEvent):
     """Working time for subtask (i.e. time not spent waiting on semaphores or model retries)."""
     @field_serializer("completed")
-    def serialize_completed(self, dt: datetime) -> str:
+    def serialize_completed(self, dt: datetime | None) -> str | None:
+        if dt is None:
+            return None
         return dt.astimezone().isoformat()

inspect_ai/model/_call_tools.py CHANGED Viewed

@@ -25,7 +25,6 @@ from typing import (
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
 import anyio
 import yaml
 from anyio.streams.memory import MemoryObjectSendStream
@@ -168,6 +167,7 @@ async def call_tools(
                 id=call.id,
                 function=call.function,
                 arguments=call.arguments,
+                internal_name=call.internal_name,
                 result=content,
                 truncated=truncated,
                 view=call.view,
@@ -183,6 +183,7 @@ async def call_tools(
                             content=content,
                             tool_call_id=call.id,
                             function=call.function,
+                            internal_name=call.internal_name,
                             error=tool_error,
                         ),
                         event,
@@ -201,6 +202,7 @@ async def call_tools(
                 id=call.id,
                 function=call.function,
                 arguments=call.arguments,
+                internal_name=call.internal_name,
                 view=call.view,
                 pending=True,
             )
@@ -216,9 +218,7 @@ async def call_tools(
                     tg.start_soon(call_tool_task, call, send_stream)
                     event._set_cancel_fn(tg.cancel_scope.cancel)
                     async with receive_stream:
-                        async for result in receive_stream:
-                            tool_message, result_event = result
-                            break
+                        tool_message, result_event = await receive_stream.receive()
             except ExceptionGroup as ex:
                 raise ex.exceptions[0]
@@ -226,6 +226,7 @@ async def call_tools(
                 tool_message = ChatMessageTool(
                     content="",
                     function=call.function,
+                    internal_name=call.internal_name,
                     tool_call_id=call.id,
                     error=ToolCallError(
                         "timeout", "Command timed out before completing."
@@ -235,6 +236,7 @@ async def call_tools(
                     id=call.id,
                     function=call.function,
                     arguments=call.arguments,
+                    internal_name=call.internal_name,
                     result=tool_message.content,
                     truncated=None,
                     view=call.view,
@@ -508,6 +510,13 @@ def tool_parse_error_message(arguments: str, ex: Exception) -> str:
 def parse_tool_call(
     id: str, function: str, arguments: str, tools: list[ToolInfo] | None = None
 ) -> ToolCall:
+    """Parse a tool call from a JSON payload.
+    Note that this function doesn't know about internal tool names so the caller
+    should ammend the returned `ToolCall` by mapping the parsed `function` field from
+    from an internal name to an inspect tool name and fixing up the `ToolCall` object
+    as required to reflect this change.
+    """
     error: str | None = None
     arguments_dict: dict[str, Any] = {}

inspect_ai/model/_chat_message.py CHANGED Viewed

@@ -158,6 +158,9 @@ class ChatMessageTool(ChatMessageBase):
     function: str | None = Field(default=None)
     """Name of function called."""
+    internal_name: str | None = Field(default=None)
+    """Internal name for tool (if any)."""
     error: ToolCallError | None = Field(default=None)
     """Error which occurred during tool call."""

inspect_ai/model/_model.py CHANGED Viewed

@@ -454,6 +454,7 @@ class Model:
         async def generate() -> ModelOutput:
             check_sample_interrupt()
+            cache_entry: CacheEntry | None
             if cache:
                 if isinstance(cache, CachePolicy):
                     policy = cache
@@ -481,6 +482,8 @@ class Model:
                         call=None,
                     )
                     return existing
+            else:
+                cache_entry = None
             # verify that model apis are allowed
             self.verify_model_apis()
@@ -550,7 +553,7 @@ class Model:
                     json.dumps(dict(model=str(self), usage=output.usage.model_dump())),
                 )
-            if cache:
+            if cache and cache_entry:
                 cache_store(entry=cache_entry, output=output)
             return output
@@ -1112,6 +1115,7 @@ def tool_result_images_reducer(
                     content=edited_tool_message_content,
                     tool_call_id=message.tool_call_id,
                     function=message.function,
+                    internal_name=message.internal_name,
                 )
             ],
             pending_content + new_user_message_content,

inspect_ai/model/_model_output.py CHANGED Viewed

@@ -188,8 +188,10 @@ class ModelOutput(BaseModel):
         model: str,
         tool_name: str,
         tool_arguments: dict[str, Any],
+        internal_tool_name: str | None = None,
         tool_call_id: str | None = None,
         content: str | None = None,
+        type: str = "function",
     ) -> "ModelOutput":
         """
         Returns a ModelOutput for requesting a tool call.
@@ -197,6 +199,8 @@ class ModelOutput(BaseModel):
         Args:
             model: model name
             tool_name: The name of the tool.
+            internal_tool_name: The model's internal name for the tool (if any).
+            type: The model's type for the tool. e.g. "function", "computer_use_preview"
             tool_arguments: The arguments passed to the tool.
             tool_call_id: Optional ID for the tool call. Defaults to a random UUID.
             content: Optional content to include in the message. Defaults to "tool call for tool {tool_name}".
@@ -221,8 +225,9 @@ class ModelOutput(BaseModel):
                             ToolCall(
                                 id=tool_call_id,
                                 function=tool_name,
+                                internal_name=internal_tool_name,
                                 arguments=tool_arguments,
-                                type="function",
+                                type=type,
                             )
                         ],
                     ),

inspect-ai 0.3.75__py3-none-any.whl → 0.3.77__py3-none-any.whl

inspect-ai 0.3.75__py3-none-any.whl → 0.3.77__py3-none-any.whl