inspect-ai 0.3.76__py3-none-any.whl → 0.3.77__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +16 -0
- inspect_ai/_display/core/results.py +6 -1
- inspect_ai/_eval/eval.py +8 -1
- inspect_ai/_eval/evalset.py +3 -0
- inspect_ai/_eval/run.py +3 -2
- inspect_ai/_util/content.py +3 -0
- inspect_ai/_view/www/dist/assets/index.js +18 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +22 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +1 -1
- inspect_ai/model/_openai.py +67 -4
- inspect_ai/model/_openai_responses.py +277 -0
- inspect_ai/model/_providers/anthropic.py +1 -0
- inspect_ai/model/_providers/azureai.py +2 -2
- inspect_ai/model/_providers/mistral.py +29 -13
- inspect_ai/model/_providers/openai.py +53 -49
- inspect_ai/model/_providers/openai_responses.py +177 -0
- inspect_ai/model/_providers/openrouter.py +52 -2
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +23 -3
- inspect_ai/tool/_tools/_think.py +48 -0
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/RECORD +27 -25
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/WHEEL +1 -1
- inspect_ai/model/_image.py +0 -15
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.76.dist-info → inspect_ai-0.3.77.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py
CHANGED
@@ -115,6 +115,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Tags to associate with this evaluation run.",
         envvar="INSPECT_EVAL_TAGS",
     )
+    @click.option(
+        "--metadata",
+        multiple=True,
+        type=str,
+        help="Metadata to associate with this evaluation run (more than one --metadata argument can be specified).",
+        envvar="INSPECT_EVAL_METADATA",
+    )
     @click.option(
         "--trace",
         type=bool,
@@ -449,6 +456,7 @@ def eval_command(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     trace: bool | None,
     approval: str | None,
     sandbox: str | None,
@@ -525,6 +533,7 @@ def eval_command(
         s=s,
         solver_config=solver_config,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         approval=approval,
         sandbox=sandbox,
@@ -616,6 +625,7 @@ def eval_set_command(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     sandbox: str | None,
     no_sandbox_cleanup: bool | None,
     epochs: int | None,
@@ -695,6 +705,7 @@ def eval_set_command(
         s=s,
         solver_config=solver_config,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         approval=approval,
         sandbox=sandbox,
@@ -749,6 +760,7 @@ def eval_exec(
     s: tuple[str] | None,
     solver_config: str | None,
     tags: str | None,
+    metadata: tuple[str] | None,
     trace: bool | None,
     approval: str | None,
     sandbox: str | None,
@@ -790,6 +802,9 @@ def eval_exec(
     # parse tags
     eval_tags = parse_comma_separated(tags)
 
+    # parse metadata
+    eval_metadata = parse_cli_args(metadata)
+
     # resolve epochs
     eval_epochs = (
         Epochs(epochs, create_reducers(parse_comma_separated(epochs_reducer)))
@@ -825,6 +840,7 @@ def eval_exec(
         task_args=task_args,
         solver=SolverSpec(solver, solver_args) if solver else None,
         tags=eval_tags,
+        metadata=eval_metadata,
         trace=trace,
         approval=approval,
         sandbox=parse_sandbox(sandbox),
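
The new --metadata option repeats as NAME=value pairs, which eval_exec folds into a dict via parse_cli_args. As a rough illustration of that parsing (a minimal hypothetical stand-in, not inspect's actual helper, which may also coerce value types):

    # Hypothetical stand-in for parse_cli_args: collect repeated
    # "NAME=value" CLI arguments into a dict (values kept as strings).
    def parse_metadata_args(args: tuple[str, ...] | None) -> dict[str, str] | None:
        if not args:
            return None
        parsed: dict[str, str] = {}
        for arg in args:
            key, sep, value = arg.partition("=")
            if not sep:
                raise ValueError(f"metadata argument must be NAME=value: {arg}")
            parsed[key.strip()] = value.strip()
        return parsed

    # e.g. inspect eval ctf.py --metadata experiment=sweep-1 --metadata owner=alice
    assert parse_metadata_args(("experiment=sweep-1", "owner=alice")) == {
        "experiment": "sweep-1",
        "owner": "alice",
    }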
inspect_ai/_display/core/results.py
CHANGED
@@ -131,9 +131,14 @@ def task_stats(stats: EvalStats) -> RenderableType:
         else:
             input_tokens = f"[bold]I: [/bold]{usage.input_tokens:,}"
 
+        if usage.reasoning_tokens is not None:
+            reasoning_tokens = f", [bold]R: [/bold]{usage.reasoning_tokens:,}"
+        else:
+            reasoning_tokens = ""
+
         table.add_row(
             Text(model, style="bold"),
-            f" {usage.total_tokens:,} tokens [{input_tokens}, [bold]O: [/bold]{usage.output_tokens:,}]",
+            f" {usage.total_tokens:,} tokens [{input_tokens}, [bold]O: [/bold]{usage.output_tokens:,}{reasoning_tokens}]",
             style=theme.light,
         )
 
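
With reasoning tokens reported, the stats row gains an R: segment after O:. Checking the f-string above with sample numbers (values illustrative):

    # Reproduce the Rich markup assembled by task_stats.
    input_tokens = f"[bold]I: [/bold]{1000:,}"
    reasoning_tokens = f", [bold]R: [/bold]{128:,}"  # "" when usage.reasoning_tokens is None
    row = f" {1362:,} tokens [{input_tokens}, [bold]O: [/bold]{362:,}{reasoning_tokens}]"
    assert row == " 1,362 tokens [[bold]I: [/bold]1,000, [bold]O: [/bold]362, [bold]R: [/bold]128]"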
inspect_ai/_eval/eval.py
CHANGED
@@ -68,6 +68,7 @@ def eval(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
     display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
@@ -116,6 +117,7 @@ def eval(
         solver: Alternative solver for task(s).
             Optional (uses task solver by default).
         tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         trace: Trace message interactions with evaluated model to terminal.
         display: Task display type (defaults to 'full').
         approval: Tool use approval policies.
@@ -186,6 +188,7 @@ def eval(
             sandbox_cleanup=sandbox_cleanup,
             solver=solver,
             tags=tags,
+            metadata=metadata,
             approval=approval,
             log_level=log_level,
             log_level_transcript=log_level_transcript,
@@ -235,6 +238,7 @@ async def eval_async(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
@@ -274,7 +278,8 @@ async def eval_async(
         sandbox: Sandbox environment type (or optionally a str or tuple with a shorthand spec)
         sandbox_cleanup: Cleanup sandbox environments after task completes (defaults to True)
         solver: Alternative solver for task(s). Optional (uses task solver by default).
-        tags
+        tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         approval: Tool use approval policies.
             Either a path to an approval policy config file or a list of approval policies.
             Defaults to no approval policy.
@@ -449,6 +454,7 @@ async def eval_async(
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
+            metadata=metadata,
             score=score,
             debug_errors=debug_errors is True,
             **kwargs,
@@ -473,6 +479,7 @@ async def eval_async(
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
+            metadata=metadata,
             score=score,
             **kwargs,
         )
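
From the Python API the new parameter is a plain dict recorded with the run. A usage sketch (task path and model are illustrative, not from this diff):

    from inspect_ai import eval

    # Attach run-level metadata alongside tags.
    logs = eval(
        "security_guide.py",
        model="openai/gpt-4o",
        tags=["nightly"],
        metadata={"experiment": "sweep-1", "commit": "abc123"},
    )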
inspect_ai/_eval/evalset.py
CHANGED
@@ -68,6 +68,7 @@ def eval_set(
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
     display: DisplayType | None = None,
     approval: str | list[ApprovalPolicy] | None = None,
@@ -127,6 +128,7 @@ def eval_set(
         solver: Alternative solver(s) for
             evaluating task(s). ptional (uses task solver by default).
         tags: Tags to associate with this evaluation run.
+        metadata: Metadata to associate with this evaluation run.
         trace: Trace message interactions with evaluated model to terminal.
         display: Task display type (defaults to 'full').
         approval: Tool use approval policies.
@@ -193,6 +195,7 @@ def eval_set(
         sandbox_cleanup=sandbox_cleanup,
         solver=solver,
         tags=tags,
+        metadata=metadata,
         trace=trace,
         display=display,
         approval=approval,
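
eval_set accepts the same parameter; a sketch assuming its usual required log_dir (values illustrative):

    from inspect_ai import eval_set

    success, logs = eval_set(
        ["security_guide.py"],
        log_dir="logs/sweep-1",
        metadata={"experiment": "sweep-1"},
    )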
inspect_ai/_eval/run.py
CHANGED
@@ -2,7 +2,7 @@ import functools
 import logging
 import os
 import sys
-from typing import Awaitable, Callable, Set, cast
+from typing import Any, Awaitable, Callable, Set, cast
 
 from inspect_ai._eval.task.task import Task
 from inspect_ai._util.trace import trace_action
@@ -68,6 +68,7 @@ async def eval_run(
     epochs_reducer: list[ScoreReducer] | None = None,
     solver: Solver | SolverSpec | None = None,
     tags: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     debug_errors: bool = False,
     score: bool = True,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -205,7 +206,7 @@ async def eval_run(
         task_args=resolved_task.task_args,
         model_args=resolved_task.model.model_args,
         eval_config=task_eval_config,
-        metadata=task.metadata,
+        metadata=((metadata or {}) | (task.metadata or {})) or None,
         recorder=recorder,
     )
     await logger.init()
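
The merge expression above gives task-level metadata precedence over run-level metadata and collapses to None when both are absent. Worked through with sample dicts:

    # Same expression as in eval_run, with illustrative values.
    metadata = {"experiment": "sweep-1", "owner": "alice"}  # eval-level
    task_metadata = {"owner": "bob"}  # task-level

    merged = ((metadata or {}) | (task_metadata or {})) or None
    assert merged == {"experiment": "sweep-1", "owner": "bob"}  # task wins on conflicts

    # both empty -> None rather than {}
    assert (((None or {}) | (None or {})) or None) is None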
inspect_ai/_view/www/dist/assets/index.js
CHANGED
@@ -21577,7 +21577,7 @@ var require_assets = __commonJS({
           className: clsx(
             "source-code",
             "sourceCode",
-            `language-${highlightLanguage}`,
+            highlightLanguage ? `language-${highlightLanguage}` : void 0,
             styles$10.outputCode
           ),
           children: formattedContent
@@ -21613,6 +21613,22 @@ var require_assets = __commonJS({
         }
         const collapse = Array.isArray(output2) ? output2.every((item2) => !isContentImage(item2)) : !isContentImage(output2);
         const normalizedContent = reactExports.useMemo(() => normalizeContent$1(output2), [output2]);
+        const hasContent = normalizedContent.find((c2) => {
+          if (c2.type === "tool") {
+            for (const t2 of c2.content) {
+              if (t2.type === "text") {
+                if (t2.text) {
+                  return true;
+                }
+              } else {
+                return true;
+              }
+            }
+            return false;
+          } else {
+            return true;
+          }
+        });
         const contents2 = mode !== "compact" ? input2 : input2 || functionCall;
         return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [
           mode !== "compact" && (!view || view.title) ? /* @__PURE__ */ jsxRuntimeExports.jsx(ToolTitle, { title: (view == null ? void 0 : view.title) || functionCall }) : "",
@@ -21625,7 +21641,7 @@ var require_assets = __commonJS({
             toolCallView: view
           }
         ),
-        /* @__PURE__ */ jsxRuntimeExports.jsx(ExpandablePanel, { collapse, border: true, lines: 15, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: normalizedContent }) })
+        hasContent ? /* @__PURE__ */ jsxRuntimeExports.jsx(ExpandablePanel, { collapse, border: true, lines: 15, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: normalizedContent }) }) : void 0
       ] }) })
     ] });
   };
inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx
CHANGED
@@ -83,8 +83,24 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
     : !isContentImage(output);
   const normalizedContent = useMemo(() => normalizeContent(output), [output]);
 
-  const contents = mode !== "compact" ? input : input || functionCall;
+  const hasContent = normalizedContent.find((c) => {
+    if (c.type === "tool") {
+      for (const t of c.content) {
+        if (t.type === "text") {
+          if (t.text) {
+            return true;
+          }
+        } else {
+          return true;
+        }
+      }
+      return false;
+    } else {
+      return true;
+    }
+  });
 
+  const contents = mode !== "compact" ? input : input || functionCall;
   return (
     <div>
       {mode !== "compact" && (!view || view.title) ? (
@@ -99,9 +115,11 @@ export const ToolCallView: FC<ToolCallViewProps> = ({
           contents={contents}
           toolCallView={view}
         />
-        <ExpandablePanel collapse={collapse} border={true} lines={15}>
-          <MessageContent contents={normalizedContent} />
-        </ExpandablePanel>
+        {hasContent ? (
+          <ExpandablePanel collapse={collapse} border={true} lines={15}>
+            <MessageContent contents={normalizedContent} />
+          </ExpandablePanel>
+        ) : undefined}
       </div>
     </div>
   </div>
inspect_ai/model/_openai.py
CHANGED
@@ -1,7 +1,9 @@
 import json
 import re
+from copy import copy
 from typing import Literal
 
+from openai import BadRequestError, OpenAIError
 from openai.types.chat import (
     ChatCompletion,
     ChatCompletionAssistantMessageParam,
@@ -26,7 +28,9 @@ from openai.types.chat.chat_completion import Choice, ChoiceLogprobs
 from openai.types.chat.chat_completion_message_tool_call import Function
 from openai.types.completion_usage import CompletionUsage
 from openai.types.shared_params.function_definition import FunctionDefinition
+from pydantic import JsonValue
 
+from inspect_ai._util.constants import BASE_64_DATA_REMOVED
 from inspect_ai._util.content import (
     Content,
     ContentAudio,
@@ -48,7 +52,16 @@ from ._chat_message import (
     ChatMessageTool,
     ChatMessageUser,
 )
-from ._model_output import ModelUsage, StopReason, as_stop_reason
+from ._model_output import ModelOutput, ModelUsage, StopReason, as_stop_reason
+
+
+class OpenAIResponseError(OpenAIError):
+    def __init__(self, code: str, message: str) -> None:
+        self.code = code
+        self.message = message
+
+    def __str__(self) -> str:
+        return f"{self.code}: {self.message}"
 
 
 def is_o_series(name: str) -> bool:
@@ -58,6 +71,10 @@ def is_o_series(name: str) -> bool:
     return not is_gpt(name) and bool(re.search(r"o\d+", name))
 
 
+def is_o1_pro(name: str) -> bool:
+    return "o1-pro" in name
+
+
 def is_o1_mini(name: str) -> bool:
     return "o1-mini" in name
 
@@ -320,6 +337,7 @@ def chat_messages_from_openai(
             chat_messages.append(ChatMessageUser(content=content))
         elif message["role"] == "assistant":
             # resolve content
+            refusal: Literal[True] | None = None
            asst_content = message.get("content", None)
             if isinstance(asst_content, str):
                 result = parse_content_with_reasoning(asst_content)
@@ -336,6 +354,8 @@
                 content = asst_content
             elif asst_content is None:
                 content = message.get("refusal", None) or ""
+                if content:
+                    refusal = True
             else:
                 content = []
                 for ac in asst_content:
@@ -348,7 +368,7 @@
                 )
             if reasoning is not None:
                 if isinstance(content, str):
-                    content = [ContentText(text=content)]
+                    content = [ContentText(text=content, refusal=refusal)]
                 else:
                     content.insert(0, ContentReasoning(reasoning=str(reasoning)))
 
@@ -437,7 +457,7 @@ def content_from_openai(
             )
         ]
     elif content["type"] == "refusal":
-        return [ContentText(text=content["refusal"])]
+        return [ContentText(text=content["refusal"], refusal=True)]
     else:
         content_type = content["type"]
         raise ValueError(f"Unexpected content type '{content_type}' in message.")
@@ -455,8 +475,10 @@ def chat_message_assistant_from_openai(
     if reasoning is not None:
         content: str | list[Content] = [
             ContentReasoning(reasoning=str(reasoning)),
-            ContentText(text=msg_content),
+            ContentText(text=msg_content, refusal=True if refusal else None),
         ]
+    elif refusal is not None:
+        content = [ContentText(text=msg_content, refusal=True)]
     else:
         content = msg_content
 
@@ -484,3 +506,44 @@ def chat_choices_from_openai(
         )
         for choice in choices
     ]
+
+
+def openai_handle_bad_request(
+    model_name: str, e: BadRequestError
+) -> ModelOutput | Exception:
+    # extract message
+    if isinstance(e.body, dict) and "message" in e.body.keys():
+        content = str(e.body.get("message"))
+    else:
+        content = e.message
+
+    # narrow stop_reason
+    stop_reason: StopReason | None = None
+    if e.code == "context_length_exceeded":
+        stop_reason = "model_length"
+    elif (
+        e.code == "invalid_prompt"  # seems to happen for o1/o3
+        or e.code == "content_policy_violation"  # seems to happen for vision
+        or e.code == "content_filter"  # seems to happen on azure
+    ):
+        stop_reason = "content_filter"
+
+    if stop_reason:
+        return ModelOutput.from_content(
+            model=model_name, content=content, stop_reason=stop_reason
+        )
+    else:
+        return e
+
+
+def openai_media_filter(key: JsonValue | None, value: JsonValue) -> JsonValue:
+    # remove images from raw api call
+    if key == "image_url" and isinstance(value, dict) and "url" in value:
+        url = str(value.get("url"))
+        if url.startswith("data:"):
+            value = copy(value)
+            value.update(url=BASE_64_DATA_REMOVED)
+    elif key == "input_audio" and isinstance(value, dict) and "data" in value:
+        value = copy(value)
+        value.update(data=BASE_64_DATA_REMOVED)
+    return value
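
openai_handle_bad_request is designed to be called from a provider's exception handler so that length and content-filter errors surface as ModelOutput with an appropriate stop_reason rather than raising. A sketch of that call pattern (the wrapper below is illustrative, not from this diff):

    from openai import AsyncOpenAI, BadRequestError

    async def generate_or_refuse(client: AsyncOpenAI, model_name: str, **request):
        try:
            return await client.chat.completions.create(model=model_name, **request)
        except BadRequestError as ex:
            handled = openai_handle_bad_request(model_name, ex)
            if isinstance(handled, Exception):
                raise handled
            # ModelOutput with stop_reason "model_length" or "content_filter"
            return handled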