inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/common.py +4 -2
- inspect_ai/_cli/eval.py +2 -0
- inspect_ai/_cli/trace.py +21 -2
- inspect_ai/_display/core/active.py +0 -2
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/rich/display.py +4 -4
- inspect_ai/_display/textual/app.py +4 -1
- inspect_ai/_display/textual/widgets/samples.py +41 -5
- inspect_ai/_eval/eval.py +32 -20
- inspect_ai/_eval/evalset.py +7 -5
- inspect_ai/_eval/run.py +16 -11
- inspect_ai/_eval/task/__init__.py +2 -2
- inspect_ai/_eval/task/images.py +40 -25
- inspect_ai/_eval/task/run.py +141 -119
- inspect_ai/_eval/task/task.py +140 -25
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/content.py +23 -1
- inspect_ai/_util/datetime.py +1 -1
- inspect_ai/_util/deprecation.py +1 -1
- inspect_ai/_util/images.py +20 -17
- inspect_ai/_util/json.py +11 -1
- inspect_ai/_util/kvstore.py +73 -0
- inspect_ai/_util/logger.py +2 -1
- inspect_ai/_util/notgiven.py +18 -0
- inspect_ai/_util/thread.py +5 -0
- inspect_ai/_util/trace.py +39 -3
- inspect_ai/_util/transcript.py +36 -7
- inspect_ai/_view/www/.prettierrc.js +12 -0
- inspect_ai/_view/www/dist/assets/index.js +322 -226
- inspect_ai/_view/www/log-schema.json +221 -138
- inspect_ai/_view/www/src/App.mjs +18 -9
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/api/Types.mjs +15 -4
- inspect_ai/_view/www/src/api/api-http.mjs +2 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
- inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
- inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
- inspect_ai/_view/www/src/components/MessageContent.mjs +44 -2
- inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +18 -3
- inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
- inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
- inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
- inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
- inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +242 -178
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
- inspect_ai/_view/www/src/types/log.d.ts +53 -35
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/approval/_human/util.py +2 -2
- inspect_ai/dataset/_sources/csv.py +2 -1
- inspect_ai/dataset/_sources/json.py +2 -1
- inspect_ai/dataset/_sources/util.py +15 -7
- inspect_ai/log/_condense.py +11 -1
- inspect_ai/log/_log.py +27 -5
- inspect_ai/log/_recorders/eval.py +21 -8
- inspect_ai/log/_samples.py +10 -5
- inspect_ai/log/_transcript.py +28 -1
- inspect_ai/model/__init__.py +10 -2
- inspect_ai/model/_call_tools.py +82 -17
- inspect_ai/model/_chat_message.py +2 -4
- inspect_ai/model/{_trace.py → _conversation.py} +9 -8
- inspect_ai/model/_model.py +2 -2
- inspect_ai/model/_providers/anthropic.py +9 -7
- inspect_ai/model/_providers/azureai.py +6 -4
- inspect_ai/model/_providers/bedrock.py +6 -4
- inspect_ai/model/_providers/google.py +103 -14
- inspect_ai/model/_providers/groq.py +7 -5
- inspect_ai/model/_providers/hf.py +11 -6
- inspect_ai/model/_providers/mistral.py +6 -9
- inspect_ai/model/_providers/openai.py +34 -8
- inspect_ai/model/_providers/openai_o1.py +10 -12
- inspect_ai/model/_providers/vertex.py +17 -4
- inspect_ai/scorer/__init__.py +13 -2
- inspect_ai/scorer/_metrics/__init__.py +2 -2
- inspect_ai/scorer/_metrics/std.py +3 -3
- inspect_ai/tool/__init__.py +9 -1
- inspect_ai/tool/_tool.py +9 -2
- inspect_ai/tool/_tool_info.py +2 -1
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
- inspect_ai/util/__init__.py +4 -3
- inspect_ai/util/{_trace.py → _conversation.py} +3 -17
- inspect_ai/util/_display.py +14 -4
- inspect_ai/util/_sandbox/context.py +12 -13
- inspect_ai/util/_sandbox/docker/compose.py +24 -13
- inspect_ai/util/_sandbox/docker/docker.py +20 -13
- inspect_ai/util/_sandbox/docker/util.py +2 -1
- inspect_ai/util/_sandbox/environment.py +13 -1
- inspect_ai/util/_sandbox/local.py +1 -0
- inspect_ai/util/_sandbox/self_check.py +18 -18
- inspect_ai/util/_store.py +2 -2
- inspect_ai/util/_subprocess.py +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/RECORD +107 -103
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/top_level.txt +0 -0
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs
CHANGED
@@ -37,7 +37,7 @@ export const ToolEventView = ({ id, event, style, depth }) => {
       functionCall=${functionCall}
       input=${input}
       inputType=${inputType}
-      output=${event.result}
+      output=${event.error?.message || event.result}
       mode="compact"
       view=${event.view}
     />
inspect_ai/_view/www/src/types/log.d.ts
CHANGED
@@ -32,7 +32,6 @@ export type Limit = number | [unknown, unknown] | null;
 export type SampleId = string | number | (string | number)[] | null;
 export type Epochs = number | null;
 export type EpochsReducer = string[] | null;
-export type Trace = boolean | null;
 export type Name1 = string;
 export type Tools = string | string[];
 export type Approvers = ApproverPolicyConfig[];
@@ -112,35 +111,49 @@ export type Input =
   | ChatMessageAssistant
   | ChatMessageTool
 )[];
-export type Content =
+export type Content =
+  | string
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Type1 = "text";
 export type Text = string;
 export type Type2 = "image";
 export type Image = string;
 export type Detail = "auto" | "low" | "high";
+export type Type3 = "audio";
+export type Audio = string;
+export type Format = "wav" | "mp3";
+export type Type4 = "video";
+export type Video = string;
+export type Format1 = "mp4" | "mpeg" | "mov";
 export type Source = ("input" | "generate") | null;
 export type Role = "system";
-export type Content1 =
+export type Content1 =
+  | string
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Source1 = ("input" | "generate") | null;
 export type Role1 = "user";
 export type ToolCallId = string | null;
-export type Content2 =
+export type Content2 =
+  | string
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Source2 = ("input" | "generate") | null;
 export type Role2 = "assistant";
 export type ToolCalls = ToolCall[] | null;
 export type Id1 = string;
 export type Function = string;
-export type
+export type Type5 = "function";
 export type ParseError = string | null;
 export type Title = string | null;
-export type
+export type Format2 = "text" | "markdown";
 export type Content3 = string;
-export type Content4 =
+export type Content4 =
+  | string
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Source3 = ("input" | "generate") | null;
 export type Role3 = "tool";
 export type ToolCallId1 = string | null;
 export type Function1 = string | null;
-export type
+export type Type6 =
   | "parsing"
   | "timeout"
   | "unicode_decode"
@@ -218,7 +231,7 @@ export type JsonValue = unknown;
 export type Timestamp1 = string;
 export type Pending1 = boolean | null;
 export type Event1 = "sample_limit";
-export type
+export type Type7 = "message" | "time" | "token" | "operator";
 export type Message2 = string;
 export type Limit1 = number | null;
 export type Timestamp2 = string;
@@ -244,8 +257,8 @@ export type Input2 = (
 )[];
 export type Name5 = string;
 export type Description = string;
-export type
-export type
+export type Type8 = "object";
+export type Type9 =
   | ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
   | null;
 export type Description1 = string | null;
@@ -265,7 +278,7 @@ export type Cache = ("read" | "write") | null;
 export type Timestamp5 = string;
 export type Pending5 = boolean | null;
 export type Event5 = "tool";
-export type
+export type Type10 = "function";
 export type Id3 = string;
 export type Function2 = string;
 export type Result =
@@ -274,7 +287,9 @@ export type Result =
   | boolean
   | ContentText
   | ContentImage
-  |
+  | ContentAudio
+  | ContentVideo
+  | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
 export type Truncated = [unknown, unknown] | null;
 export type Timestamp6 = string;
 export type Pending6 = boolean | null;
@@ -324,13 +339,13 @@ export type Timestamp12 = string;
 export type Pending12 = boolean | null;
 export type Event12 = "step";
 export type Action = "begin" | "end";
-export type
+export type Type11 = string | null;
 export type Name8 = string;
 export type Timestamp13 = string;
 export type Pending13 = boolean | null;
 export type Event13 = "subtask";
 export type Name9 = string;
-export type
+export type Type12 = string | null;
 export type Events2 = (
   | SampleInitEvent
   | SampleLimitEvent
@@ -379,7 +394,7 @@ export type Events = (
   | StepEvent
   | SubtaskEvent
 )[];
-export type
+export type Type13 = "context" | "time" | "message" | "token" | "operator";
 export type Limit2 = number;
 export type Reductions = EvalSampleReductions[] | null;
 export type Scorer1 = string;
@@ -396,7 +411,7 @@ export type Answer1 = string | null;
 export type Explanation2 = string | null;
 export type Metadata8 = {} | null;
 export type SampleId1 = string | number | null;
-export type Samples2 =
+export type Samples2 = EvalSampleScore[];
 export type Location1 = string;

 export interface EvalLog {
@@ -448,7 +463,6 @@ export interface EvalConfig {
   sample_id: SampleId;
   epochs: Epochs;
   epochs_reducer: EpochsReducer;
-  trace: Trace;
   approval: ApprovalPolicyConfig | null;
   fail_on_error: FailOnError;
   message_limit: MessageLimit;
@@ -614,6 +628,16 @@ export interface ContentImage {
   image: Image;
   detail: Detail;
 }
+export interface ContentAudio {
+  type: Type3;
+  audio: Audio;
+  format: Format;
+}
+export interface ContentVideo {
+  type: Type4;
+  video: Video;
+  format: Format1;
+}
 export interface ChatMessageUser {
   content: Content1;
   source: Source1;
@@ -630,7 +654,7 @@ export interface ToolCall {
   id: Id1;
   function: Function;
   arguments: Arguments;
-  type:
+  type: Type5;
   parse_error: ParseError;
   view: ToolCallContent | null;
 }
@@ -640,7 +664,7 @@ export interface Arguments {}
  */
 export interface ToolCallContent {
   title: Title;
-  format:
+  format: Format2;
   content: Content3;
 }
 export interface ChatMessageTool {
@@ -652,7 +676,7 @@ export interface ChatMessageTool {
   error: ToolCallError | null;
 }
 export interface ToolCallError {
-  type:
+  type: Type6;
   message: Message1;
 }
 export interface ModelOutput {
@@ -735,7 +759,7 @@ export interface SampleLimitEvent {
   timestamp: Timestamp1;
   pending: Pending1;
   event: Event1;
-  type:
+  type: Type7;
   message: Message2;
   limit: Limit1;
 }
@@ -822,7 +846,7 @@ export interface ToolInfo {
  * Description of tool parameters object in JSON Schema format.
  */
 export interface ToolParams {
-  type:
+  type: Type8;
   properties: Properties;
   required: Required1;
   additionalProperties: Additionalproperties1;
@@ -834,7 +858,7 @@ export interface Properties {
  * Description of tool parameter in JSON Schema format.
  */
 export interface ToolParam {
-  type:
+  type: Type9;
   description: Description1;
   default: Default;
   enum: Enum;
@@ -897,7 +921,7 @@ export interface ToolEvent {
   timestamp: Timestamp5;
   pending: Pending5;
   event: Event5;
-  type:
+  type: Type10;
   id: Id3;
   function: Function2;
   arguments: Arguments1;
@@ -999,7 +1023,7 @@ export interface StepEvent {
   pending: Pending12;
   event: Event12;
   action: Action;
-  type:
+  type: Type11;
   name: Name8;
 }
 /**
@@ -1010,7 +1034,7 @@ export interface SubtaskEvent {
   pending: Pending13;
   event: Event13;
   name: Name9;
-  type:
+  type: Type12;
   input: Input4;
   result: Result1;
   events: Events2;
@@ -1026,7 +1050,7 @@ export interface Attachments {
   [k: string]: string;
 }
 export interface EvalSampleLimit {
-  type:
+  type: Type13;
   limit: Limit2;
 }
 export interface EvalSampleReductions {
@@ -1034,13 +1058,7 @@ export interface EvalSampleReductions {
   reducer: Reducer1;
   samples: Samples2;
 }
-
- * Score for a Sample
- *
- * Args:
- *   sample_id: (str | int | None) Unique id of a sample
- */
-export interface SampleScore {
+export interface EvalSampleScore {
   value: Value2;
   answer: Answer1;
   explanation: Explanation2;
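The schema changes above add audio and video content types alongside text and images. A minimal sketch (not part of the diff) of the corresponding Python classes, which this release exports from inspect_ai.model; the file names are illustrative:

from inspect_ai.model import ContentAudio, ContentVideo

# the serialized shape mirrors the new ContentAudio/ContentVideo schema
# entries above (type/audio/format and type/video/format)
audio = ContentAudio(audio="clip.wav", format="wav")
video = ContentVideo(video="demo.mp4", format="mp4")
print(audio.model_dump_json())  # e.g. {"type":"audio","audio":"clip.wav","format":"wav"}
print(video.model_dump_json())  # e.g. {"type":"video","video":"demo.mp4","format":"mp4"}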
inspect_ai/_view/www/src/workspace/WorkSpace.mjs
CHANGED
@@ -150,7 +150,7 @@ export const WorkSpace = ({

   // The samples tab
   // Currently only appears when the result is successful
-  if (
+  if (sampleMode !== "none") {
     resolvedTabs.samples = {
       id: kEvalWorkspaceTabId,
       scrollable: samples.length === 1,
inspect_ai/approval/_human/util.py
CHANGED
@@ -5,7 +5,7 @@ from rich.text import Text

 from inspect_ai._util.transcript import transcript_markdown
 from inspect_ai.tool._tool_call import ToolCallContent, ToolCallView
-from inspect_ai.util.
+from inspect_ai.util._display import display_type

 HUMAN_APPROVED = "Human operator approved tool call."
 HUMAN_REJECTED = "Human operator rejected the tool call."
@@ -18,7 +18,7 @@ def render_tool_approval(message: str, view: ToolCallView) -> list[RenderableTyp
     text_highlighter = ReprHighlighter()

     # ignore content if trace enabled
-    message = message.strip() if
+    message = message.strip() if display_type() != "conversation" else ""

     def add_view_content(view_content: ToolCallContent) -> None:
         if view_content.title:
inspect_ai/dataset/_sources/csv.py
CHANGED
@@ -1,4 +1,5 @@
 import csv
+import os
 from io import TextIOWrapper
 from pathlib import Path
 from typing import Any
@@ -75,7 +76,7 @@ def csv_dataset(
        dataset = MemoryDataset(
            samples=data_to_samples(valid_data, data_to_sample, auto_id),
            name=name,
-           location=csv_file,
+           location=os.path.abspath(csv_file),
        )

        # resolve relative file paths
inspect_ai/dataset/_sources/json.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import os
 from io import TextIOWrapper
 from pathlib import Path
 from typing import Any, cast
@@ -75,7 +76,7 @@ def json_dataset(
        dataset = MemoryDataset(
            samples=data_to_samples(dataset_reader(f), data_to_sample, auto_id),
            name=name,
-           location=json_file,
+           location=os.path.abspath(json_file),
        )

        # resolve relative file paths
inspect_ai/dataset/_sources/util.py
CHANGED
@@ -1,6 +1,6 @@
 from typing import Callable

-from inspect_ai._util.content import Content, ContentImage
+from inspect_ai._util.content import Content, ContentAudio, ContentImage, ContentVideo
 from inspect_ai._util.file import filesystem
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
@@ -44,24 +44,28 @@ def resolve_sample_files(dataset: Dataset) -> None:
            for path in sample.files.keys():
                sample.files[path] = resolve_file(sample.files[path])

+        # check for setup script
+        if sample.setup is not None:
+            sample.setup = resolve_file(sample.setup)
+
        # check for image paths
        if not isinstance(sample.input, str):
-           sample.input =
+           sample.input = messages_with_resolved_content(sample.input, resolve_file)


-def
+def messages_with_resolved_content(
     messages: list[ChatMessage], resolver: Callable[[str], str]
 ) -> list[ChatMessage]:
-    return [
+    return [message_with_resolved_content(message, resolver) for message in messages]


-def
+def message_with_resolved_content(
     message: ChatMessage, resolver: Callable[[str], str]
 ) -> ChatMessage:
     if isinstance(message, ChatMessageUser) and not isinstance(message.content, str):
         return ChatMessageUser(
             content=[
-
+                chat_content_with_resolved_content(content, resolver)
                 for content in message.content
             ],
             source=message.source,
@@ -70,7 +74,7 @@ def message_with_resolved_image(
         return message


-def
+def chat_content_with_resolved_content(
     content: Content, resolver: Callable[[str], str]
 ) -> Content:
     if isinstance(content, ContentImage):
@@ -78,5 +82,9 @@ def chat_content_with_resolved_image(
             image=resolver(content.image),
             detail=content.detail,
         )
+    elif isinstance(content, ContentAudio):
+        return ContentAudio(audio=resolver(content.audio), format=content.format)
+    elif isinstance(content, ContentVideo):
+        return ContentVideo(video=resolver(content.video), format=content.format)
     else:
         return content
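resolve_sample_files() now resolves setup scripts and the new audio/video content alongside images. A hedged sketch of a dataset sample that relies on this behavior; the file names are hypothetical, and relative paths are resolved against the dataset's location:

from inspect_ai.dataset import Sample
from inspect_ai.model import ChatMessageUser, ContentAudio, ContentText

sample = Sample(
    input=[
        ChatMessageUser(
            content=[
                ContentText(text="Transcribe the attached recording."),
                # resolved relative to the dataset location on load
                ContentAudio(audio="recordings/clip-01.mp3", format="mp3"),
            ]
        )
    ],
    target="hello world",
    # the setup script path is now resolved as well
    setup="setup.sh",
)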
inspect_ai/log/_condense.py
CHANGED
@@ -6,7 +6,13 @@ from typing import (
 from pydantic import JsonValue

 from inspect_ai._util.constants import BASE_64_DATA_REMOVED
-from inspect_ai._util.content import
+from inspect_ai._util.content import (
+    Content,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
 from inspect_ai._util.hash import mm3_hash
 from inspect_ai._util.json import JsonChange
 from inspect_ai._util.url import is_data_uri
@@ -304,3 +310,7 @@ def walk_content(content: Content, content_fn: Callable[[str], str]) -> Content:
         return content.model_copy(update=dict(text=content_fn(content.text)))
     elif isinstance(content, ContentImage):
         return content.model_copy(update=dict(image=content_fn(content.image)))
+    elif isinstance(content, ContentAudio):
+        return content.model_copy(update=dict(audio=content_fn(content.audio)))
+    elif isinstance(content, ContentVideo):
+        return content.model_copy(update=dict(video=content_fn(content.video)))
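A small illustration (a sketch, not from the diff) of the extended walk_content(), which now rewrites audio and video payloads in addition to text and images; the placeholder string is arbitrary:

from inspect_ai._util.content import ContentAudio
from inspect_ai.log._condense import walk_content

# replace data-URI payloads with a marker, as log condensing does
condensed = walk_content(
    ContentAudio(audio="data:audio/mp3;base64,AAAA", format="mp3"),
    lambda value: "<data removed>" if value.startswith("data:") else value,
)
print(condensed.audio)  # <data removed>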
inspect_ai/log/_log.py
CHANGED
@@ -16,6 +16,7 @@ from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH, PKG_NAME
 from inspect_ai._util.error import EvalError, exception_message
 from inspect_ai._util.logger import warn_once
 from inspect_ai.approval._policy import ApprovalPolicyConfig
+from inspect_ai.dataset._dataset import MT, metadata_as
 from inspect_ai.model import (
     ChatMessage,
     GenerateConfig,
@@ -24,6 +25,8 @@ from inspect_ai.model import (
 )
 from inspect_ai.scorer import Score
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
+from inspect_ai.util._store import Store
+from inspect_ai.util._store_model import SMT

 from ._transcript import Event

@@ -45,9 +48,6 @@ class EvalConfig(BaseModel):
     epochs_reducer: list[str] | None = Field(default=None)
     """Reducers for aggregating per-sample scores."""

-    trace: bool | None = Field(default=None)
-    """Trace message interactions with evaluated model to terminal."""
-
     approval: ApprovalPolicyConfig | None = Field(default=None)
     """Approval policy for tool use."""

@@ -158,9 +158,31 @@ class EvalSample(BaseModel):
     metadata: dict[str, Any]
     """Additional sample metadata."""

+    def metadata_as(self, metadata_cls: Type[MT]) -> MT:
+        """Pydantic model interface to metadata.
+
+        Args:
+            metadata_cls: Pydantic model type
+
+        Returns:
+            BaseModel: Instance of metadata_cls bound to sample metadata.
+        """
+        return metadata_as(self.metadata, metadata_cls)
+
     store: dict[str, Any] = Field(default_factory=dict)
     """State at end of sample execution."""

+    def store_as(self, model_cls: Type[SMT]) -> SMT:
+        """Pydantic model interface to the store.
+
+        Args:
+            model_cls: Pydantic model type (must derive from StoreModel)
+
+        Returns:
+            StoreModel: Instance of model_cls bound to sample store data.
+        """
+        return model_cls(store=Store(self.store))
+
     events: list[Event] = Field(default_factory=list)
     """Events that occurred during sample execution."""

@@ -330,7 +352,7 @@ class EvalResults(BaseModel):
         """Scorer used to compute results (deprecated)."""
         warn_once(
             logger,
-            "The 'scorer' field is deprecated. Use '
+            "The 'scorer' field is deprecated. Use 'scores' instead.",
         )
         return self.scores[0] if self.scores else None

@@ -339,7 +361,7 @@ class EvalResults(BaseModel):
         """Metrics computed (deprecated)."""
         warn_once(
             logger,
-            "The 'metrics' field is deprecated. Access metrics through '
+            "The 'metrics' field is deprecated. Access metrics through 'scores' instead.",
         )
         return self.scores[0].metrics if self.scores else {}
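EvalSample gains typed accessors for its metadata and store. A minimal usage sketch; the log path and both model classes are hypothetical:

from pydantic import BaseModel

from inspect_ai.log import read_eval_log
from inspect_ai.util import StoreModel


class TaskMetadata(BaseModel):
    category: str
    difficulty: str


class AgentState(StoreModel):
    attempts: int = 0


log = read_eval_log("logs/example.eval")
for sample in log.samples or []:
    metadata = sample.metadata_as(TaskMetadata)  # typed view of sample.metadata
    state = sample.store_as(AgentState)          # typed view of sample.store
    print(sample.id, metadata.category, state.attempts)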
inspect_ai/log/_recorders/eval.py
CHANGED
@@ -13,7 +13,12 @@ from pydantic_core import to_json
 from typing_extensions import override

 from inspect_ai._util.constants import LOG_SCHEMA_VERSION
-from inspect_ai._util.content import
+from inspect_ai._util.content import (
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
 from inspect_ai._util.error import EvalError
 from inspect_ai._util.file import FileSystem, async_fileystem, dirname, file, filesystem
 from inspect_ai._util.json import jsonable_python
@@ -90,9 +95,11 @@ class EvalRecorder(FileRecorder):
         self.data: dict[str, ZipLogFile] = {}

     @override
-    async def log_init(
+    async def log_init(
+        self, eval: EvalSpec, location: str | None = None, *, clean: bool = False
+    ) -> str:
         # if the file exists then read summaries
-        if location is not None and self.fs.exists(location):
+        if not clean and location is not None and self.fs.exists(location):
             with file(location, "rb") as f:
                 with ZipFile(f, "r") as zip:
                     log_start = _read_start(zip)
@@ -229,7 +236,7 @@ class EvalRecorder(FileRecorder):
     async def write_log(cls, location: str, log: EvalLog) -> None:
         # write using the recorder (so we get all of the extra streams)
         recorder = EvalRecorder(dirname(location))
-        await recorder.log_init(log.eval, location)
+        await recorder.log_init(log.eval, location, clean=True)
         await recorder.log_start(log.eval, log.plan)
         for sample in log.samples or []:
             await recorder.log_sample(log.eval, sample)
@@ -244,14 +251,20 @@ def text_inputs(inputs: str | list[ChatMessage]) -> str | list[ChatMessage]:
         input: list[ChatMessage] = []
         for message in inputs:
             if not isinstance(message.content, str):
-                filtered_content: list[
+                filtered_content: list[
+                    ContentText | ContentImage | ContentAudio | ContentVideo
+                ] = []
                 for content in message.content:
-                    if content.type
+                    if content.type == "text":
                         filtered_content.append(content)
-
-
+                    else:
+                        filtered_content.append(
+                            ContentText(text=f"({content.type.capitalize()})")
+                        )
                 message.content = filtered_content
                 input.append(message)
+            else:
+                input.append(message)

         return input
     else:
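With clean=True, EvalRecorder.write_log now rewrites the target file from scratch instead of reusing summaries from an existing file at that location. A brief sketch using the public log API (paths are hypothetical):

from inspect_ai.log import read_eval_log, write_eval_log

log = read_eval_log("logs/example.eval")
# rewrites the destination .eval file even if one already exists there
write_eval_log(log, "logs/example-copy.eval")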
inspect_ai/log/_samples.py
CHANGED
@@ -29,7 +29,7 @@ class ActiveSample:
         sandboxes: dict[str, SandboxConnection],
     ) -> None:
         self.id = uuid()
-        self.started =
+        self.started: float | None = None
         self.completed: float | None = None
         self.task = task
         self.model = model
@@ -48,10 +48,15 @@ class ActiveSample:

     @property
     def execution_time(self) -> float:
-
-
-
-
+        if self.started is not None:
+            completed = (
+                self.completed
+                if self.completed is not None
+                else datetime.now().timestamp()
+            )
+            return completed - self.started
+        else:
+            return 0

     def interrupt(self, action: Literal["score", "error"]) -> None:
         self._interrupt_action = action
inspect_ai/log/_transcript.py
CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 import contextlib
 from contextvars import ContextVar
 from datetime import datetime
@@ -11,7 +12,7 @@ from typing import (
     Union,
 )

-from pydantic import BaseModel, Field, JsonValue, field_serializer
+from pydantic import BaseModel, ConfigDict, Field, JsonValue, field_serializer

 from inspect_ai._util.constants import SAMPLE_SUBTASK
 from inspect_ai._util.error import EvalError
@@ -176,6 +177,32 @@ class ToolEvent(BaseEvent):
         self.events = events
         self.pending = None

+    # mechanism for operator to cancel the tool call
+
+    def set_task(self, task: asyncio.Task[Any]) -> None:
+        """Set the tool task (for possible cancellation)"""
+        self._task = task
+
+    def cancel(self) -> None:
+        """Cancel the tool task."""
+        if self._task:
+            self._cancelled = True
+            self._task.cancel()
+
+    @property
+    def cancelled(self) -> bool:
+        """Was the task cancelled?"""
+        return self._cancelled is True
+
+    _cancelled: bool | None = None
+    """Was this tool call cancelled?"""
+
+    _task: asyncio.Task[Any] | None = None
+    """Handle to task (used for cancellation)"""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    """Required so that we can include '_task' as a member."""
+

 class ApprovalEvent(BaseEvent):
     """Tool approval."""
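The new ToolEvent hooks let an operator cancel an in-flight tool call: the runner registers the asyncio task via set_task(), and cancel() both flags the event and cancels the task. A rough sketch of the wiring (the wrapper function is hypothetical):

import asyncio
from typing import Any, Coroutine

from inspect_ai.log._transcript import ToolEvent


async def run_cancellable_tool(event: ToolEvent, call: Coroutine[Any, Any, Any]) -> Any:
    # register the task on the event so event.cancel() can interrupt it
    task = asyncio.create_task(call)
    event.set_task(task)
    try:
        return await task
    except asyncio.CancelledError:
        if event.cancelled:
            # operator-initiated cancellation: record and continue
            return None
        raise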
inspect_ai/model/__init__.py
CHANGED
@@ -1,6 +1,12 @@
 # ruff: noqa: F401 F403 F405

-from inspect_ai._util.content import
+from inspect_ai._util.content import (
+    Content,
+    ContentAudio,
+    ContentImage,
+    ContentText,
+    ContentVideo,
+)
 from inspect_ai._util.deprecation import relocated_module_attribute

 from ._cache import (
@@ -42,8 +48,10 @@ __all__ = [
     "GenerateConfig",
     "GenerateConfigArgs",
     "CachePolicy",
-    "
+    "ContentAudio",
     "ContentImage",
+    "ContentText",
+    "ContentVideo",
     "Content",
     "ChatMessage",
     "ChatMessageSystem",