inspect-ai 0.3.56__py3-none-any.whl → 0.3.58__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/common.py +4 -2
- inspect_ai/_cli/eval.py +2 -0
- inspect_ai/_cli/trace.py +21 -2
- inspect_ai/_display/core/active.py +0 -2
- inspect_ai/_display/core/panel.py +1 -1
- inspect_ai/_display/rich/display.py +4 -4
- inspect_ai/_display/textual/app.py +4 -1
- inspect_ai/_display/textual/widgets/samples.py +41 -5
- inspect_ai/_eval/eval.py +32 -20
- inspect_ai/_eval/evalset.py +7 -5
- inspect_ai/_eval/run.py +16 -11
- inspect_ai/_eval/task/__init__.py +2 -2
- inspect_ai/_eval/task/images.py +40 -25
- inspect_ai/_eval/task/run.py +141 -119
- inspect_ai/_eval/task/task.py +140 -25
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/content.py +23 -1
- inspect_ai/_util/datetime.py +1 -1
- inspect_ai/_util/deprecation.py +1 -1
- inspect_ai/_util/images.py +20 -17
- inspect_ai/_util/json.py +11 -1
- inspect_ai/_util/kvstore.py +73 -0
- inspect_ai/_util/logger.py +2 -1
- inspect_ai/_util/notgiven.py +18 -0
- inspect_ai/_util/thread.py +5 -0
- inspect_ai/_util/trace.py +39 -3
- inspect_ai/_util/transcript.py +36 -7
- inspect_ai/_view/www/.prettierrc.js +12 -0
- inspect_ai/_view/www/dist/assets/index.js +322 -226
- inspect_ai/_view/www/log-schema.json +221 -138
- inspect_ai/_view/www/src/App.mjs +18 -9
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/api/Types.mjs +15 -4
- inspect_ai/_view/www/src/api/api-http.mjs +2 -0
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
- inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
- inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
- inspect_ai/_view/www/src/components/MessageContent.mjs +44 -2
- inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +18 -3
- inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
- inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
- inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
- inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
- inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +242 -178
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
- inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
- inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
- inspect_ai/_view/www/src/types/log.d.ts +53 -35
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
- inspect_ai/approval/_human/util.py +2 -2
- inspect_ai/dataset/_sources/csv.py +2 -1
- inspect_ai/dataset/_sources/json.py +2 -1
- inspect_ai/dataset/_sources/util.py +15 -7
- inspect_ai/log/_condense.py +11 -1
- inspect_ai/log/_log.py +27 -5
- inspect_ai/log/_recorders/eval.py +21 -8
- inspect_ai/log/_samples.py +10 -5
- inspect_ai/log/_transcript.py +28 -1
- inspect_ai/model/__init__.py +10 -2
- inspect_ai/model/_call_tools.py +82 -17
- inspect_ai/model/_chat_message.py +2 -4
- inspect_ai/model/{_trace.py → _conversation.py} +9 -8
- inspect_ai/model/_model.py +2 -2
- inspect_ai/model/_providers/anthropic.py +9 -7
- inspect_ai/model/_providers/azureai.py +6 -4
- inspect_ai/model/_providers/bedrock.py +6 -4
- inspect_ai/model/_providers/google.py +103 -14
- inspect_ai/model/_providers/groq.py +7 -5
- inspect_ai/model/_providers/hf.py +11 -6
- inspect_ai/model/_providers/mistral.py +6 -9
- inspect_ai/model/_providers/openai.py +34 -8
- inspect_ai/model/_providers/openai_o1.py +10 -12
- inspect_ai/model/_providers/vertex.py +17 -4
- inspect_ai/scorer/__init__.py +13 -2
- inspect_ai/scorer/_metrics/__init__.py +2 -2
- inspect_ai/scorer/_metrics/std.py +3 -3
- inspect_ai/tool/__init__.py +9 -1
- inspect_ai/tool/_tool.py +9 -2
- inspect_ai/tool/_tool_info.py +2 -1
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +3 -3
- inspect_ai/util/__init__.py +4 -3
- inspect_ai/util/{_trace.py → _conversation.py} +3 -17
- inspect_ai/util/_display.py +14 -4
- inspect_ai/util/_sandbox/context.py +12 -13
- inspect_ai/util/_sandbox/docker/compose.py +24 -13
- inspect_ai/util/_sandbox/docker/docker.py +20 -13
- inspect_ai/util/_sandbox/docker/util.py +2 -1
- inspect_ai/util/_sandbox/environment.py +13 -1
- inspect_ai/util/_sandbox/local.py +1 -0
- inspect_ai/util/_sandbox/self_check.py +18 -18
- inspect_ai/util/_store.py +2 -2
- inspect_ai/util/_subprocess.py +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/RECORD +107 -103
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/WHEEL +1 -1
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.56.dist-info → inspect_ai-0.3.58.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/images.py
CHANGED
@@ -1,66 +1,69 @@
 import asyncio
 
 from inspect_ai._util.constants import BASE_64_DATA_REMOVED
-from inspect_ai._util.…
+from inspect_ai._util.content import Content, ContentAudio, ContentImage, ContentVideo
+from inspect_ai._util.images import file_as_data_uri
 from inspect_ai._util.url import is_data_uri
 from inspect_ai.dataset import Sample
-from inspect_ai.model import ChatMessage, ChatMessageUser…
+from inspect_ai.model import ChatMessage, ChatMessageUser
 from inspect_ai.solver import TaskState
 
 
-async def …
-    return await asyncio.gather(*[…
+async def states_with_base64_content(states: list[TaskState]) -> list[TaskState]:
+    return await asyncio.gather(*[state_with_base64_content(state) for state in states])
 
 
-async def …
-    state.messages = await …
+async def state_with_base64_content(state: TaskState) -> TaskState:
+    state.messages = await messages_with_base64_content(state.messages)
     return state
 
 
-def …
-    state.messages = …
+def state_without_base64_content(state: TaskState) -> TaskState:
+    state.messages = messages_without_base64_content(state.messages)
     return state
 
 
-async def …
+async def samples_with_base64_content(samples: list[Sample]) -> list[Sample]:
     return await asyncio.gather(
-        *[…
+        *[sample_with_base64_content(sample) for sample in samples]
     )
 
 
-async def …
+async def sample_with_base64_content(sample: Sample) -> Sample:
     if isinstance(sample.input, list):
         return sample.model_copy(
-            update={"input": await …
+            update={"input": await messages_with_base64_content(sample.input)}
         )
     else:
         return sample
 
 
-def …
+def sample_without_base64_content(sample: Sample) -> Sample:
     if isinstance(sample.input, list):
         return sample.model_copy(
-            update={"input": …
+            update={"input": messages_without_base64_content(sample.input)}
        )
     else:
         return sample
 
 
-async def …
+async def messages_with_base64_content(
+    messages: list[ChatMessage],
+) -> list[ChatMessage]:
     return await asyncio.gather(
-        *[…
+        *[message_with_base64_content(message) for message in messages]
     )
 
 
-def …
-    return […
+def messages_without_base64_content(messages: list[ChatMessage]) -> list[ChatMessage]:
+    return [message_without_base64_content(message) for message in messages]
 
 
-async def …
+async def message_with_base64_content(message: ChatMessage) -> ChatMessage:
     if isinstance(message, ChatMessageUser) and not isinstance(message.content, str):
         return ChatMessageUser(
             content=[
-                await …
+                await chat_content_with_base64_content(content)
                 for content in message.content
             ],
             source=message.source,
@@ -69,11 +72,11 @@ async def message_with_base64_image(message: ChatMessage) -> ChatMessage:
         return message
 
 
-def …
+def message_without_base64_content(message: ChatMessage) -> ChatMessage:
     if isinstance(message, ChatMessageUser) and not isinstance(message.content, str):
         return ChatMessageUser(
             content=[
-                …
+                chat_content_without_base64_content(content)
                 for content in message.content
             ],
             source=message.source,
@@ -82,18 +85,30 @@ def message_without_base64_image(message: ChatMessage) -> ChatMessage:
         return message
 
 
-async def …
+async def chat_content_with_base64_content(content: Content) -> Content:
     if isinstance(content, ContentImage):
         return ContentImage(
-            image=await …
+            image=await file_as_data_uri(content.image),
             detail=content.detail,
         )
+    elif isinstance(content, ContentAudio):
+        return ContentAudio(
+            audio=await file_as_data_uri(content.audio), format=content.format
+        )
+    elif isinstance(content, ContentVideo):
+        return ContentVideo(
+            video=await file_as_data_uri(content.video), format=content.format
+        )
     else:
         return content
 
 
-def …
+def chat_content_without_base64_content(content: Content) -> Content:
     if isinstance(content, ContentImage) and is_data_uri(content.image):
         return ContentImage(image=BASE_64_DATA_REMOVED, detail=content.detail)
+    elif isinstance(content, ContentAudio) and is_data_uri(content.audio):
+        return ContentAudio(audio=BASE_64_DATA_REMOVED, format="mp3")
+    elif isinstance(content, ContentVideo) and is_data_uri(content.video):
+        return ContentVideo(video=BASE_64_DATA_REMOVED, format="mp4")
     else:
         return content
inspect_ai/_eval/task/run.py
CHANGED
@@ -4,6 +4,7 @@ import sys
 import time
 from copy import deepcopy
 from dataclasses import dataclass, field
+from datetime import datetime
 from logging import getLogger
 from pathlib import PurePath
 from typing import Callable, Literal
@@ -71,6 +72,7 @@ from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
+from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
 
@@ -79,10 +81,10 @@ from ..task import Task
 from .error import SampleErrorHandler
 from .generate import task_generate
 from .images import (
-    …
+    sample_without_base64_content,
+    samples_with_base64_content,
+    state_without_base64_content,
+    states_with_base64_content,
 )
 from .log import TaskLogger, collect_eval_data, log_start
 from .results import eval_results
@@ -533,11 +535,6 @@ async def task_run_sample(
         else contextlib.nullcontext()
     )
 
-    # use timeout if provided
-    timeout_cm = (
-        timeout(time_limit) if time_limit is not None else contextlib.nullcontext()
-    )
-
     # helper to handle exceptions (will throw if we've exceeded the limit)
     def handle_error(ex: BaseException) -> EvalError:
         err = sample_error(ex)
@@ -547,7 +544,6 @@ async def task_run_sample(
     # solver loop
     async with (
         semaphore_cm,
-        sandboxenv_cm,
         active_sample(
             task=task_name,
             model=str(state.model),
@@ -561,125 +557,151 @@ async def task_run_sample(
         ) as active,
     ):
         error: EvalError | None = None
+        results: dict[str, SampleScore] = {}
         try:
-            async with …
+            async with sandboxenv_cm:
+                try:
+                    # update active sample with sandboxes now that we are initialised
+                    active.sandboxes = await sandbox_connections()
+
+                    # initialise timeout context manager
+                    timeout_cm = (
+                        timeout(time_limit)
+                        if time_limit is not None
+                        else contextlib.nullcontext()
+                    )
 
-…
+                    # run sample w/ optional timeout
+                    async with timeout_cm:
+                        # mark started
+                        active.started = datetime.now().timestamp()
 
-…
-                    "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
-                )
+                        # sample init event (remove file bodies as they have content or absolute paths)
+                        event_sample = sample.model_copy(
+                            update=dict(files={k: "" for k in sample.files.keys()})
+                            if sample.files
+                            else None
+                        )
+                        transcript()._event(
+                            SampleInitEvent(
+                                sample=event_sample, state=state_jsonable(state)
+                            )
+                        )
 
-…
+                        # set progress for plan then run it
+                        state = await plan(state, generate)
 
-…
+                except TimeoutError:
+                    if time_limit is not None:
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="time",
+                                message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
+                                limit=time_limit,
+                            )
+                        )
+                    else:
+                        py_logger.warning(
+                            "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
+                        )
 
-…
-                    case "score":
-                        # continue to scoring (capture the most recent state)
-                        state = sample_state() or state
-                    case "error":
-                        # default error handling
-                        error = handle_error(ex)
+                    # capture most recent state for scoring
+                    state = sample_state() or state
 
-…
+                except asyncio.CancelledError as ex:
+                    if active.interrupt_action:
+                        # record event
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="operator",
+                                message="Sample completed: interrupted by operator",
+                            )
+                        )
 
-…
+                        # handle the action
+                        match active.interrupt_action:
+                            case "score":
+                                # continue to scoring (capture the most recent state)
+                                state = sample_state() or state
+                            case "error":
+                                # default error handling
+                                error = handle_error(ex)
+
+                    else:
+                        raise
+
+                except BaseException as ex:
+                    error = handle_error(ex)
+
+                # set timeout for scoring. if the original timeout was never hit
+                # then just create a new timeout_cm targeting the original
+                # timeout time. if the original timeout was hit we still want
+                # to provide an opportunity for scoring, but we don't necessarily
+                # want to wait the full timeout again (especially in the case where
+                # the cause of the timeout is a hung container and scoring requires
+                # interacting with the container). as a middle ground we use half
+                # of the original timeout value for scoring.
+                if isinstance(timeout_cm, Timeout):
+                    if not timeout_cm.expired():
+                        timeout_cm = timeout_at(timeout_cm.when())
+                    else:
+                        assert time_limit
+                        timeout_cm = timeout(time_limit / 2)
+
+                # scoring
+                try:
+                    # timeout during scoring will result in an ordinary sample error
+                    async with timeout_cm:
+                        if scorers and error is None:
+                            for scorer in scorers:
+                                scorer_name = unique_scorer_name(
+                                    scorer, list(results.keys())
+                                )
+                                with transcript().step(name=scorer_name, type="scorer"):
+                                    score_result = (
+                                        await scorer(state, Target(sample.target))
+                                        if scorer
+                                        else None
+                                    )
+                                    if score_result is not None:
+                                        sample_score = SampleScore(
+                                            score=score_result,
+                                            sample_id=sample.id,
+                                        )
+                                        transcript()._event(
+                                            ScoreEvent(
+                                                score=score_result, target=sample.target
+                                            )
+                                        )
+                                        results[scorer_name] = sample_score
+
+                except asyncio.CancelledError:
+                    if active.interrupt_action:
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="operator",
+                                message="Unable to score sample due to operator interruption",
+                            )
+                        )
 
-…
-        # then just create a new timeout_cm targeting the original
-        # timeout time. if the original timeout was hit we still want
-        # to provide an opportunity for scoring, but we don't necessarily
-        # want to wait the full timeout again (especially in the case where
-        # the cause of the timeout is a hung container and scoring requires
-        # interacting with the container). as a middle ground we use half
-        # of the original timeout value for scoring.
-        if isinstance(timeout_cm, Timeout):
-            if not timeout_cm.expired():
-                timeout_cm = timeout_at(timeout_cm.when())
-            else:
-                assert time_limit
-                timeout_cm = timeout(time_limit / 2)
+                    raise
 
-…
-                        with transcript().step(name=scorer_name, type="scorer"):
-                            score_result = (
-                                await scorer(state, Target(sample.target))
-                                if scorer
-                                else None
-                            )
-
-                            sample_score = SampleScore(
-                                score=score_result,
-                                sample_id=sample.id,
-                            )
-                            transcript()._event(
-                                ScoreEvent(score=score_result, target=sample.target)
-                            )
-                            results[scorer_name] = sample_score
-
-        except asyncio.CancelledError:
-            if active.interrupt_action:
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="operator",
-                        message="Unable to score sample due to operator interruption",
-                    )
-                )
-…
+                except BaseException as ex:
+                    # note timeout
+                    if isinstance(ex, TimeoutError):
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="time",
+                                message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
+                                limit=time_limit,
+                            )
+                        )
+
+                    # handle error (this will throw if we've exceeded the limit)
+                    error = handle_error(ex)
 
+        # handle sandboxenv init errors
         except BaseException as ex:
-            # note timeout
-            if isinstance(ex, TimeoutError):
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="time",
-                        message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
-                        limit=time_limit,
-                    )
-                )
-
-            # handle error (this will throw if we've exceeded the limit)
             error = handle_error(ex)
 
         # complete the sample
@@ -689,12 +711,12 @@ async def task_run_sample(
     if logger is not None:
         # if we are logging images then be sure to base64 images injected by solvers
        if log_images:
-            state = (await …
+            state = (await states_with_base64_content([state]))[0]
 
         # otherwise ensure there are no base64 images in sample or messages
         else:
-            sample = …
-            state = …
+            sample = sample_without_base64_content(sample)
+            state = state_without_base64_content(state)
 
         # log the sample
         await log_sample(
@@ -784,7 +806,7 @@ async def resolve_dataset(
 
     # if we are logging images then resolve sample images here
     if log_images:
-        samples = await …
+        samples = await samples_with_base64_content(samples)
 
     # prime the eval tasks (deep copy so they share no state w/ sample)
     sample_epochs: list[int] = []
inspect_ai/_eval/task/task.py
CHANGED
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from logging import getLogger
 from typing import Any, Callable, Sequence, cast
@@ -6,6 +7,7 @@ from pydantic import BaseModel
 from typing_extensions import TypedDict, Unpack
 
 from inspect_ai._util.logger import warn_once
+from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
 from inspect_ai._util.registry import is_registry_object, registry_info
 from inspect_ai.approval._policy import ApprovalPolicy, approval_policies_from_config
 from inspect_ai.dataset import Dataset, MemoryDataset, Sample
@@ -115,35 +117,15 @@ class Task:
                 f"DEPRECATED: the '{arg}' parameter is deprecated (please use the '{newarg}' parameter instead)",
             )
 
-…
-        if isinstance(epochs, int):
-            epochs = Epochs(epochs)
-        if epochs is not None and epochs.epochs < 1:
-            raise ValueError("epochs must be a positive integer.")
-
-        # resolve dataset (provide empty sample to bootstrap tasks w/o samples,
-        # which could occur for testing or for an interactive mode eval)
-        dataset = dataset or [Sample(input="prompt")]
-        self.dataset: Dataset = (
-            dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
-        )
+        self.dataset = resolve_dataset(dataset)
         self.setup = setup
-        self.solver = …
-        self.scorer = (
-            scorer
-            if isinstance(scorer, list)
-            else [scorer]
-            if scorer is not None
-            else None
-        )
+        self.solver = resolve_solver(solver)
+        self.scorer = resolve_scorer(scorer)
         self.metrics = metrics
         self.config = config
         self.sandbox = resolve_sandbox_environment(sandbox)
-        self.approval = (
-            …
-            if isinstance(approval, str)
-            else approval
-        )
+        self.approval = resolve_approval(approval)
+        epochs = resolve_epochs(epochs)
         self.epochs = epochs.epochs if epochs else None
         self.epochs_reducer = epochs.reducer if epochs else None
         self.fail_on_error = fail_on_error
@@ -171,6 +153,106 @@ class Task:
         return dict()
 
 
+def task_with(
+    task: Task,
+    *,
+    dataset: Dataset | Sequence[Sample] | None | NotGiven = NOT_GIVEN,
+    setup: Solver | list[Solver] | None | NotGiven = NOT_GIVEN,
+    solver: Solver | list[Solver] | NotGiven = NOT_GIVEN,
+    scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
+    metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
+    config: GenerateConfig | NotGiven = NOT_GIVEN,
+    sandbox: SandboxEnvironmentType | None | NotGiven = NOT_GIVEN,
+    approval: str | list[ApprovalPolicy] | None | NotGiven = NOT_GIVEN,
+    epochs: int | Epochs | None | NotGiven = NOT_GIVEN,
+    fail_on_error: bool | float | None | NotGiven = NOT_GIVEN,
+    message_limit: int | None | NotGiven = NOT_GIVEN,
+    token_limit: int | None | NotGiven = NOT_GIVEN,
+    time_limit: int | None | NotGiven = NOT_GIVEN,
+    name: str | None | NotGiven = NOT_GIVEN,
+    version: int | NotGiven = NOT_GIVEN,
+    metadata: dict[str, Any] | None | NotGiven = NOT_GIVEN,
+) -> Task:
+    """Task adapted with alternate values for one or more options.
+
+    Args:
+        task (Task): Task to adapt (it is deep copied prior to mutating options).
+        dataset (Dataset | Sequence[Sample]): Dataset to evaluate.
+        setup (Solver | list[Solver] | None): Setup step (always run
+            even when the main `solver` is replaced).
+        solver (Solver | list[Solver]): Solver or list of solvers.
+            Defaults to generate(), a normal call to the model.
+        scorer (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
+        metrics (list[Metric] | dict[str, list[Metric]] | None):
+            Alternative metrics (overrides the metrics provided by the specified scorer).
+        config (GenerateConfig): Model generation config.
+        sandbox (SandboxEnvironmentType | None): Sandbox environment type
+            (or optionally a str or tuple with a shorthand spec).
+        approval (str | list[ApprovalPolicy] | None): Tool use approval policies.
+            Either a path to an approval policy config file or a list of approval policies.
+            Defaults to no approval policy.
+        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+            reducer function(s) used to combine sample scores (defaults to "mean").
+        fail_on_error (bool | float | None): `True` to fail on the first sample error
+            (default); `False` to never fail on sample errors; a value between 0 and 1
+            to fail if that proportion of total samples fails; a value greater than 1
+            to fail if that count of samples fails.
+        message_limit (int | None): Limit on total messages used for each sample.
+        token_limit (int | None): Limit on total tokens used for each sample.
+        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+        name (str | None): Task name. If not specified it is automatically
+            determined based on the name of the task directory (or "task")
+            if it's an anonymous task (e.g. created in a notebook and passed to
+            eval() directly).
+        version (int): Version of task (to distinguish evolutions
+            of the task spec or breaking changes to it).
+        metadata (dict[str, Any] | None): Additional metadata to associate with the task.
+
+    Returns:
+        Task: Task adapted with alternate options.
+    """
+    # deep copy the task
+    task = deepcopy(task)
+
+    if not isinstance(dataset, NotGiven):
+        task.dataset = resolve_dataset(dataset)
+    if not isinstance(setup, NotGiven):
+        task.setup = setup
+    if not isinstance(solver, NotGiven):
+        task.solver = resolve_solver(solver)
+    if not isinstance(scorer, NotGiven):
+        task.scorer = resolve_scorer(scorer)
+    if not isinstance(metrics, NotGiven):
+        task.metrics = metrics
+    if not isinstance(config, NotGiven):
+        task.config = config
+    if not isinstance(sandbox, NotGiven):
+        task.sandbox = resolve_sandbox_environment(sandbox)
+    if not isinstance(approval, NotGiven):
+        task.approval = resolve_approval(approval)
+    if not isinstance(epochs, NotGiven):
+        epochs = resolve_epochs(epochs)
+        task.epochs = epochs.epochs if epochs else None
+        task.epochs_reducer = epochs.reducer if epochs else None
+    if not isinstance(fail_on_error, NotGiven):
+        task.fail_on_error = fail_on_error
+    if not isinstance(message_limit, NotGiven):
+        task.message_limit = message_limit
+    if not isinstance(token_limit, NotGiven):
+        task.token_limit = token_limit
+    if not isinstance(time_limit, NotGiven):
+        task.time_limit = time_limit
+    if not isinstance(version, NotGiven):
+        task.version = version
+    if not isinstance(name, NotGiven):
+        task._name = name
+    if not isinstance(metadata, NotGiven):
+        task.metadata = metadata
+
+    # return modified task
+    return task
+
+
 class TaskInfo(BaseModel):
     """Task information (file, name, and attributes)."""
 
@@ -225,3 +307,36 @@ classes, and task instances (a single task or list of tasks
 can be specified). None is a request to read a task out
 of the current working directory.
 """
+
+
+def resolve_approval(
+    approval: str | list[ApprovalPolicy] | None,
+) -> list[ApprovalPolicy] | None:
+    return (
+        approval_policies_from_config(approval)
+        if isinstance(approval, str)
+        else approval
+    )
+
+
+def resolve_epochs(epochs: int | Epochs | None) -> Epochs | None:
+    if isinstance(epochs, int):
+        epochs = Epochs(epochs)
+    if epochs is not None and epochs.epochs < 1:
+        raise ValueError("epochs must be a positive integer.")
+    return epochs
+
+
+def resolve_dataset(dataset: Dataset | Sequence[Sample] | None) -> Dataset:
+    dataset = dataset or [Sample(input="prompt")]
+    return dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
+
+
+def resolve_solver(solver: Solver | list[Solver]) -> Solver:
+    return chain(solver) if isinstance(solver, list) else solver
+
+
+def resolve_scorer(scorer: Scorer | list[Scorer] | None) -> list[Scorer] | None:
+    return (
+        scorer if isinstance(scorer, list) else [scorer] if scorer is not None else None
    )
inspect_ai/_util/constants.py
CHANGED