inspect-ai 0.3.57__py3-none-any.whl → 0.3.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +2 -1
- inspect_ai/_cli/common.py +7 -3
- inspect_ai/_cli/eval.py +17 -2
- inspect_ai/_cli/trace.py +21 -2
- inspect_ai/_display/core/active.py +4 -3
- inspect_ai/_display/core/config.py +3 -3
- inspect_ai/_display/core/panel.py +7 -3
- inspect_ai/_display/plain/__init__.py +0 -0
- inspect_ai/_display/plain/display.py +203 -0
- inspect_ai/_display/rich/display.py +4 -9
- inspect_ai/_display/textual/app.py +4 -1
- inspect_ai/_display/textual/widgets/port_mappings.py +110 -0
- inspect_ai/_display/textual/widgets/samples.py +119 -16
- inspect_ai/_display/textual/widgets/sandbox.py +37 -0
- inspect_ai/_eval/eval.py +32 -20
- inspect_ai/_eval/evalset.py +7 -5
- inspect_ai/_eval/score.py +1 -0
- inspect_ai/_eval/task/__init__.py +2 -2
- inspect_ai/_eval/task/images.py +40 -25
- inspect_ai/_eval/task/results.py +50 -22
- inspect_ai/_eval/task/run.py +180 -124
- inspect_ai/_eval/task/sandbox.py +10 -5
- inspect_ai/_eval/task/task.py +140 -25
- inspect_ai/_util/constants.py +2 -0
- inspect_ai/_util/content.py +23 -1
- inspect_ai/_util/images.py +20 -17
- inspect_ai/_util/kvstore.py +73 -0
- inspect_ai/_util/notgiven.py +18 -0
- inspect_ai/_util/port_names.py +61 -0
- inspect_ai/_util/text.py +23 -0
- inspect_ai/_util/thread.py +5 -0
- inspect_ai/_view/www/App.css +31 -1
- inspect_ai/_view/www/dist/assets/index.css +31 -1
- inspect_ai/_view/www/dist/assets/index.js +25375 -1846
- inspect_ai/_view/www/log-schema.json +129 -15
- inspect_ai/_view/www/package.json +2 -0
- inspect_ai/_view/www/src/App.mjs +8 -10
- inspect_ai/_view/www/src/Types.mjs +0 -1
- inspect_ai/_view/www/src/components/ChatView.mjs +133 -43
- inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -4
- inspect_ai/_view/www/src/components/LargeModal.mjs +19 -20
- inspect_ai/_view/www/src/components/MessageBand.mjs +2 -2
- inspect_ai/_view/www/src/components/MessageContent.mjs +43 -1
- inspect_ai/_view/www/src/components/TabSet.mjs +3 -1
- inspect_ai/_view/www/src/components/VirtualList.mjs +266 -84
- inspect_ai/_view/www/src/index.js +75 -2
- inspect_ai/_view/www/src/navbar/Navbar.mjs +3 -0
- inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +18 -9
- inspect_ai/_view/www/src/samples/SampleDialog.mjs +5 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +23 -15
- inspect_ai/_view/www/src/samples/SampleList.mjs +18 -48
- inspect_ai/_view/www/src/samples/SampleTranscript.mjs +8 -3
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +29 -13
- inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -1
- inspect_ai/_view/www/src/samples/SamplesTools.mjs +8 -8
- inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +712 -89
- inspect_ai/_view/www/src/samples/tools/filters.mjs +260 -87
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +29 -24
- inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +1 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +31 -10
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +23 -2
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +24 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +33 -3
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +25 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +193 -11
- inspect_ai/_view/www/src/samples/transcript/Types.mjs +10 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +26 -2
- inspect_ai/_view/www/src/types/log.d.ts +62 -27
- inspect_ai/_view/www/src/utils/Format.mjs +10 -3
- inspect_ai/_view/www/src/utils/Json.mjs +12 -6
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +10 -4
- inspect_ai/_view/www/vite.config.js +7 -0
- inspect_ai/_view/www/yarn.lock +116 -0
- inspect_ai/approval/_human/__init__.py +0 -0
- inspect_ai/approval/_human/util.py +2 -2
- inspect_ai/approval/_policy.py +12 -6
- inspect_ai/dataset/_sources/csv.py +2 -1
- inspect_ai/dataset/_sources/json.py +2 -1
- inspect_ai/dataset/_sources/util.py +15 -7
- inspect_ai/log/_condense.py +11 -1
- inspect_ai/log/_log.py +3 -6
- inspect_ai/log/_recorders/eval.py +19 -8
- inspect_ai/log/_samples.py +26 -5
- inspect_ai/log/_transcript.py +32 -2
- inspect_ai/model/__init__.py +10 -2
- inspect_ai/model/_call_tools.py +59 -12
- inspect_ai/model/_chat_message.py +2 -4
- inspect_ai/model/_conversation.py +61 -0
- inspect_ai/model/_generate_config.py +10 -4
- inspect_ai/model/_model.py +117 -18
- inspect_ai/model/_model_output.py +7 -2
- inspect_ai/model/_providers/anthropic.py +109 -51
- inspect_ai/model/_providers/azureai.py +26 -24
- inspect_ai/model/_providers/bedrock.py +43 -44
- inspect_ai/model/_providers/google.py +121 -58
- inspect_ai/model/_providers/groq.py +7 -5
- inspect_ai/model/_providers/hf.py +11 -6
- inspect_ai/model/_providers/mistral.py +17 -20
- inspect_ai/model/_providers/openai.py +32 -21
- inspect_ai/model/_providers/openai_o1.py +9 -8
- inspect_ai/model/_providers/providers.py +1 -1
- inspect_ai/model/_providers/together.py +8 -8
- inspect_ai/model/_providers/vertex.py +18 -8
- inspect_ai/scorer/__init__.py +13 -2
- inspect_ai/scorer/_metrics/__init__.py +2 -2
- inspect_ai/scorer/_metrics/std.py +3 -3
- inspect_ai/scorer/_reducer/reducer.py +1 -1
- inspect_ai/scorer/_scorer.py +2 -2
- inspect_ai/solver/__init__.py +2 -5
- inspect_ai/solver/_prompt.py +35 -5
- inspect_ai/solver/_task_state.py +80 -38
- inspect_ai/tool/__init__.py +11 -1
- inspect_ai/tool/_tool.py +21 -3
- inspect_ai/tool/_tool_call.py +10 -0
- inspect_ai/tool/_tool_def.py +16 -5
- inspect_ai/tool/_tool_with.py +21 -4
- inspect_ai/tool/beta/__init__.py +5 -0
- inspect_ai/tool/beta/_computer/__init__.py +3 -0
- inspect_ai/tool/beta/_computer/_common.py +133 -0
- inspect_ai/tool/beta/_computer/_computer.py +155 -0
- inspect_ai/tool/beta/_computer/_computer_split.py +198 -0
- inspect_ai/tool/beta/_computer/_resources/Dockerfile +100 -0
- inspect_ai/tool/beta/_computer/_resources/README.md +30 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/entrypoint.sh +18 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/novnc_startup.sh +20 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/x11vnc_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xfce_startup.sh +13 -0
- inspect_ai/tool/beta/_computer/_resources/entrypoint/xvfb_startup.sh +48 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/image_home_dir/Desktop/XPaint.desktop +10 -0
- inspect_ai/tool/beta/_computer/_resources/tool/__init__.py +0 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_logger.py +22 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_run.py +42 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_tool_result.py +33 -0
- inspect_ai/tool/beta/_computer/_resources/tool/_x11_client.py +262 -0
- inspect_ai/tool/beta/_computer/_resources/tool/computer_tool.py +85 -0
- inspect_ai/tool/beta/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/util/__init__.py +2 -3
- inspect_ai/util/{_trace.py → _conversation.py} +3 -17
- inspect_ai/util/_display.py +14 -4
- inspect_ai/util/_limit.py +26 -0
- inspect_ai/util/_sandbox/context.py +12 -13
- inspect_ai/util/_sandbox/docker/compose.py +24 -11
- inspect_ai/util/_sandbox/docker/docker.py +84 -14
- inspect_ai/util/_sandbox/docker/internal.py +3 -1
- inspect_ai/util/_sandbox/environment.py +27 -1
- inspect_ai/util/_sandbox/local.py +1 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/RECORD +159 -128
- inspect_ai/_view/www/src/samples/transcript/TranscriptState.mjs +0 -70
- inspect_ai/model/_trace.py +0 -48
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.57.dist-info → inspect_ai-0.3.59.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/task/run.py
CHANGED
(Removed lines whose text the diff viewer did not preserve are omitted from the reconstruction below; added lines are complete.)

```diff
@@ -4,6 +4,7 @@ import sys
 import time
 from copy import deepcopy
 from dataclasses import dataclass, field
+from datetime import datetime
 from logging import getLogger
 from pathlib import PurePath
 from typing import Callable, Literal
@@ -26,10 +27,7 @@ from inspect_ai._util.constants import (
 from inspect_ai._util.datetime import iso_now
 from inspect_ai._util.error import exception_message
 from inspect_ai._util.hooks import send_telemetry
-from inspect_ai._util.registry import (
-    is_registry_object,
-    registry_log_name,
-)
+from inspect_ai._util.registry import is_registry_object, registry_log_name
 from inspect_ai._util.timeouts import Timeout, timeout, timeout_at
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
@@ -44,7 +42,11 @@ from inspect_ai.log import (
 from inspect_ai.log._condense import condense_sample
 from inspect_ai.log._file import eval_log_json_str
 from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
-from inspect_ai.log._samples import active_sample
+from inspect_ai.log._samples import (
+    active_sample,
+    set_active_sample_message_limit,
+    set_active_sample_token_limit,
+)
 from inspect_ai.log._transcript import (
     ErrorEvent,
     SampleInitEvent,
@@ -71,6 +73,8 @@ from inspect_ai.solver._chain import Chain, unroll
 from inspect_ai.solver._fork import set_task_generate
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
+from inspect_ai.util._limit import SampleLimitExceededError
+from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._subtask import init_subtask
 
@@ -79,10 +83,10 @@ from ..task import Task
 from .error import SampleErrorHandler
 from .generate import task_generate
 from .images import (
-    sample_without_base64_images,
-    samples_with_base64_images,
-    state_without_base64_images,
-    states_with_base64_images,
+    sample_without_base64_content,
+    samples_with_base64_content,
+    state_without_base64_content,
+    states_with_base64_content,
 )
 from .log import TaskLogger, collect_eval_data, log_start
 from .results import eval_results
@@ -533,21 +537,18 @@ async def task_run_sample(
         else contextlib.nullcontext()
     )
 
-    # use timeout if provided
-    timeout_cm = (
-        timeout(time_limit) if time_limit is not None else contextlib.nullcontext()
-    )
-
     # helper to handle exceptions (will throw if we've exceeded the limit)
     def handle_error(ex: BaseException) -> EvalError:
         err = sample_error(ex)
+        py_logger.warning(
+            f"Sample error (id: {sample.id}, epoch: {state.epoch}): {exception_message(ex)})"
+        )
         transcript()._event(ErrorEvent(error=err))
         return err
 
     # solver loop
     async with (
         semaphore_cm,
-        sandboxenv_cm,
         active_sample(
             task=task_name,
             model=str(state.model),
@@ -561,125 +562,179 @@ async def task_run_sample(
         ) as active,
     ):
         error: EvalError | None = None
+        results: dict[str, SampleScore] = {}
         try:
-            async with timeout_cm:
+            async with sandboxenv_cm:
+                try:
+                    # update active sample wth sandboxes now that we are initialised
+                    active.sandboxes = await sandbox_connections()
+
+                    # initialise timeout context manager
+                    timeout_cm = (
+                        timeout(time_limit)
+                        if time_limit is not None
+                        else contextlib.nullcontext()
+                    )
 
+                    # run sample w/ optional timeout
+                    async with timeout_cm:
+                        # mark started
+                        active.started = datetime.now().timestamp()
 
-                        "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
-                    )
+                        # sample init event (remove file bodies as they have content or absolute paths)
+                        event_sample = sample.model_copy(
+                            update=dict(files={k: "" for k in sample.files.keys()})
+                            if sample.files
+                            else None
+                        )
+                        transcript()._event(
+                            SampleInitEvent(
+                                sample=event_sample, state=state_jsonable(state)
+                            )
+                        )
 
+                        # set progress for plan then run it
+                        state = await plan(state, generate)
 
+                except TimeoutError:
+                    if time_limit is not None:
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="time",
+                                message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
+                                limit=time_limit,
+                            )
+                        )
+                    else:
+                        py_logger.warning(
+                            "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
+                        )
 
-                    case "score":
-                        # continue to scoring (capture the most recent state)
-                        state = sample_state() or state
-                    case "error":
-                        # default error handling
-                        error = handle_error(ex)
+                    # capture most recent state for scoring
+                    state = sample_state() or state
 
+                except asyncio.CancelledError as ex:
+                    if active.interrupt_action:
+                        # record event
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="operator",
+                                message="Sample completed: interrupted by operator",
+                            )
+                        )
 
+                        # handle the action
+                        match active.interrupt_action:
+                            case "score":
+                                # continue to scoring (capture the most recent state)
+                                state = sample_state() or state
+                            case "error":
+                                # default error handling
+                                error = handle_error(ex)
+
+                    else:
+                        raise
+
+                except SampleLimitExceededError as ex:
+                    # sample limit event
+                    transcript()._event(
+                        SampleLimitEvent(
+                            type=ex.type,
+                            limit=ex.limit,
+                            message=f"Sample completed: {ex.message}",
+                        )
+                    )
 
+                    # capture most recent state for scoring
+                    state = sample_state() or state
+                    state.completed = True
+
+                except BaseException as ex:
+                    error = handle_error(ex)
+
+                # set timeout for scoring. if the original timeout was never hit
+                # then just create a new timeout_cm targeting the original
+                # timeout time. if the original timeout was hit we still want
+                # to provide an opportunity for scoring, but we don't necessarily
+                # want to wait the full timeout again (especially in the case where
+                # the cause of the timeout is a hung container and scoring requires
+                # interacting with the container). as a middle ground we use half
+                # of the original timeout value for scoring.
+                if isinstance(timeout_cm, Timeout):
+                    if not timeout_cm.expired():
+                        timeout_cm = timeout_at(timeout_cm.when())
+                    else:
+                        assert time_limit
+                        timeout_cm = timeout(time_limit / 2)
+
+                # turn off sample limits
+                set_active_sample_token_limit(None)
+                set_active_sample_message_limit(None)
+
+                # scoring
+                try:
+                    # timeout during scoring will result in an ordinary sample error
+                    async with timeout_cm:
+                        if error is None:
+                            for scorer in scorers or []:
+                                scorer_name = unique_scorer_name(
+                                    scorer, list(results.keys())
+                                )
+                                with transcript().step(name=scorer_name, type="scorer"):
+                                    score_result = (
+                                        await scorer(state, Target(sample.target))
+                                        if scorer
+                                        else None
+                                    )
+                                    if score_result is not None:
+                                        sample_score = SampleScore(
+                                            score=score_result,
+                                            sample_id=sample.id,
+                                        )
+                                        transcript()._event(
+                                            ScoreEvent(
+                                                score=score_result, target=sample.target
+                                            )
+                                        )
+                                        results[scorer_name] = sample_score
+
+                            # add scores returned by solvers
+                            if state.scores is not None:
+                                for name, score in state.scores.items():
+                                    results[name] = SampleScore(
+                                        score=score, sample_id=state.sample_id
+                                    )
+
+                            # propagate results into scores
+                            state.scores = {k: v.score for k, v in results.items()}
+
+                except asyncio.CancelledError:
+                    if active.interrupt_action:
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="operator",
+                                message="Unable to score sample due to operator interruption",
+                            )
+                        )
 
-                        await scorer(state, Target(sample.target))
-                        if scorer
-                        else None
+                    raise
+
+                except BaseException as ex:
+                    # note timeout
+                    if isinstance(ex, TimeoutError):
+                        transcript()._event(
+                            SampleLimitEvent(
+                                type="time",
+                                message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
+                                limit=time_limit,
                             )
-
-                    sample_score = SampleScore(
-                        score=score_result,
-                        sample_id=sample.id,
-                    )
-                    transcript()._event(
-                        ScoreEvent(score=score_result, target=sample.target)
-                    )
-                    results[scorer_name] = sample_score
-
-        except asyncio.CancelledError:
-            if active.interrupt_action:
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="operator",
-                        message="Unable to score sample due to operator interruption",
-                    )
-                )
+                        )
 
+                    # handle error (this will throw if we've exceeded the limit)
+                    error = handle_error(ex)
 
+            # handle sandboxenv init errors
         except BaseException as ex:
-            # note timeout
-            if isinstance(ex, TimeoutError):
-                transcript()._event(
-                    SampleLimitEvent(
-                        type="time",
-                        message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
-                        limit=time_limit,
-                    )
-                )
-
-            # handle error (this will throw if we've exceeded the limit)
             error = handle_error(ex)
 
     # complete the sample
@@ -689,12 +744,12 @@ async def task_run_sample(
     if logger is not None:
         # if we are logging images then be sure to base64 images injected by solvers
        if log_images:
-            state = (await states_with_base64_images([state]))[0]
+            state = (await states_with_base64_content([state]))[0]
 
        # otherwise ensure there are no base64 images in sample or messages
        else:
-            sample = sample_without_base64_images(sample)
-            state = state_without_base64_images(state)
+            sample = sample_without_base64_content(sample)
+            state = state_without_base64_content(state)
 
        # log the sample
        await log_sample(
@@ -784,7 +839,7 @@ async def resolve_dataset(
 
     # if we are logging images then resolve sample images here
     if log_images:
-        samples = await samples_with_base64_images(samples)
+        samples = await samples_with_base64_content(samples)
 
     # prime the eval tasks (deep copy so they share no state w/ sample)
     sample_epochs: list[int] = []
@@ -797,6 +852,7 @@ async def resolve_dataset(
             epoch=epoch,
             model=model_name,
             input=sample.input,
+            target=Target(sample.target),
             choices=sample.choices,
             messages=sample_messages(sample),
             message_limit=message_limit,
```
inspect_ai/_eval/task/sandbox.py
CHANGED
```diff
@@ -4,11 +4,13 @@ import contextlib
 from random import random
 from typing import AsyncGenerator, Callable, NamedTuple, cast
 
+import httpx
+
 from inspect_ai._eval.task.task import Task
 from inspect_ai._eval.task.util import task_run_dir
 from inspect_ai._util.file import file, filesystem
 from inspect_ai._util.registry import registry_unqualified_name
-from inspect_ai._util.url import data_uri_to_base64, is_data_uri
+from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url
 from inspect_ai.dataset import Sample
 from inspect_ai.util._concurrency import concurrency
 from inspect_ai.util._sandbox.context import (
@@ -65,12 +67,12 @@ async def sandboxenv_context(
     files: dict[str, bytes] = {}
     if sample.files:
         for path, contents in sample.files.items():
-            files[path] = read_sandboxenv_file(contents)
+            files[path] = await read_sandboxenv_file(contents)
 
     # read setup script from sample (add bash shebang if necessary)
     setup: bytes | None = None
     if sample.setup:
-        setup = read_sandboxenv_file(sample.setup)
+        setup = await read_sandboxenv_file(sample.setup)
         setup_str = setup.decode(encoding="utf-8")
         if not setup_str.strip().startswith("#!"):
             setup_str = f"#!/usr/bin/env bash\n\n{setup_str}"
@@ -108,13 +110,16 @@ async def sandboxenv_context(
     )
 
 
-def read_sandboxenv_file(contents: str) -> bytes:
+async def read_sandboxenv_file(contents: str) -> bytes:
     if is_data_uri(contents):
         contents_base64 = data_uri_to_base64(contents)
         file_bytes = base64.b64decode(contents_base64)
+    elif is_http_url(contents):
+        client = httpx.AsyncClient()
+        file_bytes = (await client.get(contents, follow_redirects=True)).content
     else:
         # try to read as a file (if it doesn't exist or has a path not cool w/
-        # the
+        # the filesystem then we fall back to contents)
         try:
             fs = filesystem(contents)
             if fs.exists(contents):
```
inspect_ai/_eval/task/task.py
CHANGED
```diff
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from logging import getLogger
 from typing import Any, Callable, Sequence, cast
@@ -6,6 +7,7 @@ from pydantic import BaseModel
 from typing_extensions import TypedDict, Unpack
 
 from inspect_ai._util.logger import warn_once
+from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
 from inspect_ai._util.registry import is_registry_object, registry_info
 from inspect_ai.approval._policy import ApprovalPolicy, approval_policies_from_config
 from inspect_ai.dataset import Dataset, MemoryDataset, Sample
@@ -115,35 +117,15 @@ class Task:
                 f"DEPRECATED: the '{arg}' parameter is deprecated (please use the '{newarg}' parameter instead)",
             )
 
-
-        if isinstance(epochs, int):
-            epochs = Epochs(epochs)
-        if epochs is not None and epochs.epochs < 1:
-            raise ValueError("epochs must be a positive integer.")
-
-        # resolve dataset (provide empty sample to bootstrap tasks w/o samples,
-        # which could occur for testing or for an interactive mode eval)
-        dataset = dataset or [Sample(input="prompt")]
-        self.dataset: Dataset = (
-            dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
-        )
+        self.dataset = resolve_dataset(dataset)
         self.setup = setup
-        self.solver = chain(solver) if isinstance(solver, list) else solver
-        self.scorer = (
-            scorer
-            if isinstance(scorer, list)
-            else [scorer]
-            if scorer is not None
-            else None
-        )
+        self.solver = resolve_solver(solver)
+        self.scorer = resolve_scorer(scorer)
         self.metrics = metrics
         self.config = config
         self.sandbox = resolve_sandbox_environment(sandbox)
-        self.approval = (
-            approval_policies_from_config(approval)
-            if isinstance(approval, str)
-            else approval
-        )
+        self.approval = resolve_approval(approval)
+        epochs = resolve_epochs(epochs)
         self.epochs = epochs.epochs if epochs else None
         self.epochs_reducer = epochs.reducer if epochs else None
         self.fail_on_error = fail_on_error
@@ -171,6 +153,106 @@ class Task:
         return dict()
 
 
+def task_with(
+    task: Task,
+    *,
+    dataset: Dataset | Sequence[Sample] | None | NotGiven = NOT_GIVEN,
+    setup: Solver | list[Solver] | None | NotGiven = NOT_GIVEN,
+    solver: Solver | list[Solver] | NotGiven = NOT_GIVEN,
+    scorer: Scorer | list[Scorer] | None | NotGiven = NOT_GIVEN,
+    metrics: list[Metric] | dict[str, list[Metric]] | None | NotGiven = NOT_GIVEN,
+    config: GenerateConfig | NotGiven = NOT_GIVEN,
+    sandbox: SandboxEnvironmentType | None | NotGiven = NOT_GIVEN,
+    approval: str | list[ApprovalPolicy] | None | NotGiven = NOT_GIVEN,
+    epochs: int | Epochs | None | NotGiven = NOT_GIVEN,
+    fail_on_error: bool | float | None | NotGiven = NOT_GIVEN,
+    message_limit: int | None | NotGiven = NOT_GIVEN,
+    token_limit: int | None | NotGiven = NOT_GIVEN,
+    time_limit: int | None | NotGiven = NOT_GIVEN,
+    name: str | None | NotGiven = NOT_GIVEN,
+    version: int | NotGiven = NOT_GIVEN,
+    metadata: dict[str, Any] | None | NotGiven = NOT_GIVEN,
+) -> Task:
+    """Task adapted with alternate values for one or more options.
+
+    Args:
+        task (Task): Task to adapt (it is deep copied prior to mutating options)
+        dataset (Dataset | Sequence[Sample]): Dataset to evaluate
+        setup: (Solver | list[Solver] | None): Setup step (always run
+            even when the main `solver` is replaced).
+        solver: (Solver | list[Solver]): Solver or list of solvers.
+            Defaults to generate(), a normal call to the model.
+        scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output.
+        metrics (list[Metric] | dict[str, list[Metric]] | None):
+            Alternative metrics (overrides the metrics provided by the specified scorer).
+        config (GenerateConfig): Model generation config.
+        sandbox (SandboxEnvironmentType | None): Sandbox environment type
+            (or optionally a str or tuple with a shorthand spec)
+        approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
+            Either a path to an approval policy config file or a list of approval policies.
+            Defaults to no approval policy.
+        epochs (int | Epochs | None): Epochs to repeat samples for and optional score
+            reducer function(s) used to combine sample scores (defaults to "mean")
+        fail_on_error (bool | float | None): `True` to fail on first sample error
+            (default); `False` to never fail on sample errors; Value between 0 and 1
+            to fail if a proportion of total samples fails. Value greater than 1 to fail
+            eval if a count of samples fails.
+        message_limit (int | None): Limit on total messages used for each sample.
+        token_limit (int | None): Limit on total tokens used for each sample.
+        time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+        name: (str | None): Task name. If not specified is automatically
+            determined based on the name of the task directory (or "task")
+            if its anonymous task (e.g. created in a notebook and passed to
+            eval() directly)
+        version: (int): Version of task (to distinguish evolutions
+            of the task spec or breaking changes to it)
+        metadata: (dict[str, Any] | None): Additional metadata to associate with the task.
+
+    Returns:
+        Task: Task adapted with alternate options.
+    """
+    # deep copy the task
+    task = deepcopy(task)
+
+    if not isinstance(dataset, NotGiven):
+        task.dataset = resolve_dataset(dataset)
+    if not isinstance(setup, NotGiven):
+        task.setup = setup
+    if not isinstance(solver, NotGiven):
+        task.solver = resolve_solver(solver)
+    if not isinstance(scorer, NotGiven):
+        task.scorer = resolve_scorer(scorer)
+    if not isinstance(metrics, NotGiven):
+        task.metrics = metrics
+    if not isinstance(config, NotGiven):
+        task.config = config
+    if not isinstance(sandbox, NotGiven):
+        task.sandbox = resolve_sandbox_environment(sandbox)
+    if not isinstance(approval, NotGiven):
+        task.approval = resolve_approval(approval)
+    if not isinstance(epochs, NotGiven):
+        epochs = resolve_epochs(epochs)
+        task.epochs = epochs.epochs if epochs else None
+        task.epochs_reducer = epochs.reducer if epochs else None
+    if not isinstance(fail_on_error, NotGiven):
+        task.fail_on_error = fail_on_error
+    if not isinstance(message_limit, NotGiven):
+        task.message_limit = message_limit
+    if not isinstance(token_limit, NotGiven):
+        task.token_limit = token_limit
+    if not isinstance(time_limit, NotGiven):
+        task.time_limit = time_limit
+    if not isinstance(version, NotGiven):
+        task.version = version
+    if not isinstance(name, NotGiven):
+        task._name = name
+    if not isinstance(metadata, NotGiven):
+        task.metadata = metadata
+
+    # return modified task
+    return task
+
+
 class TaskInfo(BaseModel):
     """Task information (file, name, and attributes)."""
 
@@ -225,3 +307,36 @@ classes, and task instances (a single task or list of tasks
 can be specified). None is a request to read a task out
 of the current working directory.
 """
+
+
+def resolve_approval(
+    approval: str | list[ApprovalPolicy] | None,
+) -> list[ApprovalPolicy] | None:
+    return (
+        approval_policies_from_config(approval)
+        if isinstance(approval, str)
+        else approval
+    )
+
+
+def resolve_epochs(epochs: int | Epochs | None) -> Epochs | None:
+    if isinstance(epochs, int):
+        epochs = Epochs(epochs)
+    if epochs is not None and epochs.epochs < 1:
+        raise ValueError("epochs must be a positive integer.")
+    return epochs
+
+
+def resolve_dataset(dataset: Dataset | Sequence[Sample] | None) -> Dataset:
+    dataset = dataset or [Sample(input="prompt")]
+    return dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
+
+
+def resolve_solver(solver: Solver | list[Solver]) -> Solver:
+    return chain(solver) if isinstance(solver, list) else solver
+
+
+def resolve_scorer(scorer: Scorer | list[Scorer] | None) -> list[Scorer] | None:
+    return (
+        scorer if isinstance(scorer, list) else [scorer] if scorer is not None else None
+    )
```