PyPI - inspect-ai - Versions diffs - 0.3.90__py3-none-any.whl → 0.3.92__py3-none-any.whl - Mend

inspect-ai 0.3.90__py3-none-any.whl → 0.3.92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (370) hide show

inspect_ai/_cli/common.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import functools
+import os
 from typing import Any, Callable, Literal, cast
 import click
@@ -21,6 +22,7 @@ class CommonOptions(TypedDict):
     log_dir: str
     display: Literal["full", "conversation", "rich", "plain", "none"]
     no_ansi: bool | None
+    traceback_locals: bool
     env: tuple[str] | None
     debug: bool
     debug_port: int
@@ -72,6 +74,13 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Do not print ANSI control characters.",
         envvar="INSPECT_NO_ANSI",
     )
+    @click.option(
+        "--traceback-locals",
+        type=bool,
+        is_flag=True,
+        envvar="INSPECT_TRACEBACK_LOCALS",
+        help="Include values of local variables in tracebacks (note that this can leak private data e.g. API keys so should typically only be enabled for targeted debugging).",
+    )
     @click.option(
         "--env",
         multiple=True,
@@ -107,6 +116,10 @@ def process_common_options(options: CommonOptions) -> None:
     env_args = parse_cli_args(options["env"])
     init_cli_env(env_args)
+    # set traceback locals env var
+    if options.get("traceback_locals", False):
+        os.environ["INSPECT_TRACEBACK_LOCALS"] = "1"
     # propagate display
     if options["no_ansi"]:
         display = "rich"

inspect_ai/_cli/eval.py CHANGED Viewed

@@ -12,6 +12,7 @@ from inspect_ai._util.constants import (
     DEFAULT_LOG_LEVEL_TRANSCRIPT,
     DEFAULT_LOG_SHARED,
     DEFAULT_MAX_CONNECTIONS,
+    DEFAULT_RETRY_ON_ERROR,
 )
 from inspect_ai._util.file import filesystem
 from inspect_ai._util.samples import parse_sample_id, parse_samples_limit
@@ -43,6 +44,7 @@ NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task comple
 FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
 NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
 NO_FAIL_ON_ERROR_HELP = "Do not fail the eval if errors occur within samples (instead, continue running other samples)"
+RETRY_ON_ERROR_HELP = "Retry samples if they encounter errors (by default, no retries occur). Specify --retry-on-error to retry a single time, or specify e.g. `--retry-on-error=3` to retry multiple times."
 LOG_IMAGES_HELP = (
     "Include base64 encoded versions of filename or URL based images in the log file."
 )
@@ -263,6 +265,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help=NO_FAIL_ON_ERROR_HELP,
         envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
     )
+    @click.option(
+        "--retry-on-error",
+        is_flag=False,
+        flag_value="true",
+        default=None,
+        callback=int_or_bool_flag_callback(DEFAULT_RETRY_ON_ERROR),
+        help=RETRY_ON_ERROR_HELP,
+        envvar="INSPECT_EVAL_RETRY_ON_ERROR",
+    )
     @click.option(
         "--no-log-samples",
         type=bool,
@@ -428,6 +439,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
         help="Maximum number of tokens to use for reasoning. Anthropic Claude models only.",
         envvar="INSPECT_EVAL_REASONING_TOKENS",
     )
+    @click.option(
+        "--reasoning-summary",
+        type=click.Choice(["concise", "detailed", "auto"]),
+        help="Provide summary of reasoning steps (defaults to no summary). Use 'auto' to access the most detailed summarizer available for the current model. OpenAI reasoning models only.",
+        envvar="INSPECT_EVAL_REASONING_SUMMARY",
+    )
     @click.option(
         "--reasoning-history",
         type=click.Choice(["none", "all", "last", "auto"]),
@@ -512,6 +529,7 @@ def eval_command(
     cache_prompt: str | None,
     reasoning_effort: str | None,
     reasoning_tokens: int | None,
+    reasoning_summary: Literal["concise", "detailed", "auto"] | None,
     reasoning_history: Literal["none", "all", "last", "auto"] | None,
     response_schema: ResponseSchema | None,
     message_limit: int | None,
@@ -524,6 +542,7 @@ def eval_command(
     max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
+    retry_on_error: int | None,
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
@@ -578,6 +597,7 @@ def eval_command(
         max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
+        retry_on_error=retry_on_error,
         debug_errors=common["debug_errors"],
         no_log_samples=no_log_samples,
         log_images=log_images,
@@ -683,6 +703,7 @@ def eval_set_command(
     cache_prompt: str | None,
     reasoning_effort: str | None,
     reasoning_tokens: int | None,
+    reasoning_summary: Literal["concise", "detailed", "auto"] | None,
     reasoning_history: Literal["none", "all", "last", "auto"] | None,
     response_schema: ResponseSchema | None,
     message_limit: int | None,
@@ -695,6 +716,7 @@ def eval_set_command(
     max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
+    retry_on_error: int | None,
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
@@ -754,6 +776,7 @@ def eval_set_command(
         max_sandboxes=max_sandboxes,
         fail_on_error=fail_on_error,
         no_fail_on_error=no_fail_on_error,
+        retry_on_error=retry_on_error,
         debug_errors=common["debug_errors"],
         no_log_samples=no_log_samples,
         log_images=log_images,
@@ -811,6 +834,7 @@ def eval_exec(
     max_sandboxes: int | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
+    retry_on_error: int | None,
     debug_errors: bool | None,
     no_log_samples: bool | None,
     log_images: bool | None,
@@ -858,6 +882,10 @@ def eval_exec(
     elif fail_on_error == 0.0:
         fail_on_error = True
+    # resolve retry_on_error
+    if retry_on_error == 0:
+        retry_on_error = None
     # resolve negating options
     sandbox_cleanup = False if no_sandbox_cleanup else None
     log_samples = False if no_log_samples else None
@@ -890,6 +918,7 @@ def eval_exec(
             sample_id=eval_sample_id,
             epochs=eval_epochs,
             fail_on_error=fail_on_error,
+            retry_on_error=retry_on_error,
             debug_errors=debug_errors,
             message_limit=message_limit,
             token_limit=token_limit,
@@ -1024,6 +1053,15 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
     help=NO_FAIL_ON_ERROR_HELP,
     envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
 )
+@click.option(
+    "--retry-on-error",
+    is_flag=False,
+    flag_value="true",
+    default=None,
+    callback=int_or_bool_flag_callback(DEFAULT_RETRY_ON_ERROR),
+    help=RETRY_ON_ERROR_HELP,
+    envvar="INSPECT_EVAL_RETRY_ON_ERROR",
+)
 @click.option(
     "--no-log-samples",
     type=bool,
@@ -1096,6 +1134,7 @@ def eval_retry_command(
     trace: bool | None,
     fail_on_error: bool | float | None,
     no_fail_on_error: bool | None,
+    retry_on_error: int | None,
     no_log_samples: bool | None,
     log_images: bool | None,
     log_buffer: int | None,
@@ -1125,6 +1164,10 @@ def eval_retry_command(
     elif fail_on_error == 0.0:
         fail_on_error = True
+    # resolve retry on error
+    if retry_on_error == 0:
+        retry_on_error = None
     # resolve log file
     retry_log_files = [
         log_file_info(filesystem(log_file).info(log_file)) for log_file in log_files
@@ -1143,6 +1186,7 @@ def eval_retry_command(
         sandbox_cleanup=sandbox_cleanup,
         trace=trace,
         fail_on_error=fail_on_error,
+        retry_on_error=retry_on_error,
         debug_errors=common["debug_errors"],
         log_samples=log_samples,
         log_images=log_images,

inspect_ai/_display/textual/widgets/samples.py CHANGED Viewed

@@ -1,11 +1,18 @@
 import time
 from typing import cast
+from urllib.parse import urlencode, urlparse, urlunparse
 from rich.console import RenderableType
 from rich.table import Table
 from rich.text import Text
 from textual.app import ComposeResult
-from textual.containers import Horizontal, HorizontalGroup, Vertical, VerticalGroup
+from textual.containers import (
+    Horizontal,
+    HorizontalGroup,
+    Right,
+    Vertical,
+    VerticalGroup,
+)
 from textual.css.query import NoMatches
 from textual.reactive import reactive
 from textual.widget import Widget
@@ -20,9 +27,12 @@ from textual.widgets import (
 from textual.widgets.option_list import Option, OptionDoesNotExist
 from inspect_ai._display.textual.widgets.port_mappings import get_url
+from inspect_ai._display.textual.widgets.vscode import conditional_vscode_link
+from inspect_ai._util.file import to_uri
 from inspect_ai._util.format import format_progress_time
 from inspect_ai._util.port_names import get_service_by_port
 from inspect_ai._util.registry import registry_unqualified_name
+from inspect_ai._util.vscode import EXTENSION_COMMAND_OPEN_SAMPLE, VSCodeCommand
 from inspect_ai.log._samples import ActiveSample
 from inspect_ai.log._transcript import ToolEvent
@@ -272,6 +282,16 @@ class SampleInfo(Vertical):
         background: $surface;
         color: $secondary;
     }
+    SampleInfo #sample-link {
+        height: auto;
+        width: 11;
+        margin-left: 1;
+        background: $background;
+    }
+    SampleInfo #sample-link Link {
+        color: $accent;
+        background: $background;
+    }
     """
     def __init__(self) -> None:
@@ -280,9 +300,12 @@ class SampleInfo(Vertical):
         self._sandbox_count: int | None = None
     def compose(self) -> ComposeResult:
-        with Collapsible(title=""):
-            yield SampleLimits()
-            yield SandboxesView()
+        with Horizontal():
+            with Collapsible(title=""):
+                yield SampleLimits()
+                yield SandboxesView()
+            yield Right(id="sample-link")
         yield SampleVNC()
     async def sync_sample(self, sample: ActiveSample | None) -> None:
@@ -311,6 +334,28 @@ class SampleInfo(Vertical):
             await sandboxes.sync_sample(sample)
             await self.query_one(SampleVNC).sync_sample(sample)
+            # View Log Link
+            base_uri = sample.log_location
+            query_params = {
+                "sample_id": sample.sample.id,
+                "epoch": sample.epoch,
+            }
+            parsed = urlparse(to_uri(base_uri))
+            view_link = urlunparse(parsed._replace(query=urlencode(query_params)))
+            link_container = self.query_one("#sample-link")
+            link_container.remove_children()
+            link = conditional_vscode_link(
+                "[View Log]",
+                VSCodeCommand(
+                    command="inspect.openLogViewer",
+                    args=[view_link] if sample.log_location else [],
+                ),
+                EXTENSION_COMMAND_OPEN_SAMPLE,
+            )
+            link_container.mount(link)
 class SampleLimits(Widget):
     DEFAULT_CSS = """

inspect_ai/_display/textual/widgets/vscode.py CHANGED Viewed

@@ -8,8 +8,10 @@ from inspect_ai._util.vscode import (
 )
-def conditional_vscode_link(text: str, command: VSCodeCommand) -> Widget:
-    if can_execute_vscode_command(command.command):
+def conditional_vscode_link(
+    text: str, command: VSCodeCommand, context: str | None = None
+) -> Widget:
+    if can_execute_vscode_command(command.command, context=context):
         vscode_link = VSCodeLink(text)
         vscode_link.commands = [command]
         return vscode_link

inspect_ai/_eval/eval.py CHANGED Viewed

@@ -90,6 +90,7 @@ def eval(
     sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
     epochs: int | Epochs | None = None,
     fail_on_error: bool | float | None = None,
+    retry_on_error: int | None = None,
     debug_errors: bool | None = None,
     message_limit: int | None = None,
     token_limit: int | None = None,
@@ -151,6 +152,8 @@ def eval(
             (default); `False` to never fail on sample errors; Value between 0 and 1
             to fail if a proportion of total samples fails. Value greater than 1 to fail
             eval if a count of samples fails.
+        retry_on_error: Number of times to retry samples if they encounter errors
+            (by default, no retries occur).
         debug_errors: Raise task errors (rather than logging them)
             so they can be debugged (defaults to False).
         message_limit: Limit on total messages used for each sample.
@@ -214,6 +217,7 @@ def eval(
                 sample_id=sample_id,
                 epochs=epochs,
                 fail_on_error=fail_on_error,
+                retry_on_error=retry_on_error,
                 debug_errors=debug_errors,
                 message_limit=message_limit,
                 token_limit=token_limit,
@@ -266,6 +270,7 @@ async def eval_async(
     sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
     epochs: int | Epochs | None = None,
     fail_on_error: bool | float | None = None,
+    retry_on_error: int | None = None,
     debug_errors: bool | None = None,
     message_limit: int | None = None,
     token_limit: int | None = None,
@@ -315,6 +320,8 @@ async def eval_async(
         fail_on_error: `True` to fail on first sample error
             (default); `False` to never fail on sample errors; Value between 0 and 1
             to fail if a proportion of total samples fails. Value greater than 1 to fail eval if a count of samples fails.
+        retry_on_error: Number of times to retry samples if they encounter errors
+            (by default, no retries occur).
         debug_errors: Raise task errors (rather than logging them) so they can be debugged (defaults to False).
         message_limit: Limit on total messages used for each sample.
         token_limit: Limit on total tokens used for each sample.
@@ -455,6 +462,7 @@ async def eval_async(
             else None,
             approval=config_from_approval_policies(approval) if approval else None,
             fail_on_error=fail_on_error,
+            retry_on_error=retry_on_error,
             message_limit=message_limit,
             token_limit=token_limit,
             time_limit=time_limit,
@@ -551,6 +559,7 @@ def eval_retry(
     trace: bool | None = None,
     display: DisplayType | None = None,
     fail_on_error: bool | float | None = None,
+    retry_on_error: int | None = None,
     debug_errors: bool | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
@@ -589,6 +598,8 @@ def eval_retry(
             (default); `False` to never fail on sample errors; Value between 0 and 1
             to fail if a proportion of total samples fails. Value greater than 1 to fail
             eval if a count of samples fails.
+        retry_on_error: Number of times to retry samples if they encounter errors
+            (by default, no retries occur).
         debug_errors: Raise task errors (rather than logging them)
             so they can be debugged (defaults to False).
         log_samples: Log detailed samples and scores (defaults to True)
@@ -631,6 +642,7 @@ def eval_retry(
             max_sandboxes=max_sandboxes,
             sandbox_cleanup=sandbox_cleanup,
             fail_on_error=fail_on_error,
+            retry_on_error=retry_on_error,
             debug_errors=debug_errors,
             log_samples=log_samples,
             log_images=log_images,
@@ -658,6 +670,7 @@ async def eval_retry_async(
     max_sandboxes: int | None = None,
     sandbox_cleanup: bool | None = None,
     fail_on_error: bool | float | None = None,
+    retry_on_error: int | None = None,
     debug_errors: bool | None = None,
     log_samples: bool | None = None,
     log_images: bool | None = None,
@@ -672,46 +685,40 @@ async def eval_retry_async(
     """Retry a previously failed evaluation task.
     Args:
-        tasks: (str | EvalLogInfo | EvalLog | list[str] | list[EvalLogInfo] | list[EvalLog]):
-            Log files for task(s) to retry.
-        log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
+        tasks: Log files for task(s) to retry.
+        log_level: Level for logging to the console: "debug", "http", "sandbox",
           "info", "warning", "error", or "critical" (defaults to "warning")
-        log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
-        log_dir (str | None): Output path for logging results
-           (defaults to file log in ./logs directory).
-        log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
-           to "eval", the native high-performance format).
-        max_samples (int | None): Maximum number of samples to run in parallel
+        log_level_transcript: Level for logging to the log file (defaults to "info")
+        log_dir: Output path for logging results (defaults to file log in ./logs directory).
+        log_format: Format for writing log files (defaults to "eval", the native high-performance format).
+        max_samples: Maximum number of samples to run in parallel
            (default is max_connections)
-        max_tasks (int | None): Maximum number of tasks to run in parallel
-           (default is 1)
-        max_subprocesses (int): Maximum number of subprocesses to
-           run in parallel (default is os.cpu_count())
-        max_sandboxes (int): Maximum number of sandboxes (per-provider) to run in parallel.
-        sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
+        max_tasks: Maximum number of tasks to run in parallel (default is 1)
+        max_subprocesses: Maximum number of subprocesses to run in parallel (default is os.cpu_count())
+        max_sandboxes: Maximum number of sandboxes (per-provider) to run in parallel.
+        sandbox_cleanup: Cleanup sandbox environments after task completes
            (defaults to True)
-        fail_on_error (bool | float | None): `True` to fail on first sample error
+        fail_on_error: `True` to fail on first sample error
            (default); `False` to never fail on sample errors; Value between 0 and 1
            to fail if a proportion of total samples fails. Value greater than 1 to fail
            eval if a count of samples fails.
-        debug_errors (bool | None): Raise task errors (rather than logging them)
+        retry_on_error: Number of times to retry samples if they encounter errors
+           (by default, no retries occur).
+        debug_errors: Raise task errors (rather than logging them)
            so they can be debugged (defaults to False).
-        log_samples: (bool | None): Log detailed samples and scores (defaults to True)
-        log_images: (bool | None): Log base64 encoded version of images,
+        log_samples: Log detailed samples and scores (defaults to True)
+        log_images: Log base64 encoded version of images,
            even if specified as a filename or URL (defaults to False)
-        log_buffer: (int | None): Number of samples to buffer before writing log file.
+        log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
         log_shared: Indicate that the log directory is shared, which results in
             additional syncing of realtime log data for Inspect View.
-        score (bool): Score output (defaults to True)
-        score_display (bool | None): Show scoring metrics in realtime (defaults to True)
-        max_retries (int | None):
-           Maximum number of times to retry request.
-        timeout: (int | None):
-           Request timeout (in seconds)
-        max_connections (int | None):
-           Maximum number of concurrent connections to Model API (default is per Model API)
+        score: Score output (defaults to True)
+        score_display: Show scoring metrics in realtime (defaults to True)
+        max_retries: Maximum number of times to retry request.
+        timeout: Request timeout (in seconds)
+        max_connections: Maximum number of concurrent connections to Model API (default is per Model API)
     Returns:
         List of EvalLog (one for each task)
@@ -802,6 +809,11 @@ async def eval_retry_async(
             if fail_on_error is not None
             else eval_log.eval.config.fail_on_error
         )
+        retry_on_error = (
+            retry_on_error
+            if retry_on_error is not None
+            else eval_log.eval.config.retry_on_error
+        )
         log_samples = (
             log_samples if log_samples is not None else eval_log.eval.config.log_samples
         )
@@ -852,6 +864,7 @@ async def eval_retry_async(
                 sample_id=sample_id,
                 epochs=epochs,
                 fail_on_error=fail_on_error,
+                retry_on_error=retry_on_error,
                 debug_errors=debug_errors,
                 message_limit=message_limit,
                 token_limit=token_limit,

inspect_ai/_eval/evalset.py CHANGED Viewed

@@ -82,6 +82,7 @@ def eval_set(
     sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
     epochs: int | Epochs | None = None,
     fail_on_error: bool | float | None = None,
+    retry_on_error: int | None = None,
     debug_errors: bool | None = None,
     message_limit: int | None = None,
     token_limit: int | None = None,
@@ -153,6 +154,8 @@ def eval_set(
             (default); `False` to never fail on sample errors; Value between 0 and 1
             to fail if a proportion of total samples fails. Value greater than 1 to fail
             eval if a count of samples fails.
+        retry_on_error: Number of times to retry samples if they encounter errors
+            (by default, no retries occur).
         debug_errors: Raise task errors (rather than logging them)
             so they can be debugged (defaults to False).
         message_limit: Limit on total messages used for each sample.
@@ -215,6 +218,7 @@ def eval_set(
             sample_id=sample_id,
             epochs=epochs,
             fail_on_error=fail_on_error,
+            retry_on_error=retry_on_error,
             debug_errors=debug_errors,
             message_limit=message_limit,
             token_limit=token_limit,

inspect_ai/_eval/loader.py CHANGED Viewed

@@ -25,7 +25,6 @@ from inspect_ai._util.registry import (
     registry_lookup,
     registry_params,
 )
-from inspect_ai.agent._agent import Agent
 from inspect_ai.agent._as_solver import as_solver
 from inspect_ai.model import Model
 from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
@@ -423,9 +422,9 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
             if solver_name is None:
                 raise ValueError(f"Unable to resolve solver name from {spec.solver}")
             elif registry_lookup("solver", solver_name) is not None:
-                return cast(Solver, registry_create("solver", solver_name, **spec.args))
+                return registry_create("solver", solver_name, **spec.args)
             elif registry_lookup("agent", solver_name) is not None:
-                agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+                agent = registry_create("agent", solver_name, **spec.args)
                 return as_solver(agent)
             else:
                 raise ValueError(
@@ -484,11 +483,11 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
             # create decorator based solvers using the registry
             if any(solver[0] == solver_name for solver in solver_decorators):
-                return cast(Solver, registry_create("solver", solver_name, **spec.args))
+                return registry_create("solver", solver_name, **spec.args)
             # create decorator based agents using the registry
             elif any(agent[0] == solver_name for agent in agent_decorators):
-                agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+                agent = registry_create("agent", solver_name, **spec.args)
                 return as_solver(agent)
             # create bridge based solvers by calling the function and wrapping it in bridge()

inspect_ai/_eval/registry.py CHANGED Viewed

@@ -80,7 +80,7 @@ def task_create(name: str, **kwargs: Any) -> Task:
         else:
             logger.warning(f"param '{param}' not used by task '{name}'")
-    return cast(Task, registry_create("task", name, **task_args))
+    return registry_create("task", name, **task_args)
 @overload

inspect_ai/_eval/run.py CHANGED Viewed

@@ -4,6 +4,7 @@ import sys
 from typing import Any, Awaitable, Callable, Set, cast
 from inspect_ai._eval.task.task import Task
+from inspect_ai._util.environ import environ_vars
 from inspect_ai._util.trace import trace_action
 if sys.version_info < (3, 11):
@@ -49,7 +50,7 @@ from .loader import (
 from .task.log import TaskLogger
 from .task.resolved import ResolvedTask
 from .task.run import TaskRunOptions, task_run
-from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
+from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task_and_sample
 from .task.util import slice_dataset, task_run_dir
 log = logging.getLogger(__name__)
@@ -435,7 +436,9 @@ async def startup_sandbox_environments(
         # resolve each sample and add to sandboxenvs
         dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
         for sample in dataset:
-            sandbox = resolve_sandbox_for_task(eval_sandbox, task.task, sample)
+            sandbox = await resolve_sandbox_for_task_and_sample(
+                eval_sandbox, task.task, sample
+            )
             if sandbox is not None and sandbox not in sandboxenvs:
                 sandboxenvs.add(sandbox)
@@ -448,7 +451,7 @@ async def startup_sandbox_environments(
             # run startup
             task_init = cast(TaskInit, getattr(sandboxenv_type, "task_init"))
-            with chdir(sandboxenv.run_dir):
+            with chdir(sandboxenv.run_dir), environ_vars(dict(sandboxenv.env)):
                 await task_init("startup", sandboxenv.sandbox.config)
             # append cleanup method

inspect_ai/_eval/task/log.py CHANGED Viewed

@@ -187,6 +187,9 @@ class TaskLogger:
         # log the sample event
         self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
+    def remove_sample(self, id: str | int, epoch: int) -> None:
+        self._buffer_db.remove_samples([(id, epoch)])
     async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
         # log the sample
         await self.recorder.log_sample(self.eval, sample)
@@ -202,6 +205,9 @@ class TaskLogger:
                 scores=sample.scores,
                 error=sample.error.message if sample.error is not None else None,
                 limit=f"{sample.limit.type}" if sample.limit is not None else None,
+                retries=len(sample.error_retries)
+                if sample.error_retries is not None
+                else None,
             )
         )

inspect-ai 0.3.90__py3-none-any.whl → 0.3.92__py3-none-any.whl

inspect-ai 0.3.90__py3-none-any.whl → 0.3.92__py3-none-any.whl