inspect-ai 0.3.53__py3-none-any.whl → 0.3.55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. inspect_ai/_cli/eval.py +26 -1
  2. inspect_ai/_cli/main.py +2 -0
  3. inspect_ai/_cli/trace.py +244 -0
  4. inspect_ai/_display/textual/app.py +5 -1
  5. inspect_ai/_display/textual/widgets/tasks.py +13 -3
  6. inspect_ai/_eval/eval.py +17 -0
  7. inspect_ai/_eval/task/images.py +4 -14
  8. inspect_ai/_eval/task/log.py +2 -1
  9. inspect_ai/_eval/task/run.py +26 -10
  10. inspect_ai/_util/constants.py +3 -3
  11. inspect_ai/_util/display.py +1 -0
  12. inspect_ai/_util/logger.py +34 -8
  13. inspect_ai/_util/trace.py +275 -0
  14. inspect_ai/log/_log.py +3 -0
  15. inspect_ai/log/_message.py +2 -2
  16. inspect_ai/log/_recorders/eval.py +6 -17
  17. inspect_ai/log/_recorders/json.py +19 -17
  18. inspect_ai/model/_cache.py +22 -16
  19. inspect_ai/model/_call_tools.py +9 -1
  20. inspect_ai/model/_generate_config.py +2 -2
  21. inspect_ai/model/_model.py +11 -12
  22. inspect_ai/model/_providers/bedrock.py +1 -1
  23. inspect_ai/model/_providers/openai.py +11 -1
  24. inspect_ai/tool/_tools/_web_browser/_web_browser.py +1 -1
  25. inspect_ai/util/_sandbox/context.py +6 -1
  26. inspect_ai/util/_sandbox/docker/compose.py +58 -19
  27. inspect_ai/util/_sandbox/docker/docker.py +11 -11
  28. inspect_ai/util/_sandbox/docker/util.py +0 -6
  29. inspect_ai/util/_sandbox/service.py +17 -7
  30. inspect_ai/util/_subprocess.py +6 -1
  31. inspect_ai/util/_subtask.py +8 -2
  32. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/METADATA +7 -7
  33. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/RECORD +37 -35
  34. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/LICENSE +0 -0
  35. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/WHEEL +0 -0
  36. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/entry_points.txt +0 -0
  37. {inspect_ai-0.3.53.dist-info → inspect_ai-0.3.55.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -42,6 +42,7 @@ LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not s
  NO_SCORE_HELP = (
  "Do not score model output (use the inspect score command to score output later)"
  )
+ NO_SCORE_DISPLAY = "Do not display scoring metrics in realtime."
  MAX_CONNECTIONS_HELP = f"Maximum number of concurrent connections to Model API (defaults to {DEFAULT_MAX_CONNECTIONS})"
  MAX_RETRIES_HELP = (
  f"Maximum number of times to retry request (defaults to {DEFAULT_MAX_RETRIES})"
@@ -257,6 +258,13 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
  help=NO_SCORE_HELP,
  envvar="INSPECT_EVAL_NO_SCORE",
  )
+ @click.option(
+ "--no-score-display",
+ type=bool,
+ is_flag=True,
+ help=NO_SCORE_HELP,
+ envvar="INSPECT_EVAL_SCORE_DISPLAY",
+ )
  @click.option(
  "--max-tokens",
  type=int,
@@ -339,7 +347,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
  "--logprobs",
  type=bool,
  is_flag=True,
- help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, vLLM only.",
+ help="Return log probabilities of the output tokens. OpenAI, Google, Grok, TogetherAI, Huggingface, llama-cpp-python, and vLLM only.",
  envvar="INSPECT_EVAL_LOGPROBS",
  )
  @click.option(
@@ -446,6 +454,7 @@ def eval_command(
  log_images: bool | None,
  log_buffer: int | None,
  no_score: bool | None,
+ no_score_display: bool | None,
  log_format: Literal["eval", "json"] | None,
  **common: Unpack[CommonOptions],
  ) -> None:
@@ -495,6 +504,7 @@ def eval_command(
  log_images=log_images,
  log_buffer=log_buffer,
  no_score=no_score,
+ no_score_display=no_score_display,
  is_eval_set=False,
  **config,
  )
@@ -603,6 +613,7 @@ def eval_set_command(
  log_images: bool | None,
  log_buffer: int | None,
  no_score: bool | None,
+ no_score_display: bool | None,
  bundle_dir: str | None,
  bundle_overwrite: bool | None,
  log_format: Literal["eval", "json"] | None,
@@ -654,6 +665,7 @@ def eval_set_command(
  log_images=log_images,
  log_buffer=log_buffer,
  no_score=no_score,
+ no_score_display=no_score_display,
  is_eval_set=True,
  retry_attempts=retry_attempts,
  retry_wait=retry_wait,
@@ -706,6 +718,7 @@ def eval_exec(
  log_images: bool | None,
  log_buffer: int | None,
  no_score: bool | None,
+ no_score_display: bool | None,
  is_eval_set: bool = False,
  retry_attempts: int | None = None,
  retry_wait: int | None = None,
@@ -746,6 +759,7 @@ def eval_exec(
  log_images = False if log_images is False else None
  trace = True if trace else None
  score = False if no_score else True
+ score_display = False if no_score_display else None

  # build params
  params: dict[str, Any] = (
@@ -781,6 +795,7 @@ def eval_exec(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  )
  | kwargs
  )
@@ -915,6 +930,13 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
  help=NO_SCORE_HELP,
  envvar="INSPECT_EVAL_SCORE",
  )
+ @click.option(
+ "--no-score-display",
+ type=bool,
+ is_flag=True,
+ help=NO_SCORE_HELP,
+ envvar="INSPECT_EVAL_SCORE_DISPLAY",
+ )
  @click.option(
  "--max-connections",
  type=int,
@@ -940,6 +962,7 @@ def eval_retry_command(
  log_images: bool | None,
  log_buffer: int | None,
  no_score: bool | None,
+ no_score_display: bool | None,
  max_connections: int | None,
  max_retries: int | None,
  timeout: int | None,
@@ -954,6 +977,7 @@ def eval_retry_command(
  log_samples = False if no_log_samples else None
  log_images = False if log_images is False else None
  score = False if no_score else True
+ score_display = False if no_score_display else None

  # resolve fail_on_error
  if no_fail_on_error is True:
@@ -984,6 +1008,7 @@ def eval_retry_command(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  max_retries=max_retries,
  timeout=timeout,
  max_connections=max_connections,
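
Throughout this file the new flag follows the same tri-state convention as the other logging options: leaving it off passes None (use the library default of True), while passing --no-score-display resolves to False. A minimal sketch of that mapping against the Python API added later in this diff (the wrapper function and task path are hypothetical, not part of inspect_ai):

from inspect_ai import eval

def run_from_cli(task_file: str, no_score_display: bool) -> None:
    # None means "not specified, fall back to the default"; False disables the
    # realtime metrics display (mirrors `score_display = False if no_score_display else None` above)
    score_display = False if no_score_display else None
    eval(task_file, score_display=score_display)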
inspect_ai/_cli/main.py CHANGED
@@ -11,6 +11,7 @@ from .list import list_command
  from .log import log_command
  from .sandbox import sandbox_command
  from .score import score_command
+ from .trace import trace_command
  from .view import view_command


@@ -46,6 +47,7 @@ inspect.add_command(log_command)
  inspect.add_command(score_command)
  inspect.add_command(view_command)
  inspect.add_command(sandbox_command)
+ inspect.add_command(trace_command)


  def main() -> None:
inspect_ai/_cli/trace.py ADDED
@@ -0,0 +1,244 @@
+ import os
+ import shlex
+ import time
+ from datetime import datetime
+ from json import dumps
+ from pathlib import Path
+ from typing import Callable, cast
+
+ import click
+ from pydantic_core import to_json
+ from rich import print as r_print
+ from rich.console import Console, RenderableType
+ from rich.table import Column, Table
+
+ from inspect_ai._util.error import PrerequisiteError
+ from inspect_ai._util.logger import TRACE_FILE_NAME
+ from inspect_ai._util.trace import ActionTraceRecord, inspect_trace_dir, read_trace_file
+
+
+ @click.group("trace")
+ def trace_command() -> None:
+ """List and read execution traces.
+
+ Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces.
+ """
+ return None
+
+
+ @trace_command.command("list")
+ @click.option(
+ "--json",
+ type=bool,
+ is_flag=True,
+ default=False,
+ help="Output listing as JSON",
+ )
+ def list_command(json: bool) -> None:
+ """List all trace files."""
+ trace_dir = inspect_trace_dir()
+ trace_files: list[dict[str, float | str]] = [
+ {"mtime": f.lstat().st_mtime, "file": f.absolute().as_posix()}
+ for f in trace_dir.iterdir()
+ if f.is_file()
+ ]
+ trace_files.sort(key=lambda f: cast(float, f["mtime"]), reverse=True)
+ if json:
+ print(dumps(trace_files, indent=2))
+ else:
+ table = Table(box=None, show_header=True, pad_edge=False)
+ table.add_column("Time")
+ table.add_column("Trace File")
+ for file in trace_files:
+ mtime = datetime.fromtimestamp(cast(float, file["mtime"])).astimezone()
+ table.add_row(
+ mtime.strftime("%d-%b %H:%M:%S %Z"), shlex.quote(str(file["file"]))
+ )
+ r_print(table)
+
+
+ @trace_command.command("dump")
+ @click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+ def read_command(trace_file: str) -> None:
+ """Dump a trace file to stdout (as a JSON array of log records)."""
+ trace_file_path = resolve_trace_file_path(trace_file)
+
+ traces = read_trace_file(trace_file_path)
+ print(
+ to_json(traces, indent=2, exclude_none=True, fallback=lambda _: None).decode()
+ )
+
+
+ @trace_command.command("anomalies")
+ @click.argument("trace-file", type=str, required=False, default=TRACE_FILE_NAME)
+ @click.option(
+ "--all",
+ is_flag=True,
+ default=False,
+ help="Show all anomolies including errors and timeouts (by default only still running and cancelled actions are shown).",
+ )
+ def anomolies_command(trace_file: str, all: bool) -> None:
+ """Look for anomalies in a trace file (never completed or cancelled actions)."""
+ trace_file_path = resolve_trace_file_path(trace_file)
+ traces = read_trace_file(trace_file_path)
+
+ # Track started actions
+ running_actions: dict[str, ActionTraceRecord] = {}
+ canceled_actions: dict[str, ActionTraceRecord] = {}
+ error_actions: dict[str, ActionTraceRecord] = {}
+ timeout_actions: dict[str, ActionTraceRecord] = {}
+
+ def action_started(trace: ActionTraceRecord) -> None:
+ running_actions[trace.trace_id] = trace
+
+ def action_completed(trace: ActionTraceRecord) -> ActionTraceRecord:
+ start_trace = running_actions.get(trace.trace_id)
+ if start_trace:
+ del running_actions[trace.trace_id]
+ return start_trace
+ else:
+ raise RuntimeError(f"Expected {trace.trace_id} in action dictionary.")
+
+ def action_failed(trace: ActionTraceRecord) -> None:
+ if all:
+ error_actions[start_trace.trace_id] = trace
+
+ def action_canceled(trace: ActionTraceRecord) -> None:
+ canceled_actions[start_trace.trace_id] = trace
+
+ def action_timeout(trace: ActionTraceRecord) -> None:
+ if all:
+ timeout_actions[start_trace.trace_id] = trace
+
+ for trace in traces:
+ if isinstance(trace, ActionTraceRecord):
+ match trace.event:
+ case "enter":
+ action_started(trace)
+ case "exit":
+ action_completed(trace)
+ case "cancel":
+ start_trace = action_completed(trace)
+ trace.start_time = start_trace.start_time
+ action_canceled(trace)
+ case "error":
+ start_trace = action_completed(trace)
+ trace.start_time = start_trace.start_time
+ action_failed(trace)
+ case "timeout":
+ start_trace = action_completed(trace)
+ trace.start_time = start_trace.start_time
+ action_timeout(trace)
+ case _:
+ print(f"Unknown event type: {trace.event}")
+
+ # do we have any traces?
+ if (
+ len(running_actions)
+ + len(canceled_actions)
+ + len(error_actions)
+ + len(timeout_actions)
+ == 0
+ ):
+ print(f"TRACE: {shlex.quote(trace_file_path.as_posix())}\n")
+ if all:
+ print("No anomalies found in trace log.")
+ else:
+ print(
+ "No running or cancelled actions found in trace log (pass --all to see errors and timeouts)."
+ )
+ return
+
+ with open(os.devnull, "w") as f:
+ # generate output
+ console = Console(record=True, file=f)
+
+ def print_fn(o: RenderableType) -> None:
+ console.print(o, highlight=False)
+
+ print_fn(f"[bold]TRACE: {shlex.quote(trace_file_path.as_posix())}[bold]")
+
+ _print_bucket(print_fn, "Running Actions", running_actions)
+ _print_bucket(print_fn, "Cancelled Actions", canceled_actions)
+ _print_bucket(print_fn, "Error Actions", error_actions)
+ _print_bucket(print_fn, "Timeout Actions", timeout_actions)
+
+ # print
+ print(console.export_text(styles=True).strip())
+
+
+ def _print_bucket(
+ print_fn: Callable[[RenderableType], None],
+ label: str,
+ bucket: dict[str, ActionTraceRecord],
+ ) -> None:
+ if len(bucket) > 0:
+ # Sort the items in chronological order of when
+ # they finished so the first finished item is at the top
+ sorted_actions = sorted(
+ bucket.values(),
+ key=lambda record: (record.start_time or 0) + (record.duration or 0),
+ reverse=True,
+ )
+
+ # create table
+ table = Table(
+ Column(""),
+ Column("", justify="right"),
+ Column(""),
+ Column("", width=22),
+ box=None,
+ title=label,
+ title_justify="left",
+ title_style="bold",
+ pad_edge=False,
+ padding=(0, 1),
+ )
+
+ for action in sorted_actions:
+ # Compute duration (use the event duration or time since started)
+ duration = (
+ action.duration
+ if action.duration is not None
+ else time.time() - action.start_time
+ if action.start_time is not None
+ else 0.0
+ )
+
+ # The event start time
+ start_time = formatTime(action.start_time) if action.start_time else "None"
+
+ # Event detail
+ detail = (
+ f"{action.detail or action.message} {action.error}"
+ if action.event == "error"
+ else (action.detail or action.message)
+ )
+
+ table.add_row(
+ action.action,
+ f"{round(duration, 2):.2f}s".rjust(8),
+ f" {detail}",
+ start_time,
+ )
+
+ print_fn("")
+ print_fn(table)
+
+
+ def resolve_trace_file_path(trace_file: str) -> Path:
+ trace_file_path = Path(trace_file)
+ if not trace_file_path.is_absolute():
+ trace_file_path = inspect_trace_dir() / trace_file_path
+
+ if not trace_file_path.exists():
+ raise PrerequisiteError(
+ f"The specified trace file '{trace_file_path}' does not exist."
+ )
+
+ return trace_file_path
+
+
+ def formatTime(timestamp: float) -> str:
+ dt = datetime.fromtimestamp(timestamp).astimezone()
+ return dt.strftime("%H:%M:%S %Z")
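
The trace subcommands above are thin wrappers over helpers in inspect_ai._util.trace (also new in this release; see inspect_ai/_util/trace.py in the file list). A rough programmatic equivalent of `inspect trace dump`, using only the imports and calls that appear in this module (these helpers are internal, not a documented public API):

from pydantic_core import to_json

from inspect_ai._util.logger import TRACE_FILE_NAME
from inspect_ai._util.trace import inspect_trace_dir, read_trace_file

# read the current trace file from the trace directory and dump it as JSON,
# just as the `dump` command does above
trace_path = inspect_trace_dir() / TRACE_FILE_NAME
records = read_trace_file(trace_path)
print(to_json(records, indent=2, exclude_none=True, fallback=lambda _: None).decode())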
inspect_ai/_display/textual/app.py CHANGED
@@ -197,7 +197,11 @@ class TaskScreenApp(App[TR]):

  # add task
  try:
- yield self.query_one(TasksView).add_task(task)
+ task_view = self.query_one(TasksView)
+ task_view.set_display_metrics(
+ profile.eval_config.score_display is not False
+ )
+ yield task_view.add_task(task)
  finally:
  pass

inspect_ai/_display/textual/widgets/tasks.py CHANGED
@@ -72,6 +72,7 @@ class TasksView(Container):
  self.description_width = MAX_DESCRIPTION_WIDTH
  self.model_name_width = MAX_MODEL_NAME_WIDTH
  self.sample_count_width = 0
+ self.display_metrics = True

  def init_tasks(self, tasks: list[TaskSpec]) -> None:
  # clear existing tasks
@@ -89,7 +90,11 @@ class TasksView(Container):
  def add_task(self, task: TaskWithResult) -> TaskDisplay:
  self.update_count_width(task.profile.samples)
  task_display = TaskProgressView(
- task, self.description_width, self.model_name_width, self.sample_count_width
+ task,
+ self.description_width,
+ self.model_name_width,
+ self.sample_count_width,
+ self.display_metrics,
  )
  self.tasks.mount(task_display)
  self.tasks.scroll_to_widget(task_display)
@@ -97,6 +102,9 @@ class TasksView(Container):

  return task_display

+ def set_display_metrics(self, display_metrics: bool) -> None:
+ self.display_metrics = display_metrics
+
  def update_count_width(self, samples: int) -> None:
  sample_count_str = progress_count(samples, samples, self.sample_count_width)
  self.sample_count_width = min(
@@ -174,6 +182,7 @@ class TaskProgressView(Widget):
  description_width: int,
  model_name_width: int,
  sample_count_width: int,
+ display_metrics: bool,
  ) -> None:
  super().__init__()
  self.t = task
@@ -190,6 +199,7 @@ class TaskProgressView(Widget):
  self.task_detail = TaskDetail(id="task-detail", classes="hidden")

  self.sample_count_width: int = sample_count_width
+ self.display_metrics = display_metrics

  metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
  metrics_width: reactive[int | None] = reactive(None)
@@ -198,7 +208,7 @@ class TaskProgressView(Widget):
  samples_total: reactive[int] = reactive(0)

  def compose(self) -> ComposeResult:
- yield self.toggle
+ yield (self.toggle if self.display_metrics else Static())
  yield TaskStatusIcon()
  yield Static(
  progress_description(self.t.profile, self.description_width, pad=True)
@@ -274,7 +284,7 @@ class TaskProgressView(Widget):

  def update_metrics_label(self) -> None:
  # compute the label (with a min size)
- if self.metrics is not None:
+ if self.metrics is not None and self.metrics_display is not None:
  metric_label = task_metric(self.metrics, self.metrics_width)
  self.metrics_width = len(metric_label)
  self.metrics_display.update(metric_label)
inspect_ai/_eval/eval.py CHANGED
@@ -76,6 +76,7 @@ def eval(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  score: bool = True,
+ score_display: bool | None = None,
  **kwargs: Unpack[GenerateConfigArgs],
  ) -> list[EvalLog]:
  r"""Evaluate tasks using a Model.
@@ -139,6 +140,7 @@ def eval(
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
  score (bool): Score output (defaults to True)
+ score_display (bool | None): Show scoring metrics in realtime (defaults to True)
  **kwargs (GenerateConfigArgs): Model generation options.

  Returns:
@@ -183,6 +185,7 @@ def eval(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  **kwargs,
  )
  )
@@ -220,6 +223,7 @@ async def eval_async(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  score: bool = True,
+ score_display: bool | None = None,
  **kwargs: Unpack[GenerateConfigArgs],
  ) -> list[EvalLog]:
  r"""Evaluate tasks using a Model (async).
@@ -282,6 +286,7 @@ async def eval_async(
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
  score (bool): Score output (defaults to True)
+ score_display (bool | None): Show scoring metrics in realtime (defaults to True)
  **kwargs (GenerateConfigArgs): Model generation options.

  Returns:
@@ -380,6 +385,7 @@ async def eval_async(
  log_samples=log_samples,
  log_images=log_images,
  log_buffer=log_buffer,
+ score_display=score_display,
  )

  # run tasks - 2 codepaths, one for the traditional task at a time
@@ -467,6 +473,7 @@ def eval_retry(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  score: bool = True,
+ score_display: bool | None = None,
  max_retries: int | None = None,
  timeout: int | None = None,
  max_connections: int | None = None,
@@ -507,6 +514,7 @@ def eval_retry(
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
  score (bool): Score output (defaults to True)
+ score_display (bool | None): Show scoring metrics in realtime (defaults to True)
  max_retries (int | None):
  Maximum number of times to retry request.
  timeout: (int | None):
@@ -541,6 +549,7 @@ def eval_retry(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  max_retries=max_retries,
  timeout=timeout,
  max_connections=max_connections,
@@ -565,6 +574,7 @@ async def eval_retry_async(
  log_images: bool | None = None,
  log_buffer: int | None = None,
  score: bool = True,
+ score_display: bool | None = None,
  max_retries: int | None = None,
  timeout: int | None = None,
  max_connections: int | None = None,
@@ -603,6 +613,7 @@ async def eval_retry_async(
  If not specified, an appropriate default for the format and filesystem is
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
  score (bool): Score output (defaults to True)
+ score_display (bool | None): Show scoring metrics in realtime (defaults to True)
  max_retries (int | None):
  Maximum number of times to retry request.
  timeout: (int | None):
@@ -699,6 +710,11 @@ async def eval_retry_async(
  log_buffer = (
  log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
  )
+ score_display = (
+ score_display
+ if score_display is not None
+ else eval_log.eval.config.score_display
+ )

  config = eval_log.plan.config
  config.max_retries = max_retries or config.max_retries
@@ -740,6 +756,7 @@ async def eval_retry_async(
  log_images=log_images,
  log_buffer=log_buffer,
  score=score,
+ score_display=score_display,
  **dict(config),
  )
  )[0]
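
With this change the realtime metrics display can be controlled from the Python API as well as the CLI. A short usage sketch (the task path is hypothetical; scoring itself remains enabled because score defaults to True):

from inspect_ai import eval

# run an eval without the realtime scoring metrics display;
# passing None (the default) leaves the display enabled
logs = eval("my_task.py", score_display=False)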
inspect_ai/_eval/task/images.py CHANGED
@@ -30,13 +30,8 @@ async def samples_with_base64_images(samples: list[Sample]) -> list[Sample]:

  async def sample_with_base64_images(sample: Sample) -> Sample:
  if isinstance(sample.input, list):
- return Sample(
- input=await messages_with_base64_images(sample.input),
- target=sample.target,
- id=sample.id,
- metadata=sample.metadata,
- files=sample.files,
- choices=sample.choices,
+ return sample.model_copy(
+ update={"input": await messages_with_base64_images(sample.input)}
  )
  else:
  return sample
@@ -44,13 +39,8 @@ async def sample_with_base64_images(sample: Sample) -> Sample:

  def sample_without_base64_images(sample: Sample) -> Sample:
  if isinstance(sample.input, list):
- return Sample(
- input=messages_without_base64_images(sample.input),
- target=sample.target,
- id=sample.id,
- metadata=sample.metadata,
- files=sample.files,
- choices=sample.choices,
+ return sample.model_copy(
+ update={"input": messages_without_base64_images(sample.input)}
  )
  else:
  return sample
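
Using Pydantic's model_copy(update=...) copies every Sample field and replaces only input, so fields not listed in the old constructor call are no longer dropped by these helpers. A generic illustration of the pattern with a stand-in model (not inspect_ai's Sample class):

from pydantic import BaseModel

class Record(BaseModel):
    input: str
    target: str = ""
    metadata: dict | None = None

record = Record(input="raw", target="t", metadata={"k": 1})
updated = record.model_copy(update={"input": "processed"})
assert updated.target == "t" and updated.metadata == {"k": 1}  # other fields carried over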
inspect_ai/_eval/task/log.py CHANGED
@@ -69,10 +69,11 @@ class TaskLogger:
  )
  packages = {PKG_NAME: importlib_metadata.version(PKG_NAME)}

- # remove api_key from model_args
+ # redact authentication oriented model_args
  model_args = model_args.copy()
  if "api_key" in model_args:
  del model_args["api_key"]
+ model_args = {k: v for k, v in model_args.items() if not k.startswith("aws_")}

  # cwd_relative_path for sandbox config
  if sandbox and isinstance(sandbox.config, str):
inspect_ai/_eval/task/run.py CHANGED
@@ -217,7 +217,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
  log_location=log_location,
  )

- with display().task(profile) as td:
+ with display().task(
+ profile,
+ ) as td:
  try:
  # start the log
  await log_start(logger, plan, generate_config)
@@ -252,7 +254,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:

  # track when samples complete and update progress as we go
  progress_results: list[dict[str, SampleScore]] = []
- update_metrics_display = update_metrics_display_fn(td)
+ update_metrics_display = update_metrics_display_fn(
+ td,
+ display_metrics=profile.eval_config.score_display is not False,
+ )

  def sample_complete(sample_score: dict[str, SampleScore]) -> None:
  # Capture the result
@@ -400,7 +405,10 @@ async def task_run(options: TaskRunOptions) -> EvalLog:


  def update_metrics_display_fn(
- td: TaskDisplay, initial_interval: float = 0, min_interval: float = 0.9
+ td: TaskDisplay,
+ initial_interval: float = 0,
+ min_interval: float = 0.9,
+ display_metrics: bool = True,
  ) -> Callable[
  [
  int,
@@ -420,6 +428,10 @@ def update_metrics_display_fn(
  reducers: ScoreReducer | list[ScoreReducer] | None,
  metrics: list[Metric] | dict[str, list[Metric]] | None,
  ) -> None:
+ # Don't compute metrics if they are not being displayed
+ if not display_metrics:
+ return None
+
  nonlocal next_compute_time
  time_start = time.perf_counter()
  if time_start >= next_compute_time:
@@ -568,14 +580,18 @@ async def task_run_sample(
  state = await plan(state, generate)

  except TimeoutError:
- # notify the user
- transcript()._event(
- SampleLimitEvent(
- type="time",
- message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
- limit=time_limit,
+ if time_limit is not None:
+ transcript()._event(
+ SampleLimitEvent(
+ type="time",
+ message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
+ limit=time_limit,
+ )
+ )
+ else:
+ py_logger.warning(
+ "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
  )
- )

  # capture most recent state for scoring
  state = sample_state() or state
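
update_metrics_display_fn throttles metric recomputation (at most once per min_interval seconds, after an initial_interval delay), and the new display_metrics flag short-circuits it entirely when score_display is off. A simplified, self-contained sketch of that throttling pattern, independent of Inspect's TaskDisplay and scorer types:

import time
from typing import Callable

def make_throttled_update(
    update: Callable[[int], None],
    initial_interval: float = 0,
    min_interval: float = 0.9,
    display_metrics: bool = True,
) -> Callable[[int], None]:
    # first recomputation is allowed after initial_interval, then at most once
    # per min_interval (mirrors the next_compute_time bookkeeping above)
    next_compute_time = time.perf_counter() + initial_interval

    def throttled(completed: int) -> None:
        nonlocal next_compute_time
        if not display_metrics:  # same short-circuit as the new code above
            return
        if time.perf_counter() >= next_compute_time:
            update(completed)
            next_compute_time = time.perf_counter() + min_interval

    return throttled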