inspect-ai 0.3.49__py3-none-any.whl → 0.3.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/info.py +2 -2
- inspect_ai/_cli/log.py +2 -2
- inspect_ai/_cli/score.py +2 -2
- inspect_ai/_display/core/display.py +19 -0
- inspect_ai/_display/core/panel.py +37 -7
- inspect_ai/_display/core/progress.py +29 -2
- inspect_ai/_display/core/results.py +79 -40
- inspect_ai/_display/core/textual.py +21 -0
- inspect_ai/_display/rich/display.py +28 -8
- inspect_ai/_display/textual/app.py +107 -1
- inspect_ai/_display/textual/display.py +1 -1
- inspect_ai/_display/textual/widgets/samples.py +132 -91
- inspect_ai/_display/textual/widgets/task_detail.py +236 -0
- inspect_ai/_display/textual/widgets/tasks.py +74 -6
- inspect_ai/_display/textual/widgets/toggle.py +32 -0
- inspect_ai/_eval/context.py +2 -0
- inspect_ai/_eval/eval.py +4 -3
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/run.py +35 -2
- inspect_ai/_eval/task/log.py +13 -11
- inspect_ai/_eval/task/results.py +12 -3
- inspect_ai/_eval/task/run.py +139 -36
- inspect_ai/_eval/task/sandbox.py +2 -1
- inspect_ai/_util/_async.py +30 -1
- inspect_ai/_util/file.py +31 -4
- inspect_ai/_util/html.py +3 -0
- inspect_ai/_util/logger.py +6 -5
- inspect_ai/_util/platform.py +5 -6
- inspect_ai/_util/registry.py +1 -1
- inspect_ai/_view/server.py +9 -9
- inspect_ai/_view/www/App.css +2 -2
- inspect_ai/_view/www/dist/assets/index.css +2 -2
- inspect_ai/_view/www/dist/assets/index.js +352 -294
- inspect_ai/_view/www/log-schema.json +13 -0
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/components/MessageBand.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +16 -13
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -3
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +52 -77
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -13
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +15 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +4 -2
- inspect_ai/_view/www/src/types/log.d.ts +2 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +2 -0
- inspect_ai/_view/www/yarn.lock +9 -4
- inspect_ai/approval/__init__.py +1 -1
- inspect_ai/approval/_human/approver.py +35 -0
- inspect_ai/approval/_human/console.py +62 -0
- inspect_ai/approval/_human/manager.py +108 -0
- inspect_ai/approval/_human/panel.py +233 -0
- inspect_ai/approval/_human/util.py +51 -0
- inspect_ai/dataset/_sources/hf.py +2 -2
- inspect_ai/dataset/_sources/util.py +1 -1
- inspect_ai/log/_file.py +106 -36
- inspect_ai/log/_recorders/eval.py +226 -158
- inspect_ai/log/_recorders/file.py +9 -6
- inspect_ai/log/_recorders/json.py +35 -12
- inspect_ai/log/_recorders/recorder.py +15 -15
- inspect_ai/log/_samples.py +52 -0
- inspect_ai/model/_model.py +14 -0
- inspect_ai/model/_model_output.py +4 -0
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/hf.py +106 -4
- inspect_ai/model/_providers/util/__init__.py +2 -0
- inspect_ai/model/_providers/util/hf_handler.py +200 -0
- inspect_ai/scorer/_common.py +1 -1
- inspect_ai/solver/_plan.py +0 -8
- inspect_ai/solver/_task_state.py +18 -1
- inspect_ai/solver/_use_tools.py +9 -1
- inspect_ai/tool/_tool_def.py +2 -2
- inspect_ai/tool/_tool_info.py +14 -2
- inspect_ai/tool/_tool_params.py +2 -1
- inspect_ai/tool/_tools/_execute.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +6 -0
- inspect_ai/util/__init__.py +5 -6
- inspect_ai/util/_panel.py +91 -0
- inspect_ai/util/_sandbox/__init__.py +2 -6
- inspect_ai/util/_sandbox/context.py +4 -3
- inspect_ai/util/_sandbox/docker/compose.py +12 -2
- inspect_ai/util/_sandbox/docker/docker.py +19 -9
- inspect_ai/util/_sandbox/docker/util.py +10 -2
- inspect_ai/util/_sandbox/environment.py +47 -41
- inspect_ai/util/_sandbox/local.py +15 -10
- inspect_ai/util/_subprocess.py +43 -3
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/RECORD +90 -82
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
- inspect_ai/approval/_human.py +0 -123
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/top_level.txt +0 -0
inspect_ai/_display/textual/widgets/task_detail.py
ADDED
@@ -0,0 +1,236 @@
+import re
+from dataclasses import dataclass
+
+import numpy as np
+from textual.app import ComposeResult
+from textual.containers import Center, Grid, Horizontal
+from textual.reactive import Reactive, reactive
+from textual.widget import Widget
+from textual.widgets import Static
+
+from inspect_ai._display.core.display import TaskDisplayMetric
+
+
+@dataclass
+class TaskMetric:
+    name: str
+    value: float
+
+
+class TaskDetail(Widget):
+    hidden = reactive(False)
+    DEFAULT_CSS = """
+    TaskDetail {
+        background: $boost;
+        width: 100%;
+        height: auto;
+        padding: 1 0 1 0;
+    }
+    TaskDetail Grid {
+        width: 100%;
+        height: auto;
+        grid-gutter: 1 3;
+    }
+    """
+
+    def __init__(
+        self,
+        *,
+        hidden: bool = True,
+        id: str | None = None,
+        classes: str | None = None,
+    ) -> None:
+        super().__init__(id=id, classes=classes)
+        self.hidden = hidden
+        self.existing_metrics: dict[str, TaskMetrics] = {}
+        self.grid = Grid()
+        self.by_reducer: dict[str | None, dict[str, list[TaskMetric]]] = {}
+        self.metrics: list[TaskDisplayMetric] = []
+
+    def watch_hidden(self, hidden: bool) -> None:
+        """React to changes in the `visible` property."""
+        if hidden:
+            self.add_class("hidden")
+        else:
+            self.remove_class("hidden")
+
+    def compose(self) -> ComposeResult:
+        yield self.grid
+
+    def on_mount(self) -> None:
+        self.refresh_grid()
+
+    def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
+        # Group by reducer then scorer within reducers
+        self.metrics = metrics
+        for metric in metrics:
+            reducer_group = (
+                self.by_reducer[metric.reducer]
+                if metric.reducer in self.by_reducer
+                else {}
+            )
+
+            by_scorer_metrics = (
+                reducer_group[metric.scorer] if metric.scorer in reducer_group else []
+            )
+            by_scorer_metrics.append(TaskMetric(name=metric.name, value=metric.value))
+            reducer_group[metric.scorer] = by_scorer_metrics
+            self.by_reducer[metric.reducer] = reducer_group
+
+        self.refresh_grid()
+
+    def refresh_grid(self) -> None:
+        # Don't refresh the grid if not attached
+        # since we may explicitly mount new widgets
+        if not self.grid.is_attached:
+            return
+
+        # don't refresh the grid if there are no scores
+        if len(self.by_reducer) == 0:
+            return
+
+        # Compute the row and column count
+        row_count = len(self.by_reducer)
+        col_count = len(next(iter(self.by_reducer.values())))
+
+        # If this can fit in a single row, make it fit
+        # otherwise place each reducer on their own row
+        self.grid.styles.grid_columns = "auto"
+        if row_count * col_count < 4:
+            self.grid.styles.grid_size_columns = row_count * col_count
+            self.grid.styles.grid_size_rows = 1
+        else:
+            self.grid.styles.grid_size_columns = col_count
+            self.grid.styles.grid_size_rows = row_count
+
+        # In order to reduce flashing the below tracks use of widgets
+        # and updates them when possible (removing and adding them as needed)
+        # Makes keys for tracking Task Metric widgets
+        def metric_key(reducer: str | None, scorer: str) -> str:
+            reducer = reducer or "none"
+            return valid_id(f"task-{reducer}-{scorer}-tbl")
+
+        # Remove keys that are no longer present
+        existing_keys = set(self.existing_metrics.keys())
+        new_keys = set(metric_key(m.reducer, m.scorer) for m in self.metrics)
+        to_remove = existing_keys - new_keys
+        for remove in to_remove:
+            task_metric = self.existing_metrics[remove]
+            task_metric.remove()
+
+        # add or update widgets with metrics
+        for reducer, scorers in self.by_reducer.items():
+            for scorer, scores in scorers.items():
+                key = metric_key(reducer=reducer, scorer=scorer)
+                if key in self.existing_metrics:
+                    task_metrics = self.existing_metrics[key]
+                    task_metrics.update(scores)
+                else:
+                    task_metrics = TaskMetrics(
+                        id=key, scorer=scorer, reducer=reducer, metrics=scores
+                    )
+                    self.grid.mount(task_metrics)
+                    self.existing_metrics[key] = task_metrics
+
+
+class TaskMetrics(Widget):
+    DEFAULT_CSS = """
+    TaskMetrics {
+        width: auto;
+        height: auto;
+    }
+    TaskMetrics Grid {
+        width: auto;
+        grid-size: 2;
+        grid-columns: auto;
+        grid-gutter: 0 3;
+        padding: 0 2 0 2;
+    }
+    TaskMetric Center {
+        width: auto;
+    }
+    TaskMetrics Center Static {
+        width: auto;
+    }
+    TaskMetrics Center Horizontal {
+        width: auto;
+        height: auto;
+    }
+    TaskMetrics Center Horizontal Static {
+        width: auto;
+        height: auto;
+    }
+    TaskMetrics .scorer {
+        padding: 0 1 0 0;
+        text-style: bold;
+    }
+    TaskMetrics .reducer {
+        color: $foreground-darken-3;
+    }
+    """
+
+    metrics: Reactive[list[TaskMetric]] = reactive([])
+
+    def __init__(
+        self,
+        *,
+        scorer: str | None,
+        reducer: str | None,
+        metrics: list[TaskMetric],
+        id: str | None = None,
+        classes: str | None = None,
+    ) -> None:
+        super().__init__(id=id, classes=classes)
+        self.scorer = scorer
+        self.reducer = reducer
+        self.metrics = metrics
+        self.grid: Grid = Grid()
+        self.value_widgets: dict[str, Static] = {}
+
+    def compose(self) -> ComposeResult:
+        # Just yield a single DataTable widget
+        yield Center(self._title())
+        with Grid():
+            for metric in self.metrics:
+                # Add the value static but keep it around
+                # for future updates
+                self.value_widgets[metric.name] = Static(
+                    self._metric_value(metric.value)
+                )
+
+                yield Static(metric.name)
+                yield self.value_widgets[metric.name]
+
+    def update(self, metrics: list[TaskMetric]) -> None:
+        for metric in metrics:
+            widget = self.value_widgets[metric.name]
+            widget.update(content=f"{metric.value:,.3f}")
+
+    def _title(self) -> Widget:
+        if self.scorer is None:
+            return Static("")
+        elif self.reducer is None:
+            return Static(self.scorer)
+        else:
+            return Horizontal(
+                Static(self.scorer, classes="scorer"),
+                Static(f"({self.reducer})", classes="reducer"),
+            )
+
+    def _metric_value(self, val: float) -> str:
+        if np.isnan(val):
+            return " n/a "
+        else:
+            return f"{val:.3f}"
+
+
+def valid_id(identifier: str) -> str:
+    # Remove invalid characters
+    valid_part = re.sub(r"[^a-zA-Z0-9_-]", "_", identifier)
+
+    # Ensure it doesn't start with a number
+    if valid_part and valid_part[0].isdigit():
+        valid_part = "_" + valid_part
+
+    # If the string is empty return a default valid identifier
+    return valid_part or "default_identifier"
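The heart of `TaskDetail.update_metrics` above is a two-level grouping: metrics are bucketed by reducer, then by scorer within each reducer, and `refresh_grid` lays the grid out from that structure. A minimal standalone sketch of the same grouping, using a hypothetical `Metric` record in place of the real `TaskDisplayMetric` (illustration only, not inspect-ai code):

```python
from dataclasses import dataclass


@dataclass
class Metric:
    name: str
    value: float
    scorer: str
    reducer: str | None


def group_metrics(
    metrics: list[Metric],
) -> dict[str | None, dict[str, list[tuple[str, float]]]]:
    # reducer -> scorer -> [(metric name, value), ...]
    by_reducer: dict[str | None, dict[str, list[tuple[str, float]]]] = {}
    for m in metrics:
        by_reducer.setdefault(m.reducer, {}).setdefault(m.scorer, []).append(
            (m.name, m.value)
        )
    return by_reducer


print(group_metrics([
    Metric("accuracy", 0.82, scorer="choice", reducer="mean"),
    Metric("stderr", 0.03, scorer="choice", reducer="mean"),
]))
```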
inspect_ai/_display/textual/widgets/tasks.py
CHANGED
@@ -4,19 +4,25 @@ from typing import Iterator, cast
 
 from rich.console import RenderableType
 from rich.text import Text
+from textual import on
 from textual.app import ComposeResult
 from textual.containers import Container, ScrollableContainer
+from textual.css.query import NoMatches
 from textual.reactive import reactive
 from textual.widget import Widget
 from textual.widgets import ProgressBar, Static
 from typing_extensions import override
 
+from inspect_ai._display.core.results import task_metric
 from inspect_ai._display.textual.widgets.clock import Clock
+from inspect_ai._display.textual.widgets.task_detail import TaskDetail
+from inspect_ai._display.textual.widgets.toggle import Toggle
 
 from ...core.display import (
     Progress,
     TaskCancelled,
     TaskDisplay,
+    TaskDisplayMetric,
     TaskError,
     TaskResult,
     TaskSpec,
@@ -25,6 +31,7 @@ from ...core.display import (
 from ...core.progress import (
     MAX_DESCRIPTION_WIDTH,
     MAX_MODEL_NAME_WIDTH,
+    progress_count,
     progress_description,
     progress_model_name,
 )
@@ -106,9 +113,10 @@ class TaskProgressView(Widget):
         height: auto;
         width: 1fr;
         layout: grid;
-        grid-size:
-        grid-columns: auto auto auto 1fr auto;
-        grid-
+        grid-size: 8 2;
+        grid-columns: auto auto auto auto 1fr auto auto auto;
+        grid-rows: auto auto;
+        grid-gutter: 0 1;
     }
     TaskProgressView Bar {
         width: 1fr;
@@ -119,6 +127,15 @@ class TaskProgressView(Widget):
            color: $success;
        }
    }
+    #task-metrics {
+        color:$text-secondary;
+    }
+    #task-detail {
+        column-span: 8;
+    }
+    .hidden {
+        display: none;
+    }
    """
 
    def __init__(
@@ -126,12 +143,19 @@ class TaskProgressView(Widget):
    ) -> None:
        super().__init__()
        self.t = task
+
        self.description_width = description_width
        self.model_name_width = model_name_width
        self.progress_bar = ProgressBar(total=task.profile.steps, show_eta=False)
+        self.count_display = Static()
+        self.metrics_display = Static(id="task-metrics")
        self.task_progress = TaskProgress(self.progress_bar)
 
+        self.toggle = Toggle()
+        self.task_detail = TaskDetail(id="task-detail", classes="hidden")
+
    def compose(self) -> ComposeResult:
+        yield self.toggle
        yield TaskStatusIcon()
        yield Static(
            progress_description(self.t.profile, self.description_width, pad=True)
@@ -140,7 +164,15 @@ class TaskProgressView(Widget):
            progress_model_name(self.t.profile.model, self.model_name_width, pad=True)
        )
        yield self.progress_bar
+        yield self.count_display
+        yield self.metrics_display
        yield Clock()
+        yield self.task_detail
+
+    @on(Toggle.Toggled)
+    def handle_title_toggle(self, event: Toggle.Toggled) -> None:
+        self.task_detail.hidden = not self.toggle.toggled
+        event.stop()
 
    def on_mount(self) -> None:
        self.query_one(Clock).start(datetime.now().timestamp())
@@ -151,10 +183,21 @@ class TaskProgressView(Widget):
 
    def complete(self, result: TaskResult) -> None:
        self.t.result = result
-        self.query_one(TaskStatusIcon).result = result
-        self.query_one(Clock).stop()
+        try:
+            self.query_one(TaskStatusIcon).result = result
+            self.query_one(Clock).stop()
+        except NoMatches:
+            pass
        self.task_progress.complete()
 
+    def sample_complete(self, complete: int, total: int) -> None:
+        self.count_display.update(progress_count(complete, total))
+
+    def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
+        if len(metrics) > 0:
+            self.metrics_display.update(task_metric(metrics))
+            self.task_detail.update_metrics(metrics)
+
 
 class TaskStatusIcon(Static):
    result: reactive[TaskResult | None] = reactive(None)
@@ -181,13 +224,38 @@ class TaskStatusIcon(Static):
            return Text("⠿", style=running)
 
 
+MAX_PROGRESS_PERCENT = 0.02
+MIN_PROGRESS_PERCENT = 0.98
+
+
 class TaskProgress(Progress):
    def __init__(self, progress_bar: ProgressBar) -> None:
        self.progress_bar = progress_bar
+        self.current_progress = 0
+
+        # always show a minimum amount of progress
+        minimum_steps = (
+            MAX_PROGRESS_PERCENT * progress_bar.total
+            if progress_bar.total is not None
+            else 0
+        )
+        self.progress_bar.update(progress=minimum_steps)
 
    @override
    def update(self, n: int = 1) -> None:
-        self.
+        self.current_progress = self.current_progress + n
+
+        # enforce a maximum cap on task progress
+        max_progress = (
+            MIN_PROGRESS_PERCENT * self.progress_bar.total
+            if self.progress_bar.total is not None
+            else 0
+        )
+        if (
+            self.current_progress > self.progress_bar.progress
+            and self.current_progress < max_progress
+        ):
+            self.progress_bar.update(progress=self.current_progress)
 
    @override
    def complete(self) -> None:
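The rewritten `TaskProgress` keeps the visible bar from sitting at zero before any work completes and from reaching the end before `complete()` is called. A simplified, self-contained sketch of the effective clamping (illustrative only; the constant names mirror the diff, and `complete()` on the real widget advances the bar to its total):

```python
# names copied from the diff above; note the released code uses MAX_* for the
# floor and MIN_* for the cap
MAX_PROGRESS_PERCENT = 0.02  # minimum progress shown immediately
MIN_PROGRESS_PERCENT = 0.98  # progress is capped here until complete()


def visible_progress(current_steps: int, total_steps: int) -> float:
    """Approximate the progress value TaskProgress lets the bar display."""
    floor = MAX_PROGRESS_PERCENT * total_steps
    cap = MIN_PROGRESS_PERCENT * total_steps
    return max(floor, min(current_steps, cap))


assert visible_progress(0, 100) == 2.0    # never an empty bar
assert visible_progress(50, 100) == 50    # normal range passes through
assert visible_progress(99, 100) == 98.0  # held back until complete()
```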
inspect_ai/_display/textual/widgets/toggle.py
ADDED
@@ -0,0 +1,32 @@
+from textual.events import Click
+from textual.message import Message
+from textual.reactive import reactive
+from textual.widgets import Static
+
+
+class Toggle(Static, can_focus=True):
+    toggled = reactive(True)
+
+    def __init__(
+        self, on_symbol: str = "▼", off_symbol: str = "▶", toggled: bool = False
+    ) -> None:
+        super().__init__()
+
+        self.on_symbol = on_symbol
+        self.off_symbol = off_symbol
+        self.toggled = toggled
+
+    class Toggled(Message):
+        """Request toggle."""
+
+    async def _on_click(self, event: Click) -> None:
+        """Inform ancestor we want to toggle."""
+        event.stop()
+        self.toggled = not self.toggled
+        self.post_message(self.Toggled())
+
+    def _watch_toggled(self, toggled: bool) -> None:
+        if toggled:
+            self.update(self.on_symbol)
+        else:
+            self.update(self.off_symbol)
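`Toggle` is a focusable `Static` that flips a reactive flag on click and posts a `Toggled` message for an ancestor to act on, which is exactly what `TaskProgressView.handle_title_toggle` does in the tasks.py hunk above. A minimal sketch of driving it from a throwaway Textual app (assumes textual and inspect-ai 0.3.51 are installed; the import path is an internal module and may change between releases):

```python
from textual import on
from textual.app import App, ComposeResult
from textual.widgets import Static

# internal inspect-ai module added in this release; not a stable public API
from inspect_ai._display.textual.widgets.toggle import Toggle


class ToggleDemo(App[None]):
    def compose(self) -> ComposeResult:
        self.toggle = Toggle()
        self.detail = Static("details hidden")
        yield self.toggle
        yield self.detail

    @on(Toggle.Toggled)
    def handle_toggled(self, event: Toggle.Toggled) -> None:
        # mirror TaskProgressView: show/hide detail based on the toggle state
        self.detail.update(
            "details shown" if self.toggle.toggled else "details hidden"
        )
        event.stop()


if __name__ == "__main__":
    ToggleDemo().run()
```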
inspect_ai/_eval/context.py
CHANGED
@@ -1,6 +1,7 @@
 from inspect_ai._util.dotenv import init_dotenv
 from inspect_ai._util.hooks import init_hooks
 from inspect_ai._util.logger import init_http_rate_limit_count, init_logger
+from inspect_ai.approval._human.manager import init_human_approval_manager
 from inspect_ai.log._samples import init_active_samples
 from inspect_ai.model import GenerateConfig, Model
 from inspect_ai.model._model import init_active_model, init_model_usage
@@ -20,6 +21,7 @@ def init_eval_context(
     init_http_rate_limit_count()
     init_hooks()
     init_active_samples()
+    init_human_approval_manager()
 
 
 def init_task_context(model: Model, config: GenerateConfig = GenerateConfig()) -> None:
inspect_ai/_eval/eval.py
CHANGED
@@ -21,7 +21,8 @@ from inspect_ai.approval._policy import (
     approval_policies_from_config,
     config_from_approval_policies,
 )
-from inspect_ai.log import EvalConfig, EvalLog, EvalLogInfo
+from inspect_ai.log import EvalConfig, EvalLog, EvalLogInfo
+from inspect_ai.log._file import read_eval_log_async
 from inspect_ai.log._recorders import create_recorder_for_format
 from inspect_ai.model import (
     GenerateConfig,
@@ -600,9 +601,9 @@ async def eval_retry_async(
                 task
                 if isinstance(task, EvalLog)
                 else (
-
+                    await read_eval_log_async(task.name)
                     if isinstance(task, EvalLogInfo)
-                    else
+                    else await read_eval_log_async(task)
                 )
             )
             for task in tasks
inspect_ai/_eval/loader.py
CHANGED
@@ -198,7 +198,7 @@ def resolve_task_sandbox(
                 break
 
     # resolve relative paths
-    if resolved_sandbox.config
+    if isinstance(resolved_sandbox.config, str):
         file_path = Path(resolved_sandbox.config)
         if not file_path.is_absolute():
             file_path = Path(task_run_dir(task)) / file_path
inspect_ai/_eval/run.py
CHANGED
@@ -12,9 +12,10 @@ from inspect_ai._display.core.active import (
     init_task_screen,
 )
 from inspect_ai._display.core.display import TaskSpec
-from inspect_ai._util.error import exception_message
+from inspect_ai._util.error import PrerequisiteError, exception_message
 from inspect_ai._util.path import chdir
 from inspect_ai._util.registry import registry_unqualified_name
+from inspect_ai.dataset._dataset import Dataset
 from inspect_ai.log import EvalConfig, EvalLog
 from inspect_ai.log._recorders import Recorder
 from inspect_ai.model import GenerateConfigArgs
@@ -23,6 +24,7 @@ from inspect_ai.scorer._reducer import ScoreReducer, reducer_log_names
 from inspect_ai.scorer._reducer.registry import validate_reducer
 from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util._sandbox.environment import (
+    SandboxEnvironmentConfigType,
     SandboxEnvironmentSpec,
     SandboxEnvironmentType,
     TaskCleanup,
@@ -149,6 +151,9 @@ async def eval_run(
             if sample.id is None:
                 sample.id = id + 1
 
+        # Ensure sample ids are unique
+        ensure_unique_ids(task.dataset)
+
         # create and track the logger
         logger = TaskLogger(
             task_name=task.name,
@@ -168,6 +173,7 @@ async def eval_run(
             metadata=task.metadata,
             recorder=recorder,
         )
+        await logger.init()
 
         # append task
         task_run_options.append(
@@ -287,6 +293,12 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
                 await task
                 result = task.result()
                 results.append(result)
+            except Exception as ex:
+                # errors generally don't escape from tasks (the exception being if an error
+                # occurs during the final write of the log)
+                log.error(
+                    f"Task '{task_options.task.name}' encountered an error during finalisation: {ex}"
+                )
 
             # tracking
             tasks_completed += 1
@@ -340,7 +352,7 @@ async def startup_sandbox_environments(
             sandboxenvs.add(sandbox)
 
     # initialiase sandboxenvs (track cleanups)
-    cleanups: list[tuple[TaskCleanup,
+    cleanups: list[tuple[TaskCleanup, SandboxEnvironmentConfigType | None, str]] = []
     with display().suspend_task_app():
         for sandboxenv in sandboxenvs:
             # find type
@@ -377,3 +389,24 @@ def task_specs(tasks: list[TaskRunOptions]) -> list[TaskSpec]:
         TaskSpec(registry_unqualified_name(task.task.name), ModelName(task.model))
         for task in tasks
     ]
+
+
+def ensure_unique_ids(dataset: Dataset) -> None:
+    """
+    Validates that all samples in the dataset have unique IDs.
+
+    Raises a error if duplicates are found.
+
+    Args:
+        dataset (Datatset): The dataset
+
+    Raises:
+        PrerequisiteError: If duplicate IDs are found in the dataset.
+    """
+    seen_ids = set()
+    for sample in dataset:
+        if sample.id in seen_ids:
+            raise PrerequisiteError(
+                f"The dataset contains duplicate sample ids (duplicate id: {sample.id}). Please ensure each sample has a unique id."
+            )
+        seen_ids.add(sample.id)
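`ensure_unique_ids` runs once per task before the logger is created, so duplicate sample ids now fail fast with a `PrerequisiteError` instead of producing colliding log entries. A hedged illustration of the kind of dataset that would now be rejected at eval startup, built with the public `MemoryDataset`/`Sample` API (the ids and prompts are invented for the example):

```python
from inspect_ai.dataset import MemoryDataset, Sample

# two samples sharing id=1; running eval() on a task that uses this dataset
# would now raise PrerequisiteError during eval_run (per the diff above)
dataset = MemoryDataset(
    [
        Sample(id=1, input="What is 2 + 2?", target="4"),
        Sample(id=1, input="What is 3 + 3?", target="6"),  # duplicate id
    ]
)
```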
inspect_ai/_eval/task/log.py
CHANGED
@@ -75,7 +75,7 @@ class TaskLogger:
                 del model_args["api_key"]
 
         # cwd_relative_path for sandbox config
-        if sandbox and sandbox.config:
+        if sandbox and isinstance(sandbox.config, str):
             sandbox = SandboxEnvironmentSpec(
                 sandbox.type, cwd_relative_path(sandbox.config)
             )
@@ -118,7 +118,6 @@ class TaskLogger:
 
         # stack recorder and location
         self.recorder = recorder
-        self._location = self.recorder.log_init(self.eval)
 
         # number of samples logged
         self._samples_completed = 0
@@ -127,6 +126,9 @@ class TaskLogger:
         self.flush_buffer = eval_config.log_buffer or recorder.default_log_buffer()
         self.flush_pending = 0
 
+    async def init(self) -> None:
+        self._location = await self.recorder.log_init(self.eval)
+
     @property
     def location(self) -> str:
         return self._location
@@ -135,25 +137,25 @@ class TaskLogger:
     def samples_completed(self) -> int:
         return self._samples_completed
 
-    def log_start(self, plan: EvalPlan) -> None:
-        self.recorder.log_start(self.eval, plan)
+    async def log_start(self, plan: EvalPlan) -> None:
+        await self.recorder.log_start(self.eval, plan)
 
-    def log_sample(self, sample: EvalSample, *, flush: bool) -> None:
+    async def log_sample(self, sample: EvalSample, *, flush: bool) -> None:
         # log the sample
-        self.recorder.log_sample(self.eval, sample)
+        await self.recorder.log_sample(self.eval, sample)
 
         # flush if requested
         if flush:
             self.flush_pending += 1
             if self.flush_pending >= self.flush_buffer:
-                self.recorder.flush(self.eval)
+                await self.recorder.flush(self.eval)
                 self.flush_pending = 0
 
         # track sucessful samples logged
         if sample.error is None:
             self._samples_completed += 1
 
-    def log_finish(
+    async def log_finish(
         self,
         status: Literal["success", "cancelled", "error"],
         stats: EvalStats,
@@ -161,12 +163,12 @@ class TaskLogger:
         reductions: list[EvalSampleReductions] | None = None,
         error: EvalError | None = None,
     ) -> EvalLog:
-        return self.recorder.log_finish(
+        return await self.recorder.log_finish(
             self.eval, status, stats, results, reductions, error
         )
 
 
-def log_start(
+async def log_start(
     logger: TaskLogger,
     plan: Plan,
     config: GenerateConfig,
@@ -185,7 +187,7 @@ def log_start(
     if plan.finish:
         eval_plan.steps.append(eval_plan_step(plan.finish))
 
-    logger.log_start(eval_plan)
+    await logger.log_start(eval_plan)
 
 
 def collect_eval_data(stats: EvalStats) -> None:
inspect_ai/_eval/task/results.py
CHANGED
@@ -175,7 +175,10 @@ def scorer_for_metrics(
         )
 
         # process metric values
-
+        if len(scores) > 0:
+            metric_value = metric(scores)
+        else:
+            metric_value = float("Nan")
         base_metric_name = registry_log_name(metric)
 
         # If the metric value is a dictionary, turn each of the entries
@@ -233,7 +236,9 @@ def scorers_from_metric_dict(
     results: list[EvalScore] = []
 
     # Expand any metric keys
-    resolved_metrics =
+    resolved_metrics = (
+        resolve_glob_metric_keys(metrics, scores[0]) if len(scores) > 0 else metrics
+    )
 
     for metric_key, metric_list in resolved_metrics.items():
         # filter scores to a list of scalars with the value of the metric name
@@ -258,9 +263,13 @@ def scorers_from_metric_dict(
         for target_metric in metric_list:
             # compute the metric value
             metric_name = registry_log_name(target_metric)
+            if len(metric_scores) > 0:
+                value = target_metric(metric_scores)
+            else:
+                value = float("Nan")
             result_metrics[metric_name] = EvalMetric(
                 name=metric_name,
-                value=cast(float,
+                value=cast(float, value),
             )
 
             # create a scorer result for this metric
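The results.py change returns `float("Nan")` when a metric has no scores to aggregate instead of failing outright; the new `TaskMetrics._metric_value` shown earlier then renders such values as " n/a ". A trivial check of that round trip (illustration only):

```python
import math

value = float("Nan")  # what scorer_for_metrics now produces when scores is empty
print(math.isnan(value))                                 # True
print(" n/a " if math.isnan(value) else f"{value:.3f}")  # displayed like TaskMetrics does
```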