inspect-ai 0.3.68__py3-none-any.whl → 0.3.70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. inspect_ai/_cli/eval.py +13 -1
  2. inspect_ai/_display/plain/display.py +9 -11
  3. inspect_ai/_display/textual/app.py +5 -5
  4. inspect_ai/_display/textual/widgets/samples.py +47 -18
  5. inspect_ai/_display/textual/widgets/transcript.py +25 -12
  6. inspect_ai/_eval/eval.py +14 -2
  7. inspect_ai/_eval/evalset.py +6 -1
  8. inspect_ai/_eval/run.py +6 -0
  9. inspect_ai/_eval/task/run.py +44 -15
  10. inspect_ai/_eval/task/task.py +26 -3
  11. inspect_ai/_util/interrupt.py +15 -0
  12. inspect_ai/_util/logger.py +23 -0
  13. inspect_ai/_util/rich.py +7 -8
  14. inspect_ai/_util/text.py +301 -1
  15. inspect_ai/_util/transcript.py +10 -2
  16. inspect_ai/_util/working.py +46 -0
  17. inspect_ai/_view/www/dist/assets/index.css +56 -12
  18. inspect_ai/_view/www/dist/assets/index.js +905 -751
  19. inspect_ai/_view/www/log-schema.json +337 -2
  20. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  21. inspect_ai/_view/www/node_modules/flatted/python/test.py +63 -0
  22. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  23. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +0 -1
  24. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  25. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +28 -1
  26. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  27. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +23 -2
  28. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +1 -1
  29. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -0
  30. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  31. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +152 -0
  32. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +9 -2
  33. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +19 -1
  34. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  35. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  36. inspect_ai/_view/www/src/types/log.d.ts +188 -108
  37. inspect_ai/_view/www/src/utils/format.ts +7 -4
  38. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +9 -6
  39. inspect_ai/log/__init__.py +2 -0
  40. inspect_ai/log/_condense.py +1 -0
  41. inspect_ai/log/_log.py +72 -12
  42. inspect_ai/log/_samples.py +5 -5
  43. inspect_ai/log/_transcript.py +31 -1
  44. inspect_ai/model/_call_tools.py +1 -1
  45. inspect_ai/model/_conversation.py +1 -1
  46. inspect_ai/model/_model.py +35 -16
  47. inspect_ai/model/_model_call.py +10 -3
  48. inspect_ai/model/_providers/anthropic.py +13 -2
  49. inspect_ai/model/_providers/bedrock.py +7 -0
  50. inspect_ai/model/_providers/cloudflare.py +20 -7
  51. inspect_ai/model/_providers/google.py +358 -302
  52. inspect_ai/model/_providers/groq.py +57 -23
  53. inspect_ai/model/_providers/hf.py +6 -0
  54. inspect_ai/model/_providers/mistral.py +81 -52
  55. inspect_ai/model/_providers/openai.py +9 -0
  56. inspect_ai/model/_providers/providers.py +6 -6
  57. inspect_ai/model/_providers/util/tracker.py +92 -0
  58. inspect_ai/model/_providers/vllm.py +13 -5
  59. inspect_ai/solver/_basic_agent.py +1 -3
  60. inspect_ai/solver/_bridge/patch.py +0 -2
  61. inspect_ai/solver/_limit.py +4 -4
  62. inspect_ai/solver/_plan.py +3 -3
  63. inspect_ai/solver/_solver.py +3 -0
  64. inspect_ai/solver/_task_state.py +10 -1
  65. inspect_ai/tool/_tools/_web_search.py +3 -3
  66. inspect_ai/util/_concurrency.py +14 -8
  67. inspect_ai/util/_sandbox/context.py +15 -0
  68. inspect_ai/util/_sandbox/docker/cleanup.py +8 -3
  69. inspect_ai/util/_sandbox/docker/compose.py +5 -9
  70. inspect_ai/util/_sandbox/docker/docker.py +20 -6
  71. inspect_ai/util/_sandbox/docker/util.py +10 -1
  72. inspect_ai/util/_sandbox/environment.py +32 -1
  73. inspect_ai/util/_sandbox/events.py +149 -0
  74. inspect_ai/util/_sandbox/local.py +3 -3
  75. inspect_ai/util/_sandbox/self_check.py +2 -1
  76. inspect_ai/util/_subprocess.py +4 -1
  77. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/METADATA +5 -5
  78. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/RECORD +82 -74
  79. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/LICENSE +0 -0
  80. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/WHEEL +0 -0
  81. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/entry_points.txt +0 -0
  82. {inspect_ai-0.3.68.dist-info → inspect_ai-0.3.70.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -218,9 +218,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--time-limit",
         type=int,
-        help="Limit on total execution time for each sample.",
+        help="Limit on total running time for each sample.",
         envvar="INSPECT_EVAL_TIME_LIMIT",
     )
+    @click.option(
+        "--working-limit",
+        type=int,
+        help="Limit on total working time (e.g. model generation, tool calls, etc.) for each sample.",
+        envvar="INSPECT_EVAL_WORKING_LIMIT",
+    )
     @click.option(
         "--fail-on-error",
         type=float,
@@ -468,6 +474,7 @@ def eval_command(
    message_limit: int | None,
    token_limit: int | None,
    time_limit: int | None,
+   working_limit: int | None,
    max_samples: int | None,
    max_tasks: int | None,
    max_subprocesses: int | None,
@@ -518,6 +525,7 @@ def eval_command(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -629,6 +637,7 @@ def eval_set_command(
    message_limit: int | None,
    token_limit: int | None,
    time_limit: int | None,
+   working_limit: int | None,
    max_samples: int | None,
    max_tasks: int | None,
    max_subprocesses: int | None,
@@ -684,6 +693,7 @@ def eval_set_command(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -737,6 +747,7 @@ def eval_exec(
    message_limit: int | None,
    token_limit: int | None,
    time_limit: int | None,
+   working_limit: int | None,
    max_samples: int | None,
    max_tasks: int | None,
    max_subprocesses: int | None,
@@ -817,6 +828,7 @@ def eval_exec(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
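
For illustration, a minimal self-contained sketch (not the inspect_ai CLI itself) of how a click option like the new --working-limit behaves, including its environment-variable fallback; the command name and output are hypothetical:

import click

@click.command()
@click.option(
    "--working-limit",
    type=int,
    help="Limit on total working time (in seconds) for each sample.",
    envvar="INSPECT_EVAL_WORKING_LIMIT",
)
def demo(working_limit: int | None) -> None:
    # None means no limit was supplied via the flag or the environment variable
    click.echo(f"working_limit={working_limit}")

if __name__ == "__main__":
    demo()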
inspect_ai/_display/plain/display.py CHANGED
@@ -119,14 +119,14 @@ class PlainTaskDisplay(TaskDisplay):
         self.samples_complete = 0
         self.samples_total = 0
         self.current_metrics: list[TaskDisplayMetric] | None = None
-        self.last_progress = 0  # Track last progress percentage
+        self.last_progress = 0

     @contextlib.contextmanager
     def progress(self) -> Iterator[Progress]:
         self.progress_display = PlainProgress(self.task.profile.steps)
         yield self.progress_display

-    @throttle(1)
+    @throttle(5)
     def _print_status_throttled(self) -> None:
         self._print_status()

@@ -135,13 +135,8 @@ class PlainTaskDisplay(TaskDisplay):
         if not self.progress_display:
             return

-        # Calculate current progress percentage
-        current_progress = int(
-            self.progress_display.current / self.progress_display.total * 100
-        )
-
-        # Only print on percentage changes to avoid too much output
-        if current_progress != self.last_progress:
+        # Only print when step count changes to avoid too much output
+        if self.progress_display.current != self.last_progress:
            status_parts: list[str] = []

            # if this is parallel print task and model to distinguish (limit both to 12 chars)
@@ -154,8 +149,11 @@ class PlainTaskDisplay(TaskDisplay):
            )

            # Add step progress
+           progress_percent = int(
+               self.progress_display.current / self.progress_display.total * 100
+           )
            status_parts.append(
-               f"Steps: {self.progress_display.current:3d}/{self.progress_display.total} {current_progress:3d}%"
+               f"Steps: {self.progress_display.current:3d}/{self.progress_display.total} {progress_percent:3d}%"
            )

            # Add sample progress
@@ -187,7 +185,7 @@ class PlainTaskDisplay(TaskDisplay):
            # Print on new line
            print(" | ".join(status_parts))

-           self.last_progress = current_progress
+           self.last_progress = self.progress_display.current

     def sample_complete(self, complete: int, total: int) -> None:
         self.samples_complete = complete
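
A rough standalone sketch (hypothetical class, not the PlainTaskDisplay code above) of the revised printing policy: status output is throttled and a line is emitted only when the completed step count changes, with the percentage derived at print time:

import time

class StepStatusPrinter:
    def __init__(self, total_steps: int) -> None:
        self.total = total_steps
        self.current = 0
        self.last_printed = -1   # last step count that was printed
        self.last_time = 0.0     # monotonic time of the last print

    def advance(self, steps: int = 1) -> None:
        self.current += steps
        now = time.monotonic()
        # throttle to one line every 5 seconds and only print on step-count changes
        if now - self.last_time >= 5 and self.current != self.last_printed:
            percent = int(self.current / self.total * 100)
            print(f"Steps: {self.current:3d}/{self.total} {percent:3d}%")
            self.last_printed = self.current
            self.last_time = now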
inspect_ai/_display/textual/app.py CHANGED
@@ -13,7 +13,6 @@ from typing import (

 import rich
 from rich.console import Console
-from rich.text import Text
 from textual.app import App, ComposeResult
 from textual.binding import Binding, BindingType
 from textual.css.query import NoMatches
@@ -186,7 +185,8 @@ class TaskScreenApp(App[TR]):
        # force repaint
        self.refresh(repaint=True)

-       # enable mouse support (this broke in textual 2.0 when running in VS Code)
+       # enable mouse support (this broke in textual 2.0 when running in VS Code
+       # however is fixed in textual 2.1)
        assert self.app._driver
        textual_enable_mouse_support(self.app._driver)

@@ -316,9 +316,9 @@ class TaskScreenApp(App[TR]):

        def set_unread(unread: int | None) -> None:
            if unread is not None:
-               console_tab.label = Text(f"Console ({unread}")
+               console_tab.label = f"Console ({unread})"  # type: ignore[assignment]
            else:
-               console_tab.label = Text("Console")
+               console_tab.label = "Console"  # type: ignore[assignment]

        self.watch(console_view, "unread", set_unread)

@@ -385,7 +385,7 @@ class TaskScreenApp(App[TR]):
    def set_title(self, title: str) -> None:
        tabs = self.app.query_one(TabbedContent)
        tab = tabs.get_tab(self.tab_id)
-       tab.label = Text(title)
+       tab.label = title  # type: ignore[assignment]

    def activate(self) -> None:
        # show the tab
inspect_ai/_display/textual/widgets/samples.py CHANGED
@@ -6,6 +6,7 @@ from rich.table import Table
 from rich.text import Text
 from textual.app import ComposeResult
 from textual.containers import Horizontal, HorizontalGroup, Vertical, VerticalGroup
+from textual.css.query import NoMatches
 from textual.reactive import reactive
 from textual.widget import Widget
 from textual.widgets import (
@@ -38,7 +39,7 @@ class SamplesView(Widget):
        padding: 0 1 0 1;
        layout: grid;
        grid-size: 2 3;
-       grid-rows: auto 1fr auto;
+       grid-rows: auto 1fr 3;
        grid-columns: 32 1fr;
        grid-gutter: 1;
    }
@@ -61,7 +62,10 @@ class SamplesView(Widget):
        )

    async def notify_active(self, active: bool) -> None:
-       await self.query_one(TranscriptView).notify_active(active)
+       try:
+           await self.query_one(TranscriptView).notify_active(active)
+       except NoMatches:
+           pass

    def set_samples(self, samples: list[ActiveSample]) -> None:
        # throttle to no more than 1 second per 100 samples
@@ -137,8 +141,8 @@ class SamplesList(OptionList):
        if highlighted_sample and (highlighted_sample not in self.samples):
            self.samples.append(highlighted_sample)

-       # sort the samples by execution time
-       self.samples.sort(key=lambda sample: sample.execution_time, reverse=True)
+       # sort the samples by running time
+       self.samples.sort(key=lambda sample: sample.running_time, reverse=True)

        # rebuild the list
        self.clear_options()
@@ -150,9 +154,7 @@ class SamplesList(OptionList):
            table.add_column(width=1)
            task_name = Text.from_markup(f"{registry_unqualified_name(sample.task)}")
            task_name.truncate(18, overflow="ellipsis", pad=True)
-           task_time = Text.from_markup(
-               f"{format_progress_time(sample.execution_time)}"
-           )
+           task_time = Text.from_markup(f"{format_progress_time(sample.running_time)}")
            table.add_row(task_name, task_time, " ")
            sample_id = Text.from_markup(f"id: {sample.sample.id}")
            sample_id.truncate(18, overflow="ellipsis", pad=True)
@@ -408,11 +410,17 @@ class SampleToolbar(Horizontal):
    PENDING_STATUS = "pending_status"
    PENDING_CAPTION = "pending_caption"

+   TIMEOUT_TOOL_CALL_ENABLED = (
+       "Cancel the tool call and report a timeout to the model."
+   )
+   TIMEOUT_TOOL_CALL_DISABLED = "Cancelling tool call..."
+   CANCEL_SCORE_OUTPUT_ENABLED = (
+       "Cancel the sample and score whatever output has been generated so far."
+   )
+   CANCEL_RAISE_ERROR_ENABLED = "Cancel the sample and raise an error"
+   CANCEL_DISABLED = "Cancelling sample..."
+
    DEFAULT_CSS = f"""
-   SampleToolbar {{
-       grid-size: 5 1;
-       grid-columns: auto auto 1fr auto auto;
-   }}
    SampleToolbar #{STATUS_GROUP} {{
        width: 22;
    }}
@@ -445,18 +453,18 @@ class SampleToolbar(Horizontal):
        yield Button(
            Text("Timeout Tool"),
            id=self.TIMEOUT_TOOL_CALL,
-           tooltip="Cancel the tool call and report a timeout to the model.",
+           tooltip=self.TIMEOUT_TOOL_CALL_ENABLED,
        )
        yield Horizontal()
        yield Button(
            Text("Cancel (Score)"),
            id=self.CANCEL_SCORE_OUTPUT,
-           tooltip="Cancel the sample and score whatever output has been generated so far.",
+           tooltip=self.CANCEL_SCORE_OUTPUT_ENABLED,
        )
        yield Button(
            Text("Cancel (Error)"),
            id=self.CANCEL_RAISE_ERROR,
-           tooltip="Cancel the sample and raise an error (task will exit unless fail_on_error is set)",
+           tooltip=self.CANCEL_RAISE_ERROR_ENABLED,
        )

    def on_mount(self) -> None:
@@ -475,14 +483,26 @@ class SampleToolbar(Horizontal):
            )
            if isinstance(last_event, ToolEvent):
                last_event._cancel()
-       elif event.button.id == self.CANCEL_SCORE_OUTPUT:
-           self.sample.interrupt("score")
-       elif event.button.id == self.CANCEL_RAISE_ERROR:
-           self.sample.interrupt("error")
+               event.button.disabled = True
+               event.button.tooltip = self.TIMEOUT_TOOL_CALL_DISABLED
+       else:
+           if event.button.id == self.CANCEL_SCORE_OUTPUT:
+               self.sample.interrupt("score")
+           elif event.button.id == self.CANCEL_RAISE_ERROR:
+               self.sample.interrupt("error")
+           cancel_score_output = self.query_one("#" + self.CANCEL_SCORE_OUTPUT)
+           cancel_score_output.disabled = True
+           cancel_score_output.tooltip = self.CANCEL_DISABLED
+           cancel_with_error = self.query_one("#" + self.CANCEL_RAISE_ERROR)
+           cancel_with_error.disabled = True
+           cancel_with_error.tooltip = self.CANCEL_DISABLED

    async def sync_sample(self, sample: ActiveSample | None) -> None:
        from inspect_ai.log._transcript import ModelEvent

+       # is it a new sample?
+       new_sample = sample != self.sample
+
        # track the sample
        self.sample = sample

@@ -499,6 +519,13 @@ class SampleToolbar(Horizontal):
            cancel_score_output.display = True
            cancel_with_error.display = not sample.fails_on_error

+           # if its a new sample then reset enabled states
+           if new_sample:
+               cancel_score_output.disabled = False
+               cancel_score_output.tooltip = self.CANCEL_SCORE_OUTPUT_ENABLED
+               cancel_with_error.disabled = False
+               cancel_with_error.tooltip = self.CANCEL_RAISE_ERROR_ENABLED
+
            # if we have a pending event then start the clock and show pending status
            last_event = (
                sample.transcript.events[-1]
@@ -520,6 +547,8 @@ class SampleToolbar(Horizontal):
                )

                timeout_tool.display = isinstance(last_event, ToolEvent)
+               timeout_tool.disabled = False
+               timeout_tool.tooltip = self.TIMEOUT_TOOL_CALL_ENABLED

                clock.start(last_event.timestamp.timestamp())
            else:
inspect_ai/_display/textual/widgets/transcript.py CHANGED
@@ -193,16 +193,29 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
    return EventDisplay(f"model: {event.model}", Group(*content))


-def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
-   # render sub-events
-   display: list[EventDisplay] = []
-   if event.events:
-       for e in event.events:
-           display.extend(render_event(e) or [])
+def render_sub_events(events: list[Event]) -> list[RenderableType]:
+   content: list[RenderableType] = []
+   for e in events:
+       event_displays = render_event(e) or []
+       for d in event_displays:
+           if d.content:
+               content.append(Text(" "))
+               content.append(transcript_separator(d.title, "black", "··"))
+               if isinstance(d.content, Markdown):
+                   set_transcript_markdown_options(d.content)
+               content.append(d.content)
+
+   return content
+

+def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
    # render the call
    content = transcript_tool_call(event)

+   # render sub-events
+   if event.events:
+       content.extend(render_sub_events(event.events))
+
    # render the output
    if isinstance(event.result, list):
        result: ToolResult = "\n".join(
@@ -220,7 +233,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
        result = str(result).strip()
        content.extend(lines_display(result, 50))

-   return display + [EventDisplay("tool call", Group(*content))]
+   return [EventDisplay("tool call", Group(*content))]


 def render_step_event(event: StepEvent) -> EventDisplay:
@@ -257,13 +270,13 @@ def render_score_event(event: ScoreEvent) -> EventDisplay:


 def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
+   # render header
+   content: list[RenderableType] = [transcript_function(event.name, event.input)]
+
    # render sub-events
-   display: list[EventDisplay] = []
    if event.events:
-       for e in event.events:
-           display.extend(render_event(e) or [])
+       content.extend(render_sub_events(event.events))

-   content: list[RenderableType] = [transcript_function(event.name, event.input)]
    if event.result:
        content.append(Text())
        if isinstance(event.result, str | int | float | bool | None):
@@ -271,7 +284,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
        else:
            content.append(render_as_json(event.result))

-   return display + [EventDisplay(f"subtask: {event.name}", Group(*content))]
+   return [EventDisplay(f"subtask: {event.name}", Group(*content))]


 def render_input_event(event: InputEvent) -> EventDisplay:
inspect_ai/_eval/eval.py CHANGED
@@ -75,6 +75,7 @@ def eval(
    message_limit: int | None = None,
    token_limit: int | None = None,
    time_limit: int | None = None,
+   working_limit: int | None = None,
    max_samples: int | None = None,
    max_tasks: int | None = None,
    max_subprocesses: int | None = None,
@@ -132,7 +133,10 @@ def eval(
            so they can be debugged (defaults to False).
        message_limit: Limit on total messages used for each sample.
        token_limit: Limit on total tokens used for each sample.
-       time_limit: Limit on time (in seconds) for execution of each sample.
+       time_limit: Limit on clock time (in seconds) for samples.
+       working_limit: Limit on working time (in seconds) for sample. Working
+           time includes model generation, tool calls, etc. but does not include
+           time spent waiting on retries or shared resources.
        max_samples: Maximum number of samples to run in parallel
            (default is max_connections)
        max_tasks: Maximum number of tasks to run in parallel
@@ -186,6 +190,7 @@ def eval(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -227,6 +232,7 @@ async def eval_async(
    message_limit: int | None = None,
    token_limit: int | None = None,
    time_limit: int | None = None,
+   working_limit: int | None = None,
    max_samples: int | None = None,
    max_tasks: int | None = None,
    max_subprocesses: int | None = None,
@@ -281,7 +287,10 @@ async def eval_async(
            so they can be debugged (defaults to False).
        message_limit (int | None): Limit on total messages used for each sample.
        token_limit (int | None): Limit on total tokens used for each sample.
-       time_limit (int | None): Limit on time (in seconds) for execution of each sample.
+       time_limit: Limit on clock time (in seconds) for samples.
+       working_limit: Limit on working time (in seconds) for sample. Working
+           time includes model generation, tool calls, etc. but does not include
+           time spent waiting on retries or shared resources.
        max_samples (int | None): Maximum number of samples to run in parallel
            (default is max_connections)
        max_tasks (int | None): Maximum number of tasks to run in parallel
@@ -395,6 +404,7 @@ async def eval_async(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
@@ -702,6 +712,7 @@ async def eval_retry_async(
    message_limit = eval_log.eval.config.message_limit
    token_limit = eval_log.eval.config.token_limit
    time_limit = eval_log.eval.config.time_limit
+   working_limit = eval_log.eval.config.working_limit
    max_samples = max_samples or eval_log.eval.config.max_samples
    max_tasks = max_tasks or eval_log.eval.config.max_tasks
    max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
@@ -763,6 +774,7 @@ async def eval_retry_async(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
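
A hedged usage sketch of the new parameter (the task file and model string below are placeholders, not part of this release): combine a wall-clock limit with a working-time limit when calling eval().

from inspect_ai import eval

logs = eval(
    "my_task.py",                # hypothetical task file
    model="openai/gpt-4o-mini",  # any configured model
    time_limit=900,              # clock time per sample, in seconds
    working_limit=300,           # generation + tool calls; waiting on retries excluded
)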
inspect_ai/_eval/evalset.py CHANGED
@@ -79,6 +79,7 @@ def eval_set(
    message_limit: int | None = None,
    token_limit: int | None = None,
    time_limit: int | None = None,
+   working_limit: int | None = None,
    max_samples: int | None = None,
    max_tasks: int | None = None,
    max_subprocesses: int | None = None,
@@ -146,7 +147,10 @@ def eval_set(
            so they can be debugged (defaults to False).
        message_limit: Limit on total messages used for each sample.
        token_limit: Limit on total tokens used for each sample.
-       time_limit: Limit on time (in seconds) for execution of each sample.
+       time_limit: Limit on clock time (in seconds) for samples.
+       working_limit: Limit on working time (in seconds) for sample. Working
+           time includes model generation, tool calls, etc. but does not include
+           time spent waiting on retries or shared resources.
        max_samples: Maximum number of samples to run in parallel
            (default is max_connections)
        max_tasks: Maximum number of tasks to run in parallel
@@ -202,6 +206,7 @@ def eval_set(
        message_limit=message_limit,
        token_limit=token_limit,
        time_limit=time_limit,
+       working_limit=working_limit,
        max_samples=max_samples,
        max_tasks=max_tasks,
        max_subprocesses=max_subprocesses,
inspect_ai/_eval/run.py CHANGED
@@ -163,6 +163,12 @@ async def eval_run(
    else:
        task.time_limit = task_eval_config.time_limit

+   # sample execution limit
+   if task_eval_config.working_limit is None:
+       task_eval_config.working_limit = task.working_limit
+   else:
+       task.working_limit = task_eval_config.working_limit
+
    # fail_on_error
    if task_eval_config.fail_on_error is None:
        task_eval_config.fail_on_error = task.fail_on_error
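
A minimal sketch of the defaulting rule applied above (an illustrative helper, not part of the package): an eval-level working_limit overrides the task's own value, otherwise the task default is promoted into the eval config so it gets recorded with the log.

def resolve_working_limit(
    config_value: int | None, task_value: int | None
) -> tuple[int | None, int | None]:
    # returns (config_value, task_value) after reconciliation
    if config_value is None:
        config_value = task_value
    else:
        task_value = config_value
    return config_value, task_value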
inspect_ai/_eval/task/run.py CHANGED
@@ -33,6 +33,10 @@ from inspect_ai._util.registry import (
    registry_unqualified_name,
 )
 from inspect_ai._util.timeouts import Timeout, timeout
+from inspect_ai._util.working import (
+   init_sample_working_limit,
+   sample_waiting_time,
+)
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -56,6 +60,7 @@ from inspect_ai.log._transcript import (
    SampleInitEvent,
    SampleLimitEvent,
    ScoreEvent,
+   StepEvent,
    transcript,
 )
 from inspect_ai.model import (
@@ -182,9 +187,9 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
    if isinstance(solver, Plan):
        plan = solver
    elif isinstance(solver, Chain):
-       plan = Plan(list(solver), internal=True)
+       plan = Plan(list(solver), cleanup=task.cleanup, internal=True)
    else:
-       plan = Plan(unroll(solver), internal=True)
+       plan = Plan(unroll(solver), cleanup=task.cleanup, internal=True)

    # add setup solver(s) if specified
    if task.setup:
@@ -308,6 +313,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
                or config.fail_on_error is True
            ),
            time_limit=config.time_limit,
+           working_limit=config.working_limit,
            semaphore=sample_semaphore,
        )
        for (sample, state) in zip(samples, states)
@@ -500,6 +506,7 @@ async def task_run_sample(
    sample_complete: Callable[[dict[str, SampleScore]], None],
    fails_on_error: bool,
    time_limit: int | None,
+   working_limit: int | None,
    semaphore: asyncio.Semaphore | None,
 ) -> dict[str, SampleScore] | None:
    # if there is an existing sample then tick off its progress, log it, and return it
@@ -570,19 +577,37 @@ async def task_run_sample(
            message_limit=state.message_limit,
            token_limit=state.token_limit,
            time_limit=time_limit,
+           working_limit=working_limit,
            fails_on_error=fails_on_error,
            transcript=sample_transcript,
        ) as active,
    ):
+       start_time: float | None = None
        error: EvalError | None = None
        raise_error: BaseException | None = None
        results: dict[str, SampleScore] = {}
        try:
+           # begin init
+           transcript()._event(StepEvent(action="begin", name="init"))
+
+           # sample init event (remove file bodies as they have content or absolute paths)
+           event_sample = sample.model_copy(
+               update=dict(files={k: "" for k in sample.files.keys()})
+               if sample.files
+               else None
+           )
+           transcript()._event(
+               SampleInitEvent(sample=event_sample, state=state_jsonable(state))
+           )
+
            async with sandboxenv_cm:
                try:
                    # update active sample wth sandboxes now that we are initialised
                    active.sandboxes = await sandbox_connections()

+                   # end init
+                   transcript()._event(StepEvent(action="end", name="init"))
+
                    # initialise timeout context manager
                    timeout_cm = (
                        timeout(time_limit)
@@ -590,23 +615,15 @@ async def task_run_sample(
                        else contextlib.nullcontext()
                    )

+                   # record start time
+                   start_time = time.monotonic()
+                   init_sample_working_limit(start_time, working_limit)
+
                    # run sample w/ optional timeout
                    async with timeout_cm:
                        # mark started
                        active.started = datetime.now().timestamp()

-                       # sample init event (remove file bodies as they have content or absolute paths)
-                       event_sample = sample.model_copy(
-                           update=dict(files={k: "" for k in sample.files.keys()})
-                           if sample.files
-                           else None
-                       )
-                       transcript()._event(
-                           SampleInitEvent(
-                               sample=event_sample, state=state_jsonable(state)
-                           )
-                       )
-
                        # set progress for plan then run it
                        state = await plan(state, generate)

@@ -661,11 +678,13 @@ async def task_run_sample(

                    # capture most recent state for scoring
                    state = ex.state or sample_state() or state
-                   state.completed = True

                except BaseException as ex:
                    error, raise_error = handle_error(ex)

+               # mark completed
+               state.completed = True
+
                # set timeout for scoring. if the original timeout was hit we still
                # want to provide opportunity for scoring, but we don't necessarily
                # want to wait the full timeout again (especially in the case where
@@ -768,6 +787,7 @@ async def task_run_sample(

            # log the sample
            await log_sample(
+               start_time=start_time,
                logger=logger,
                sample=sample,
                state=state,
@@ -788,6 +808,7 @@ async def task_run_sample(


 async def log_sample(
+   start_time: float | None,
    logger: TaskLogger,
    sample: Sample,
    state: TaskState,
@@ -804,6 +825,9 @@ async def log_sample(

    # construct sample for logging

+   # compute total time if we can
+   total_time = time.monotonic() - start_time if start_time is not None else None
+
    # if a limit was hit, note that in the Eval Sample
    limit = None
    for e in transcript().events:
@@ -827,8 +851,13 @@ async def log_sample(
        output=state.output,
        scores={k: v.score for k, v in scores.items()},
        store=dict(state.store.items()),
+       uuid=state.uuid,
        events=list(transcript().events),
        model_usage=sample_model_usage(),
+       total_time=round(total_time, 3) if total_time is not None else None,
+       working_time=round(total_time - sample_waiting_time(), 3)
+       if total_time is not None
+       else None,
        error=error,
        limit=limit,
    )
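
A simplified sketch of the timing accounting above, under the assumption that waiting time (retries, shared resources) is accumulated separately; the helper names here are illustrative only, while the real implementation lives in inspect_ai._util.working:

import time

_waiting_time = 0.0

def record_waiting(seconds: float) -> None:
    # called whenever the sample is waiting rather than working
    global _waiting_time
    _waiting_time += seconds

def sample_times(start_time: float) -> tuple[float, float]:
    # total_time is wall-clock time since the sample started;
    # working_time subtracts the accumulated waiting time
    total_time = time.monotonic() - start_time
    working_time = total_time - _waiting_time
    return round(total_time, 3), round(working_time, 3)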