inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. inspect_ai/_cli/eval.py +27 -9
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +9 -3
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/samples.py +4 -10
  9. inspect_ai/_display/textual/widgets/transcript.py +35 -18
  10. inspect_ai/_eval/eval.py +14 -2
  11. inspect_ai/_eval/evalset.py +6 -1
  12. inspect_ai/_eval/run.py +6 -0
  13. inspect_ai/_eval/task/run.py +49 -23
  14. inspect_ai/_eval/task/task.py +26 -3
  15. inspect_ai/_util/content.py +20 -1
  16. inspect_ai/_util/interrupt.py +6 -0
  17. inspect_ai/_util/logger.py +19 -0
  18. inspect_ai/_util/rich.py +7 -8
  19. inspect_ai/_util/text.py +13 -0
  20. inspect_ai/_util/transcript.py +20 -6
  21. inspect_ai/_util/working.py +50 -0
  22. inspect_ai/_view/www/App.css +6 -0
  23. inspect_ai/_view/www/dist/assets/index.css +171 -99
  24. inspect_ai/_view/www/dist/assets/index.js +5972 -2770
  25. inspect_ai/_view/www/eslint.config.mjs +24 -1
  26. inspect_ai/_view/www/log-schema.json +619 -21
  27. inspect_ai/_view/www/package.json +8 -3
  28. inspect_ai/_view/www/src/App.tsx +2 -2
  29. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  30. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  31. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  32. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  33. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  34. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  35. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  36. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  37. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  38. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  39. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  40. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  41. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  42. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  43. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  44. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  45. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  46. inspect_ai/_view/www/src/index.tsx +2 -2
  47. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  48. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  49. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  50. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
  51. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  52. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  53. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  54. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  55. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  56. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  57. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  58. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  59. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  60. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
  61. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  62. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
  63. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  64. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  65. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  66. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  67. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  68. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  69. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  70. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  72. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  73. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  74. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  75. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  76. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  77. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  78. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  79. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  80. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  83. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  84. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  85. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  86. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  87. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  88. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  89. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  90. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  91. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  92. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  93. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  94. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  95. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  96. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  97. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  98. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  99. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  100. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
  101. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  102. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  103. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
  104. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  105. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
  106. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  107. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  108. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
  109. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  110. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  111. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  112. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  113. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  114. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  115. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  116. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  117. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  118. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  119. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  120. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  121. inspect_ai/_view/www/src/types/log.d.ts +312 -137
  122. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  123. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  124. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  125. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  126. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  127. inspect_ai/_view/www/src/utils/format.ts +8 -5
  128. inspect_ai/_view/www/src/utils/json.ts +24 -0
  129. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  130. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
  131. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  132. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  133. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  134. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  135. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  136. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  137. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  138. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  139. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  140. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  141. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  142. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  143. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  144. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  145. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  146. inspect_ai/_view/www/yarn.lock +241 -5
  147. inspect_ai/log/__init__.py +2 -0
  148. inspect_ai/log/_condense.py +4 -0
  149. inspect_ai/log/_log.py +72 -12
  150. inspect_ai/log/_recorders/eval.py +6 -1
  151. inspect_ai/log/_samples.py +5 -1
  152. inspect_ai/log/_transcript.py +89 -2
  153. inspect_ai/model/__init__.py +2 -0
  154. inspect_ai/model/_call_tools.py +8 -1
  155. inspect_ai/model/_chat_message.py +22 -7
  156. inspect_ai/model/_conversation.py +11 -9
  157. inspect_ai/model/_generate_config.py +25 -4
  158. inspect_ai/model/_model.py +164 -72
  159. inspect_ai/model/_model_call.py +10 -3
  160. inspect_ai/model/_model_output.py +3 -0
  161. inspect_ai/model/_openai.py +106 -40
  162. inspect_ai/model/_providers/anthropic.py +145 -26
  163. inspect_ai/model/_providers/bedrock.py +7 -0
  164. inspect_ai/model/_providers/cloudflare.py +20 -7
  165. inspect_ai/model/_providers/google.py +29 -8
  166. inspect_ai/model/_providers/groq.py +66 -27
  167. inspect_ai/model/_providers/hf.py +6 -0
  168. inspect_ai/model/_providers/mistral.py +78 -51
  169. inspect_ai/model/_providers/openai.py +66 -4
  170. inspect_ai/model/_providers/openai_o1.py +10 -0
  171. inspect_ai/model/_providers/providers.py +2 -2
  172. inspect_ai/model/_providers/util/tracker.py +92 -0
  173. inspect_ai/model/_providers/vllm.py +13 -5
  174. inspect_ai/model/_reasoning.py +15 -2
  175. inspect_ai/scorer/_model.py +23 -19
  176. inspect_ai/solver/_basic_agent.py +1 -3
  177. inspect_ai/solver/_bridge/patch.py +0 -2
  178. inspect_ai/solver/_human_agent/agent.py +14 -10
  179. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  180. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  181. inspect_ai/solver/_limit.py +4 -4
  182. inspect_ai/solver/_plan.py +0 -3
  183. inspect_ai/solver/_task_state.py +7 -0
  184. inspect_ai/tool/__init__.py +2 -0
  185. inspect_ai/tool/_tool.py +3 -1
  186. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  187. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  191. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  192. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  196. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  197. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  198. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  199. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  200. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  201. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  202. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  203. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  204. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  205. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  206. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  207. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  208. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  209. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  210. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  211. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  212. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  214. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  215. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  216. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  217. inspect_ai/tool/_tools/_web_search.py +3 -3
  218. inspect_ai/util/__init__.py +2 -1
  219. inspect_ai/util/_concurrency.py +14 -8
  220. inspect_ai/util/_display.py +12 -0
  221. inspect_ai/util/_sandbox/context.py +15 -0
  222. inspect_ai/util/_sandbox/docker/docker.py +7 -5
  223. inspect_ai/util/_sandbox/environment.py +32 -1
  224. inspect_ai/util/_sandbox/events.py +183 -0
  225. inspect_ai/util/_sandbox/local.py +3 -3
  226. inspect_ai/util/_sandbox/self_check.py +131 -43
  227. inspect_ai/util/_subtask.py +11 -0
  228. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
  229. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
  230. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
  231. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  232. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  233. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  234. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  235. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  236. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  237. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  238. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  239. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  240. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
  241. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
  242. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/eval.py CHANGED
@@ -218,9 +218,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
218
218
  @click.option(
219
219
  "--time-limit",
220
220
  type=int,
221
- help="Limit on total execution time for each sample.",
221
+ help="Limit on total running time for each sample.",
222
222
  envvar="INSPECT_EVAL_TIME_LIMIT",
223
223
  )
224
+ @click.option(
225
+ "--working-limit",
226
+ type=int,
227
+ help="Limit on total working time (e.g. model generation, tool calls, etc.) for each sample.",
228
+ envvar="INSPECT_EVAL_WORKING_LIMIT",
229
+ )
224
230
  @click.option(
225
231
  "--fail-on-error",
226
232
  type=float,
@@ -384,15 +390,19 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
384
390
  @click.option(
385
391
  "--reasoning-effort",
386
392
  type=click.Choice(["low", "medium", "high"]),
387
- help="Constrains effort on reasoning for reasoning models. Open AI o1 models only.",
393
+ help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
388
394
  envvar="INSPECT_EVAL_REASONING_EFFORT",
389
395
  )
390
396
  @click.option(
391
- "--reasoning-history/--no-reasoning-history",
392
- type=bool,
393
- is_flag=True,
394
- default=True,
395
- help="Include reasoning in chat message history sent to generate.",
397
+ "--reasoning-tokens",
398
+ type=int,
399
+ help="Maximum number of tokens to use for reasoning. Anthropic Claude models only.",
400
+ envvar="INSPECT_EVAL_REASONING_TOKENS",
401
+ )
402
+ @click.option(
403
+ "--reasoning-history",
404
+ type=click.Choice(["none", "all", "last", "auto"]),
405
+ help='Include reasoning in chat message history sent to generate (defaults to "auto", which uses the recommended default for each provider)',
396
406
  envvar="INSPECT_EVAL_REASONING_HISTORY",
397
407
  )
398
408
  @click.option(
@@ -464,10 +474,12 @@ def eval_command(
464
474
  max_tool_output: int | None,
465
475
  cache_prompt: str | None,
466
476
  reasoning_effort: str | None,
467
- reasoning_history: bool | None,
477
+ reasoning_tokens: int | None,
478
+ reasoning_history: Literal["none", "all", "last", "auto"] | None,
468
479
  message_limit: int | None,
469
480
  token_limit: int | None,
470
481
  time_limit: int | None,
482
+ working_limit: int | None,
471
483
  max_samples: int | None,
472
484
  max_tasks: int | None,
473
485
  max_subprocesses: int | None,
@@ -518,6 +530,7 @@ def eval_command(
518
530
  message_limit=message_limit,
519
531
  token_limit=token_limit,
520
532
  time_limit=time_limit,
533
+ working_limit=working_limit,
521
534
  max_samples=max_samples,
522
535
  max_tasks=max_tasks,
523
536
  max_subprocesses=max_subprocesses,
@@ -625,10 +638,12 @@ def eval_set_command(
625
638
  max_tool_output: int | None,
626
639
  cache_prompt: str | None,
627
640
  reasoning_effort: str | None,
628
- reasoning_history: bool | None,
641
+ reasoning_tokens: int | None,
642
+ reasoning_history: Literal["none", "all", "last", "auto"] | None,
629
643
  message_limit: int | None,
630
644
  token_limit: int | None,
631
645
  time_limit: int | None,
646
+ working_limit: int | None,
632
647
  max_samples: int | None,
633
648
  max_tasks: int | None,
634
649
  max_subprocesses: int | None,
@@ -684,6 +699,7 @@ def eval_set_command(
684
699
  message_limit=message_limit,
685
700
  token_limit=token_limit,
686
701
  time_limit=time_limit,
702
+ working_limit=working_limit,
687
703
  max_samples=max_samples,
688
704
  max_tasks=max_tasks,
689
705
  max_subprocesses=max_subprocesses,
@@ -737,6 +753,7 @@ def eval_exec(
737
753
  message_limit: int | None,
738
754
  token_limit: int | None,
739
755
  time_limit: int | None,
756
+ working_limit: int | None,
740
757
  max_samples: int | None,
741
758
  max_tasks: int | None,
742
759
  max_subprocesses: int | None,
@@ -817,6 +834,7 @@ def eval_exec(
817
834
  message_limit=message_limit,
818
835
  token_limit=token_limit,
819
836
  time_limit=time_limit,
837
+ working_limit=working_limit,
820
838
  max_samples=max_samples,
821
839
  max_tasks=max_tasks,
822
840
  max_subprocesses=max_subprocesses,
@@ -143,3 +143,5 @@ class Display(Protocol):
143
143
 
144
144
  @contextlib.contextmanager
145
145
  def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]: ...
146
+
147
+ def display_counter(self, caption: str, value: str) -> None: ...
@@ -9,10 +9,12 @@ from .config import task_dict
9
9
 
10
10
 
11
11
  @throttle(1)
12
- def task_footer(style: str = "") -> tuple[RenderableType, RenderableType]:
12
+ def task_footer(
13
+ counters: dict[str, str], style: str = ""
14
+ ) -> tuple[RenderableType, RenderableType]:
13
15
  return (
14
16
  Text.from_markup(task_resources(), style=style),
15
- Text.from_markup(task_http_rate_limits(), style=style),
17
+ Text.from_markup(task_counters(counters), style=style),
16
18
  )
17
19
 
18
20
 
@@ -23,5 +25,13 @@ def task_resources() -> str:
23
25
  return task_dict(resources)
24
26
 
25
27
 
26
- def task_http_rate_limits() -> str:
28
+ def task_counters(counters: dict[str, str]) -> str:
29
+ return task_dict(counters | task_http_rate_limits())
30
+
31
+
32
+ def task_http_rate_limits() -> dict[str, str]:
33
+ return {"HTTP rate limits": f"{http_rate_limit_count():,}"}
34
+
35
+
36
+ def task_http_rate_limits_str() -> str:
27
37
  return f"HTTP rate limits: {http_rate_limit_count():,}"
@@ -22,7 +22,7 @@ from ..core.display import (
22
22
  TaskSpec,
23
23
  TaskWithResult,
24
24
  )
25
- from ..core.footer import task_http_rate_limits
25
+ from ..core.footer import task_http_rate_limits_str
26
26
  from ..core.panel import task_panel, task_targets
27
27
  from ..core.results import task_metric, tasks_results
28
28
 
@@ -89,6 +89,10 @@ class PlainDisplay(Display):
89
89
  show_model_names=self.multiple_model_names,
90
90
  )
91
91
 
92
+ def display_counter(self, caption: str, value: str) -> None:
93
+ # Not supported for plain display as counters are only shown for tasks.
94
+ pass
95
+
92
96
  def _print_results(self) -> None:
93
97
  """Print final results using rich panels"""
94
98
  panels = tasks_results(self.tasks)
@@ -178,7 +182,7 @@ class PlainTaskDisplay(TaskDisplay):
178
182
  status_parts.append(resources)
179
183
 
180
184
  # Add rate limits
181
- rate_limits = task_http_rate_limits()
185
+ rate_limits = task_http_rate_limits_str()
182
186
  if rate_limits:
183
187
  status_parts.append(rate_limits)
184
188
 
@@ -60,6 +60,7 @@ class RichDisplay(Display):
60
60
  self.parallel = False
61
61
  self.live: Live | None = None
62
62
  self.timer_handle: asyncio.TimerHandle | None = None
63
+ self.counters: dict[str, str] = {}
63
64
  rich_initialise()
64
65
 
65
66
  @override
@@ -153,13 +154,20 @@ class RichDisplay(Display):
153
154
  and self.live.is_started
154
155
  ):
155
156
  if self.parallel:
156
- r = tasks_live_status(self.total_tasks, self.tasks, self.progress_ui)
157
+ r = tasks_live_status(
158
+ self.total_tasks, self.tasks, self.progress_ui, self.counters
159
+ )
157
160
  else:
158
- r = task_live_status(self.tasks, self.progress_ui)
161
+ r = task_live_status(self.tasks, self.progress_ui, self.counters)
159
162
  self.live.update(r, refresh=True)
160
163
 
161
164
  self.timer_handle = asyncio.get_event_loop().call_later(1, self._update_display)
162
165
 
166
+ @override
167
+ def display_counter(self, caption: str, value: str) -> None:
168
+ self.counters[caption] = value
169
+ self._update_display()
170
+
163
171
 
164
172
  class RichTaskScreen(TaskScreen):
165
173
  def __init__(self, live: Live) -> None:
@@ -286,7 +294,9 @@ class RichTaskDisplay(TaskDisplay):
286
294
  self.p.complete()
287
295
 
288
296
 
289
- def task_live_status(tasks: list[TaskStatus], progress: RProgress) -> RenderableType:
297
+ def task_live_status(
298
+ tasks: list[TaskStatus], progress: RProgress, counters: dict[str, str]
299
+ ) -> RenderableType:
290
300
  theme = rich_theme()
291
301
 
292
302
  # the panel contents
@@ -300,13 +310,16 @@ def task_live_status(tasks: list[TaskStatus], progress: RProgress) -> Renderable
300
310
  show_model=len(tasks) == 1,
301
311
  body=Group("", progress),
302
312
  subtitle=subtitle,
303
- footer=task_footer(theme.light),
313
+ footer=task_footer(counters, theme.light),
304
314
  log_location=None,
305
315
  )
306
316
 
307
317
 
308
318
  def tasks_live_status(
309
- total_tasks: int, tasks: list[TaskStatus], progress: RProgress
319
+ total_tasks: int,
320
+ tasks: list[TaskStatus],
321
+ progress: RProgress,
322
+ counters: dict[str, str],
310
323
  ) -> RenderableType:
311
324
  # rendering context
312
325
  theme = rich_theme()
@@ -325,7 +338,7 @@ def tasks_live_status(
325
338
  footer_table = Table.grid(expand=True)
326
339
  footer_table.add_column()
327
340
  footer_table.add_column(justify="right")
328
- footer = task_footer(theme.light)
341
+ footer = task_footer(counters, theme.light)
329
342
  footer_table.add_row()
330
343
  footer_table.add_row(footer[0], footer[1])
331
344
 
@@ -89,6 +89,7 @@ class TaskScreenApp(App[TR]):
89
89
  self._total_tasks = 0
90
90
  self._parallel = False
91
91
  self._tasks: list[TaskWithResult] = []
92
+ self._counters: dict[str, str] = {}
92
93
 
93
94
  # all tasks processed by app
94
95
  self._app_tasks: list[TaskWithResult] = []
@@ -185,7 +186,8 @@ class TaskScreenApp(App[TR]):
185
186
  # force repaint
186
187
  self.refresh(repaint=True)
187
188
 
188
- # enable mouse support (this broke in textual 2.0 when running in VS Code)
189
+ # enable mouse support (this broke in textual 2.0 when running in VS Code
190
+ # however is fixed in textual 2.1)
189
191
  assert self.app._driver
190
192
  textual_enable_mouse_support(self.app._driver)
191
193
 
@@ -301,7 +303,7 @@ class TaskScreenApp(App[TR]):
301
303
  samples_view.set_samples(active_and_started_samples)
302
304
 
303
305
  def update_footer(self) -> None:
304
- left, right = task_footer()
306
+ left, right = task_footer(self._counters)
305
307
  footer = self.query_one(AppFooter)
306
308
  footer.left = left
307
309
  footer.right = right
@@ -315,7 +317,7 @@ class TaskScreenApp(App[TR]):
315
317
 
316
318
  def set_unread(unread: int | None) -> None:
317
319
  if unread is not None:
318
- console_tab.label = f"Console ({unread}" # type: ignore[assignment]
320
+ console_tab.label = f"Console ({unread})" # type: ignore[assignment]
319
321
  else:
320
322
  console_tab.label = "Console" # type: ignore[assignment]
321
323
 
@@ -376,6 +378,10 @@ class TaskScreenApp(App[TR]):
376
378
  except NoMatches:
377
379
  return None
378
380
 
381
+ def display_counter(self, caption: str, value: str) -> None:
382
+ self._counters[caption] = value
383
+ self.update_footer()
384
+
379
385
  class InputPanelHost(InputPanel.Host):
380
386
  def __init__(self, app: "TaskScreenApp[TR]", tab_id: str) -> None:
381
387
  self.app = app
@@ -72,3 +72,7 @@ class TextualDisplay(Display):
72
72
  def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
73
73
  with self.app.task_display(profile) as task_display:
74
74
  yield task_display
75
+
76
+ @override
77
+ def display_counter(self, caption: str, value: str) -> None:
78
+ self.app.display_counter(caption, value)
@@ -39,7 +39,7 @@ class SamplesView(Widget):
39
39
  padding: 0 1 0 1;
40
40
  layout: grid;
41
41
  grid-size: 2 3;
42
- grid-rows: auto 1fr auto;
42
+ grid-rows: auto 1fr 3;
43
43
  grid-columns: 32 1fr;
44
44
  grid-gutter: 1;
45
45
  }
@@ -141,8 +141,8 @@ class SamplesList(OptionList):
141
141
  if highlighted_sample and (highlighted_sample not in self.samples):
142
142
  self.samples.append(highlighted_sample)
143
143
 
144
- # sort the samples by execution time
145
- self.samples.sort(key=lambda sample: sample.execution_time, reverse=True)
144
+ # sort the samples by running time
145
+ self.samples.sort(key=lambda sample: sample.running_time, reverse=True)
146
146
 
147
147
  # rebuild the list
148
148
  self.clear_options()
@@ -154,9 +154,7 @@ class SamplesList(OptionList):
154
154
  table.add_column(width=1)
155
155
  task_name = Text.from_markup(f"{registry_unqualified_name(sample.task)}")
156
156
  task_name.truncate(18, overflow="ellipsis", pad=True)
157
- task_time = Text.from_markup(
158
- f"{format_progress_time(sample.execution_time)}"
159
- )
157
+ task_time = Text.from_markup(f"{format_progress_time(sample.running_time)}")
160
158
  table.add_row(task_name, task_time, " ")
161
159
  sample_id = Text.from_markup(f"id: {sample.sample.id}")
162
160
  sample_id.truncate(18, overflow="ellipsis", pad=True)
@@ -423,10 +421,6 @@ class SampleToolbar(Horizontal):
423
421
  CANCEL_DISABLED = "Cancelling sample..."
424
422
 
425
423
  DEFAULT_CSS = f"""
426
- SampleToolbar {{
427
- grid-size: 5 1;
428
- grid-columns: auto auto 1fr auto auto;
429
- }}
430
424
  SampleToolbar #{STATUS_GROUP} {{
431
425
  width: 22;
432
426
  }}
@@ -9,7 +9,7 @@ from textual.containers import ScrollableContainer
9
9
  from textual.widget import Widget
10
10
  from textual.widgets import Static
11
11
 
12
- from inspect_ai._util.content import ContentText
12
+ from inspect_ai._util.content import ContentReasoning, ContentText
13
13
  from inspect_ai._util.rich import lines_display
14
14
  from inspect_ai._util.transcript import (
15
15
  set_transcript_markdown_options,
@@ -36,7 +36,6 @@ from inspect_ai.log._transcript import (
36
36
  )
37
37
  from inspect_ai.model._chat_message import (
38
38
  ChatMessage,
39
- ChatMessageAssistant,
40
39
  ChatMessageUser,
41
40
  )
42
41
  from inspect_ai.model._render import messages_preceding_assistant
@@ -193,16 +192,29 @@ def render_model_event(event: ModelEvent) -> EventDisplay:
193
192
  return EventDisplay(f"model: {event.model}", Group(*content))
194
193
 
195
194
 
196
- def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
197
- # render sub-events
198
- display: list[EventDisplay] = []
199
- if event.events:
200
- for e in event.events:
201
- display.extend(render_event(e) or [])
195
+ def render_sub_events(events: list[Event]) -> list[RenderableType]:
196
+ content: list[RenderableType] = []
197
+ for e in events:
198
+ event_displays = render_event(e) or []
199
+ for d in event_displays:
200
+ if d.content:
201
+ content.append(Text(" "))
202
+ content.append(transcript_separator(d.title, "black", "··"))
203
+ if isinstance(d.content, Markdown):
204
+ set_transcript_markdown_options(d.content)
205
+ content.append(d.content)
202
206
 
207
+ return content
208
+
209
+
210
+ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
203
211
  # render the call
204
212
  content = transcript_tool_call(event)
205
213
 
214
+ # render sub-events
215
+ if event.events:
216
+ content.extend(render_sub_events(event.events))
217
+
206
218
  # render the output
207
219
  if isinstance(event.result, list):
208
220
  result: ToolResult = "\n".join(
@@ -220,7 +232,7 @@ def render_tool_event(event: ToolEvent) -> list[EventDisplay]:
220
232
  result = str(result).strip()
221
233
  content.extend(lines_display(result, 50))
222
234
 
223
- return display + [EventDisplay("tool call", Group(*content))]
235
+ return [EventDisplay("tool call", Group(*content))]
224
236
 
225
237
 
226
238
  def render_step_event(event: StepEvent) -> EventDisplay:
@@ -257,13 +269,13 @@ def render_score_event(event: ScoreEvent) -> EventDisplay:
257
269
 
258
270
 
259
271
  def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
272
+ # render header
273
+ content: list[RenderableType] = [transcript_function(event.name, event.input)]
274
+
260
275
  # render sub-events
261
- display: list[EventDisplay] = []
262
276
  if event.events:
263
- for e in event.events:
264
- display.extend(render_event(e) or [])
277
+ content.extend(render_sub_events(event.events))
265
278
 
266
- content: list[RenderableType] = [transcript_function(event.name, event.input)]
267
279
  if event.result:
268
280
  content.append(Text())
269
281
  if isinstance(event.result, str | int | float | bool | None):
@@ -271,7 +283,7 @@ def render_subtask_event(event: SubtaskEvent) -> list[EventDisplay]:
271
283
  else:
272
284
  content.append(render_as_json(event.result))
273
285
 
274
- return display + [EventDisplay(f"subtask: {event.name}", Group(*content))]
286
+ return [EventDisplay(f"subtask: {event.name}", Group(*content))]
275
287
 
276
288
 
277
289
  def render_input_event(event: InputEvent) -> EventDisplay:
@@ -320,11 +332,16 @@ def render_message(message: ChatMessage) -> list[RenderableType]:
320
332
  Text(),
321
333
  ]
322
334
 
323
- if isinstance(message, ChatMessageAssistant) and message.reasoning:
324
- content.extend(transcript_reasoning(message.reasoning))
325
-
326
- if message.text:
335
+ # deal with plain text or with content blocks
336
+ if isinstance(message.content, str):
327
337
  content.extend([transcript_markdown(message.text.strip(), escape=True)])
338
+ else:
339
+ for c in message.content:
340
+ if isinstance(c, ContentReasoning):
341
+ content.extend(transcript_reasoning(c))
342
+ elif isinstance(c, ContentText):
343
+ content.extend([transcript_markdown(c.text.strip(), escape=True)])
344
+
328
345
  return content
329
346
 
330
347
 
inspect_ai/_eval/eval.py CHANGED
@@ -75,6 +75,7 @@ def eval(
75
75
  message_limit: int | None = None,
76
76
  token_limit: int | None = None,
77
77
  time_limit: int | None = None,
78
+ working_limit: int | None = None,
78
79
  max_samples: int | None = None,
79
80
  max_tasks: int | None = None,
80
81
  max_subprocesses: int | None = None,
@@ -132,7 +133,10 @@ def eval(
132
133
  so they can be debugged (defaults to False).
133
134
  message_limit: Limit on total messages used for each sample.
134
135
  token_limit: Limit on total tokens used for each sample.
135
- time_limit: Limit on time (in seconds) for execution of each sample.
136
+ time_limit: Limit on clock time (in seconds) for samples.
137
+ working_limit: Limit on working time (in seconds) for samples. Working
138
+ time includes model generation, tool calls, etc. but does not include
139
+ time spent waiting on retries or shared resources.
136
140
  max_samples: Maximum number of samples to run in parallel
137
141
  (default is max_connections)
138
142
  max_tasks: Maximum number of tasks to run in parallel
@@ -186,6 +190,7 @@ def eval(
186
190
  message_limit=message_limit,
187
191
  token_limit=token_limit,
188
192
  time_limit=time_limit,
193
+ working_limit=working_limit,
189
194
  max_samples=max_samples,
190
195
  max_tasks=max_tasks,
191
196
  max_subprocesses=max_subprocesses,
@@ -227,6 +232,7 @@ async def eval_async(
227
232
  message_limit: int | None = None,
228
233
  token_limit: int | None = None,
229
234
  time_limit: int | None = None,
235
+ working_limit: int | None = None,
230
236
  max_samples: int | None = None,
231
237
  max_tasks: int | None = None,
232
238
  max_subprocesses: int | None = None,
@@ -281,7 +287,10 @@ async def eval_async(
281
287
  so they can be debugged (defaults to False).
282
288
  message_limit (int | None): Limit on total messages used for each sample.
283
289
  token_limit (int | None): Limit on total tokens used for each sample.
284
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
290
+ time_limit: Limit on clock time (in seconds) for samples.
291
+ working_limit: Limit on working time (in seconds) for sample. Working
292
+ time includes model generation, tool calls, etc. but does not include
293
+ time spent waiting on retries or shared resources.
285
294
  max_samples (int | None): Maximum number of samples to run in parallel
286
295
  (default is max_connections)
287
296
  max_tasks (int | None): Maximum number of tasks to run in parallel
@@ -395,6 +404,7 @@ async def eval_async(
395
404
  message_limit=message_limit,
396
405
  token_limit=token_limit,
397
406
  time_limit=time_limit,
407
+ working_limit=working_limit,
398
408
  max_samples=max_samples,
399
409
  max_tasks=max_tasks,
400
410
  max_subprocesses=max_subprocesses,
@@ -702,6 +712,7 @@ async def eval_retry_async(
702
712
  message_limit = eval_log.eval.config.message_limit
703
713
  token_limit = eval_log.eval.config.token_limit
704
714
  time_limit = eval_log.eval.config.time_limit
715
+ working_limit = eval_log.eval.config.working_limit
705
716
  max_samples = max_samples or eval_log.eval.config.max_samples
706
717
  max_tasks = max_tasks or eval_log.eval.config.max_tasks
707
718
  max_subprocesses = max_subprocesses or eval_log.eval.config.max_subprocesses
@@ -763,6 +774,7 @@ async def eval_retry_async(
763
774
  message_limit=message_limit,
764
775
  token_limit=token_limit,
765
776
  time_limit=time_limit,
777
+ working_limit=working_limit,
766
778
  max_samples=max_samples,
767
779
  max_tasks=max_tasks,
768
780
  max_subprocesses=max_subprocesses,
@@ -79,6 +79,7 @@ def eval_set(
79
79
  message_limit: int | None = None,
80
80
  token_limit: int | None = None,
81
81
  time_limit: int | None = None,
82
+ working_limit: int | None = None,
82
83
  max_samples: int | None = None,
83
84
  max_tasks: int | None = None,
84
85
  max_subprocesses: int | None = None,
@@ -146,7 +147,10 @@ def eval_set(
146
147
  so they can be debugged (defaults to False).
147
148
  message_limit: Limit on total messages used for each sample.
148
149
  token_limit: Limit on total tokens used for each sample.
149
- time_limit: Limit on time (in seconds) for execution of each sample.
150
+ time_limit: Limit on clock time (in seconds) for samples.
151
+ working_limit: Limit on working time (in seconds) for samples. Working
152
+ time includes model generation, tool calls, etc. but does not include
153
+ time spent waiting on retries or shared resources.
150
154
  max_samples: Maximum number of samples to run in parallel
151
155
  (default is max_connections)
152
156
  max_tasks: Maximum number of tasks to run in parallel
@@ -202,6 +206,7 @@ def eval_set(
202
206
  message_limit=message_limit,
203
207
  token_limit=token_limit,
204
208
  time_limit=time_limit,
209
+ working_limit=working_limit,
205
210
  max_samples=max_samples,
206
211
  max_tasks=max_tasks,
207
212
  max_subprocesses=max_subprocesses,
inspect_ai/_eval/run.py CHANGED
@@ -163,6 +163,12 @@ async def eval_run(
163
163
  else:
164
164
  task.time_limit = task_eval_config.time_limit
165
165
 
166
+ # sample working limit
167
+ if task_eval_config.working_limit is None:
168
+ task_eval_config.working_limit = task.working_limit
169
+ else:
170
+ task.working_limit = task_eval_config.working_limit
171
+
166
172
  # fail_on_error
167
173
  if task_eval_config.fail_on_error is None:
168
174
  task_eval_config.fail_on_error = task.fail_on_error