PyPI - inspect-ai - Versions diffs - 0.3.98__py3-none-any.whl → 0.3.100__py3-none-any.whl - Mend

inspect-ai 0.3.98__py3-none-any.whl → 0.3.100__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131) hide show

inspect_ai/__init__.py +2 -0
inspect_ai/_cli/log.py +1 -1
inspect_ai/_display/core/config.py +11 -5
inspect_ai/_display/core/panel.py +66 -2
inspect_ai/_display/core/textual.py +5 -2
inspect_ai/_display/plain/display.py +1 -0
inspect_ai/_display/rich/display.py +2 -2
inspect_ai/_display/textual/widgets/transcript.py +41 -1
inspect_ai/_eval/run.py +12 -4
inspect_ai/_eval/score.py +2 -4
inspect_ai/_eval/task/log.py +1 -1
inspect_ai/_eval/task/run.py +59 -81
inspect_ai/_eval/task/task.py +1 -1
inspect_ai/_util/_async.py +1 -1
inspect_ai/_util/content.py +11 -6
inspect_ai/_util/interrupt.py +2 -2
inspect_ai/_util/text.py +7 -0
inspect_ai/_util/working.py +8 -37
inspect_ai/_view/__init__.py +0 -0
inspect_ai/_view/schema.py +3 -1
inspect_ai/_view/view.py +14 -0
inspect_ai/_view/www/CLAUDE.md +15 -0
inspect_ai/_view/www/dist/assets/index.css +273 -169
inspect_ai/_view/www/dist/assets/index.js +20079 -17019
inspect_ai/_view/www/log-schema.json +122 -8
inspect_ai/_view/www/package.json +5 -1
inspect_ai/_view/www/src/@types/log.d.ts +20 -2
inspect_ai/_view/www/src/app/App.tsx +1 -15
inspect_ai/_view/www/src/app/appearance/icons.ts +4 -1
inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +24 -6
inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +0 -5
inspect_ai/_view/www/src/app/content/RenderedContent.tsx +221 -205
inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +2 -1
inspect_ai/_view/www/src/app/log-view/tabs/SamplesTab.tsx +5 -0
inspect_ai/_view/www/src/app/routing/url.ts +84 -4
inspect_ai/_view/www/src/app/samples/InlineSampleDisplay.module.css +0 -5
inspect_ai/_view/www/src/app/samples/SampleDialog.module.css +1 -1
inspect_ai/_view/www/src/app/samples/SampleDisplay.module.css +7 -0
inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +26 -19
inspect_ai/_view/www/src/app/samples/SampleSummaryView.module.css +1 -2
inspect_ai/_view/www/src/app/samples/chat/ChatMessage.tsx +8 -6
inspect_ai/_view/www/src/app/samples/chat/ChatMessageRow.tsx +0 -4
inspect_ai/_view/www/src/app/samples/chat/ChatViewVirtualList.tsx +3 -2
inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +2 -0
inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +2 -0
inspect_ai/_view/www/src/app/samples/chat/messages.ts +1 -0
inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +1 -0
inspect_ai/_view/www/src/app/samples/list/SampleRow.tsx +1 -1
inspect_ai/_view/www/src/app/samples/scores/SampleScoresGrid.module.css +2 -2
inspect_ai/_view/www/src/app/samples/transcript/ErrorEventView.tsx +2 -3
inspect_ai/_view/www/src/app/samples/transcript/InfoEventView.tsx +1 -1
inspect_ai/_view/www/src/app/samples/transcript/InputEventView.tsx +1 -2
inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.module.css +1 -1
inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
inspect_ai/_view/www/src/app/samples/transcript/SampleInitEventView.tsx +1 -1
inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +3 -2
inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.tsx +4 -5
inspect_ai/_view/www/src/app/samples/transcript/ScoreEventView.tsx +1 -1
inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +1 -2
inspect_ai/_view/www/src/app/samples/transcript/StepEventView.tsx +1 -3
inspect_ai/_view/www/src/app/samples/transcript/SubtaskEventView.tsx +1 -2
inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +3 -4
inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.module.css +42 -0
inspect_ai/_view/www/src/app/samples/transcript/TranscriptPanel.tsx +77 -0
inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualList.tsx +27 -71
inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +13 -3
inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.tsx +27 -2
inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.module.css +1 -0
inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +21 -22
inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.module.css +45 -0
inspect_ai/_view/www/src/app/samples/transcript/outline/OutlineRow.tsx +223 -0
inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.module.css +10 -0
inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +258 -0
inspect_ai/_view/www/src/app/samples/transcript/outline/tree-visitors.ts +187 -0
inspect_ai/_view/www/src/app/samples/transcript/state/StateEventRenderers.tsx +8 -1
inspect_ai/_view/www/src/app/samples/transcript/state/StateEventView.tsx +3 -4
inspect_ai/_view/www/src/app/samples/transcript/transform/hooks.ts +78 -0
inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +340 -135
inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +3 -0
inspect_ai/_view/www/src/app/samples/transcript/types.ts +2 -0
inspect_ai/_view/www/src/app/types.ts +5 -1
inspect_ai/_view/www/src/client/api/api-browser.ts +2 -2
inspect_ai/_view/www/src/components/LiveVirtualList.tsx +6 -1
inspect_ai/_view/www/src/components/MarkdownDiv.tsx +1 -1
inspect_ai/_view/www/src/components/PopOver.tsx +422 -0
inspect_ai/_view/www/src/components/PulsingDots.module.css +9 -9
inspect_ai/_view/www/src/components/PulsingDots.tsx +4 -1
inspect_ai/_view/www/src/components/StickyScroll.tsx +183 -0
inspect_ai/_view/www/src/components/TabSet.tsx +4 -0
inspect_ai/_view/www/src/state/hooks.ts +52 -2
inspect_ai/_view/www/src/state/logSlice.ts +4 -3
inspect_ai/_view/www/src/state/samplePolling.ts +8 -0
inspect_ai/_view/www/src/state/sampleSlice.ts +53 -9
inspect_ai/_view/www/src/state/scrolling.ts +152 -0
inspect_ai/_view/www/src/utils/attachments.ts +7 -0
inspect_ai/_view/www/src/utils/python.ts +18 -0
inspect_ai/_view/www/yarn.lock +269 -6
inspect_ai/agent/_react.py +12 -7
inspect_ai/agent/_run.py +46 -11
inspect_ai/analysis/beta/_dataframe/samples/table.py +19 -18
inspect_ai/log/_bundle.py +5 -3
inspect_ai/log/_log.py +3 -3
inspect_ai/log/_recorders/file.py +2 -9
inspect_ai/log/_transcript.py +1 -1
inspect_ai/model/_call_tools.py +6 -2
inspect_ai/model/_openai.py +1 -1
inspect_ai/model/_openai_responses.py +78 -39
inspect_ai/model/_openai_web_search.py +31 -0
inspect_ai/model/_providers/anthropic.py +3 -6
inspect_ai/model/_providers/azureai.py +72 -3
inspect_ai/model/_providers/openai.py +2 -1
inspect_ai/model/_providers/providers.py +1 -1
inspect_ai/scorer/_metric.py +1 -2
inspect_ai/solver/_task_state.py +2 -2
inspect_ai/tool/_tool.py +6 -2
inspect_ai/tool/_tool_def.py +27 -4
inspect_ai/tool/_tool_info.py +2 -0
inspect_ai/tool/_tools/_web_search/_google.py +15 -4
inspect_ai/tool/_tools/_web_search/_tavily.py +35 -12
inspect_ai/tool/_tools/_web_search/_web_search.py +214 -45
inspect_ai/util/__init__.py +6 -0
inspect_ai/util/_json.py +3 -0
inspect_ai/util/_limit.py +374 -141
inspect_ai/util/_sandbox/docker/compose.py +20 -11
inspect_ai/util/_span.py +1 -1
{inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/METADATA +3 -3
{inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/RECORD +131 -117
{inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/WHEEL +1 -1
{inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/licenses/LICENSE +0 -0
{inspect_ai-0.3.98.dist-info → inspect_ai-0.3.100.dist-info}/top_level.txt +0 -0

inspect_ai/__init__.py CHANGED Viewed

@@ -10,6 +10,7 @@ from inspect_ai._eval.score import score, score_async
 from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
 from inspect_ai._eval.task.tasks import Tasks
 from inspect_ai._util.constants import PKG_NAME
+from inspect_ai._view.view import view
 from inspect_ai.agent._human.agent import human_cli
 from inspect_ai.solver._human_agent import human_agent
@@ -32,4 +33,5 @@ __all__ = [
     "TaskInfo",
     "task",
     "task_with",
+    "view",
 ]

inspect_ai/_cli/log.py CHANGED Viewed

@@ -199,6 +199,6 @@ def view_resource(file: str) -> str:
 def view_type_resource(file: str) -> str:
-    resource = PKG_PATH / "_view" / "www" / "src" / "types" / file
+    resource = PKG_PATH / "_view" / "www" / "src" / "@types" / file
     with open(resource, "r", encoding="utf-8") as f:
         return f.read()

inspect_ai/_display/core/config.py CHANGED Viewed

@@ -1,4 +1,8 @@
+from rich.console import RenderableType
+from rich.text import Text
 from inspect_ai._util.registry import is_model_dict, is_registry_dict
+from inspect_ai._util.text import truncate_text
 from inspect_ai.log._log import eval_config_defaults
 from .display import TaskProfile
@@ -6,7 +10,7 @@ from .display import TaskProfile
 def task_config(
     profile: TaskProfile, generate_config: bool = True, style: str = ""
-) -> str:
+) -> RenderableType:
     # merge config
     # wind params back for display
     task_args = dict(profile.task_args)
@@ -39,15 +43,17 @@ def task_config(
         elif name not in ["limit", "model", "response_schema", "log_shared"]:
             if isinstance(value, list):
                 value = ",".join([str(v) for v in value])
+            elif isinstance(value, dict):
+                value = "{...}"
             if isinstance(value, str):
+                value = truncate_text(value, 50)
                 value = value.replace("[", "\\[")
             config_print.append(f"{name}: {value}")
     values = ", ".join(config_print)
     if values:
-        if style:
-            return f"[{style}]{values}[/{style}]"
-        else:
-            return values
+        values_text = Text(values, style=style)
+        values_text.truncate(500, overflow="ellipsis")
+        return values_text
     else:
         return ""

inspect_ai/_display/core/panel.py CHANGED Viewed

@@ -9,6 +9,7 @@ from rich.text import Text
 from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
 from inspect_ai._util.path import cwd_relative_path
 from inspect_ai._util.registry import registry_unqualified_name
+from inspect_ai.util._display import display_type
 from .display import TaskProfile
 from .rich import is_vscode_notebook, rich_theme
@@ -24,7 +25,13 @@ def task_panel(
     | None,
     footer: RenderableType | tuple[RenderableType, RenderableType] | None,
     log_location: str | None,
-) -> Panel:
+) -> RenderableType:
+    # dispatch to plain handler if we are in plain mode
+    if display_type() == "plain":
+        return task_panel_plain(
+            profile, show_model, body, subtitle, footer, log_location
+        )
     # rendering context
     theme = rich_theme()
     console = rich.get_console()
@@ -93,7 +100,7 @@ def task_panel(
     # create panel w/ title
     panel = Panel(
         root,
-        title=f"[bold][{theme.meta}]{task_title(profile, show_model)}[/{theme.meta}][/bold]",
+        title=task_panel_title(profile, show_model),
         title_align="left",
         width=width,
         expand=True,
@@ -101,6 +108,63 @@ def task_panel(
     return panel
+def task_panel_plain(
+    profile: TaskProfile,
+    show_model: bool,
+    body: RenderableType,
+    subtitle: RenderableType
+    | str
+    | Tuple[RenderableType | str, RenderableType | str]
+    | None,
+    footer: RenderableType | tuple[RenderableType, RenderableType] | None,
+    log_location: str | None,
+) -> RenderableType:
+    # delimiter text
+    delimeter = "---------------------------------------------------------"
+    # root table for output
+    table = Table.grid(expand=False)
+    table.add_column()
+    table.add_row(delimeter)
+    # title and subtitle
+    table.add_row(task_panel_title(profile, show_model))
+    if isinstance(subtitle, tuple):
+        subtitle = subtitle[0]
+    table.add_row(subtitle)
+    # task info
+    if body:
+        table.add_row(body)
+    # footer
+    if isinstance(footer, tuple):
+        footer = footer[0]
+    if footer:
+        table.add_row(footer)
+    # log location
+    if log_location:
+        # Print a cwd relative path
+        try:
+            log_location_relative = cwd_relative_path(log_location, walk_up=True)
+        except ValueError:
+            log_location_relative = log_location
+        table.add_row(f"Log: {log_location_relative}")
+    table.add_row(delimeter)
+    table.add_row("")
+    return table
+def task_panel_title(profile: TaskProfile, show_model: bool) -> str:
+    theme = rich_theme()
+    return (
+        f"[bold][{theme.meta}]{task_title(profile, show_model)}[/{theme.meta}][/bold]"
+    )
 def to_renderable(item: RenderableType | str, style: str = "") -> RenderableType:
     if isinstance(item, str):
         return Text.from_markup(item, style=style)

inspect_ai/_display/core/textual.py CHANGED Viewed

@@ -8,8 +8,6 @@ logger = getLogger(__name__)
 # force mouse support for textual -- this works around an issue where
 # mouse events are disabled after a reload of the vs code ide, see:
 #   https://github.com/Textualize/textual/issues/5380
-# ansi codes for enabling mouse support are idempotent so it is fine
-# to do this even in cases where mouse support is already enabled.
 # we try/catch since we aren't 100% sure there aren't cases where doing
 # this won't raise and we'd rather not fail hard in in these case
 def textual_enable_mouse_support(driver: Driver) -> None:
@@ -17,5 +15,10 @@ def textual_enable_mouse_support(driver: Driver) -> None:
     if enable_mouse_support:
         try:
             enable_mouse_support()
+            # Re-enable SGR-Pixels format if it was previously enabled.
+            # See #1943.
+            enable_mouse_pixels = getattr(driver, "_enable_mouse_pixels", None)
+            if enable_mouse_pixels and getattr(driver, "_mouse_pixels", False):
+                enable_mouse_pixels()
         except Exception as ex:
             logger.warning(f"Error enabling mouse support: {ex}")

inspect_ai/_display/plain/display.py CHANGED Viewed

@@ -208,3 +208,4 @@ class PlainTaskDisplay(TaskDisplay):
     def complete(self, result: TaskResult) -> None:
         self.task.result = result
         self._print_status()
+        print("")

inspect_ai/_display/rich/display.py CHANGED Viewed

@@ -341,8 +341,6 @@ def tasks_live_status(
     # get config
     config = task_config(tasks[0].profile, generate_config=False, style=theme.light)
-    if config:
-        config += "\n"
     # build footer table
     footer_table = Table.grid(expand=True)
@@ -356,6 +354,8 @@ def tasks_live_status(
     layout_table = Table.grid(expand=True)
     layout_table.add_column()
     layout_table.add_row(config)
+    if config:
+        layout_table.add_row("")
     layout_table.add_row(progress)
     layout_table.add_row(footer_table)

inspect_ai/_display/textual/widgets/transcript.py CHANGED Viewed

@@ -84,6 +84,7 @@ class TranscriptView(ScrollableContainer):
                 scroll_to_end = (
                     new_sample or abs(self.scroll_y - self.max_scroll_y) <= 20
                 )
                 async with self.batch():
                     await self.remove_children()
                     await self.mount_all(
@@ -100,9 +101,32 @@ class TranscriptView(ScrollableContainer):
         else:
             self._pending_sample = sample
-    def _widgets_for_events(self, events: Sequence[Event]) -> list[Widget]:
+    def _widgets_for_events(
+        self, events: Sequence[Event], limit: int = 10
+    ) -> list[Widget]:
         widgets: list[Widget] = []
+        # filter the events to the <limit> most recent
+        filtered_events = events
+        if len(events) > limit:
+            filtered_events = filtered_events[-limit:]
+        # find the sample init event
+        sample_init: SampleInitEvent | None = None
         for event in events:
+            if isinstance(event, SampleInitEvent):
+                sample_init = event
+                break
+        # add the sample init event if it isn't already in the event list
+        if sample_init and sample_init not in filtered_events:
+            filtered_events = [sample_init] + list(filtered_events)
+        # compute how many events we filtered out
+        filtered_count = len(events) - len(filtered_events)
+        showed_filtered_count = False
+        for event in filtered_events:
             display = render_event(event)
             if display:
                 for d in display:
@@ -118,6 +142,22 @@ class TranscriptView(ScrollableContainer):
                             set_transcript_markdown_options(d.content)
                         widgets.append(Static(d.content, markup=False))
                         widgets.append(Static(Text(" ")))
+                        if not showed_filtered_count and filtered_count > 0:
+                            showed_filtered_count = True
+                            widgets.append(
+                                Static(
+                                    transcript_separator(
+                                        f"{filtered_count} events..."
+                                        if filtered_count > 1
+                                        else "1 event...",
+                                        self.app.current_theme.primary,
+                                    )
+                                )
+                            )
+                            widgets.append(Static(Text(" ")))
         return widgets

inspect_ai/_eval/run.py CHANGED Viewed

@@ -298,10 +298,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
     # setup pending tasks, queue, and results
     pending_tasks = tasks.copy()
-    results: list[EvalLog] = []
+    results: list[tuple[int, EvalLog]] = []
     tasks_completed = 0
     total_tasks = len(tasks)
+    # Create a mapping from task to its original index
+    task_to_original_index = {id(task): i for i, task in enumerate(tasks)}
     # produce/consume tasks
     send_channel, receive_channel = anyio.create_memory_object_stream[TaskRunOptions](
         parallel * 2
@@ -322,7 +325,7 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
             # among those models, pick one with the least usage
             model = min(models_with_pending, key=lambda m: model_counts[m])
-            # now we know there’s at least one pending task for this model so it’s safe to pick it
+            # now we know there's at least one pending task for this model so it's safe to pick it
             next_task = next(t for t in pending_tasks if str(t.model) == model)
             pending_tasks.remove(next_task)
             model_counts[str(next_task.model)] += 1
@@ -339,6 +342,8 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
             nonlocal tasks_completed
             async for task_options in receive_channel:
                 result: EvalLog | None = None
+                # Get the original index of this task
+                original_index = task_to_original_index[id(task_options)]
                 # run the task
                 try:
@@ -354,11 +359,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
                             # see: https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result
                             def create_task_runner(
                                 options: TaskRunOptions = task_options,
+                                idx: int = original_index,
                             ) -> Callable[[], Awaitable[None]]:
                                 async def run_task() -> None:
                                     nonlocal result
                                     result = await task_run(options)
-                                    results.append(result)
+                                    # Store result with its original index
+                                    results.append((idx, result))
                                 return run_task
@@ -426,7 +433,8 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
             clear_task_screen()
-        return results
+        # Sort results by original index and return just the values
+        return [r for _, r in sorted(results)]
 def resolve_task_sample_ids(

inspect_ai/_eval/score.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import functools
 from copy import deepcopy
 from pathlib import Path
-from typing import Any, Callable, Literal, cast
+from typing import Any, Callable, Literal
 import anyio
@@ -270,9 +270,7 @@ def metrics_from_log(log: EvalLog) -> list[Metric] | dict[str, list[Metric]] | N
 def metric_from_log(metric: EvalMetricDefinition) -> Metric:
-    return cast(
-        Metric, registry_create("metric", metric.name, **(metric.options or {}))
-    )
+    return registry_create("metric", metric.name, **(metric.options or {}))
 def reducers_from_log(log: EvalLog) -> list[ScoreReducer] | None:

inspect_ai/_eval/task/log.py CHANGED Viewed

@@ -56,7 +56,7 @@ class TaskLogger:
     def __init__(
         self,
         task_name: str,
-        task_version: int,
+        task_version: int | str,
         task_file: str | None,
         task_registry_name: str | None,
         task_id: str | None,

inspect_ai/_eval/task/run.py CHANGED Viewed

@@ -35,11 +35,7 @@ from inspect_ai._util.registry import (
     registry_log_name,
     registry_unqualified_name,
 )
-from inspect_ai._util.working import (
-    end_sample_working_limit,
-    init_sample_working_limit,
-    sample_waiting_time,
-)
+from inspect_ai._util.working import init_sample_working_time, sample_waiting_time
 from inspect_ai._view.notify import view_notify_eval
 from inspect_ai.dataset import Dataset, Sample
 from inspect_ai.log import (
@@ -90,6 +86,8 @@ from inspect_ai.solver._fork import set_task_generate
 from inspect_ai.solver._solver import Solver
 from inspect_ai.solver._task_state import sample_state, set_sample_state, state_jsonable
 from inspect_ai.util._limit import LimitExceededError
+from inspect_ai.util._limit import time_limit as create_time_limit
+from inspect_ai.util._limit import working_limit as create_working_limit
 from inspect_ai.util._sandbox.context import sandbox_connections
 from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 from inspect_ai.util._span import span
@@ -635,10 +633,6 @@ async def task_run_sample(
             )
             async with sandboxenv_cm:
-                timeout_cm: (
-                    contextlib._GeneratorContextManager[anyio.CancelScope]
-                    | contextlib.nullcontext[None]
-                ) = contextlib.nullcontext()
                 try:
                     # update active sample wth sandboxes now that we are initialised
                     # (ensure that we still exit init context in presence of sandbox error)
@@ -647,19 +641,17 @@ async def task_run_sample(
                     finally:
                         await init_span.__aexit__(None, None, None)
-                    # initialise timeout context manager
-                    timeout_cm = (
-                        anyio.fail_after(time_limit)
-                        if time_limit is not None
-                        else contextlib.nullcontext()
-                    )
                     # record start time
                     start_time = time.monotonic()
-                    init_sample_working_limit(start_time, working_limit)
-                    # run sample w/ optional timeout
-                    with timeout_cm, state._token_limit, state._message_limit:
+                    init_sample_working_time(start_time)
+                    # run sample w/ optional limits
+                    with (
+                        state._token_limit,
+                        state._message_limit,
+                        create_time_limit(time_limit),
+                        create_working_limit(working_limit),
+                    ):
                         # mark started
                         active.started = datetime.now().timestamp()
@@ -675,24 +667,15 @@ async def task_run_sample(
                             )
                         # set progress for plan then run it
-                        state = await plan(state, generate)
-                    # disable sample working limit after execution
-                    end_sample_working_limit()
+                        async with span("solvers"):
+                            state = await plan(state, generate)
                 except TimeoutError:
-                    if time_limit is not None:
-                        transcript()._event(
-                            SampleLimitEvent(
-                                type="time",
-                                message=f"Sample completed: exceeded time limit ({time_limit:,} seconds)",
-                                limit=time_limit,
-                            )
-                        )
-                    else:
-                        py_logger.warning(
-                            "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
-                        )
+                    # Scoped time limits manifest themselves as LimitExceededError, not
+                    # TimeoutError.
+                    py_logger.warning(
+                        "Unexpected timeout error reached top of sample stack. Are you handling TimeoutError when applying timeouts?"
+                    )
                     # capture most recent state for scoring
                     state = sample_state() or state
@@ -737,54 +720,59 @@ async def task_run_sample(
                 # the cause of the timeout is a hung container and scoring requires
                 # interacting with the container). as a middle ground we use half
                 # of the original timeout value for scoring.
-                if time_limit is not None:
-                    timeout_cm = anyio.fail_after(time_limit / 2)
+                scoring_time_limit = time_limit / 2 if time_limit else None
                 set_sample_state(state)
                 # scoring
                 try:
                     # timeout during scoring will result in an ordinary sample error
-                    with timeout_cm:
+                    with create_time_limit(scoring_time_limit):
                         if error is None:
-                            for scorer in scorers or []:
-                                scorer_name = unique_scorer_name(
-                                    scorer, list(results.keys())
-                                )
-                                async with span(name=scorer_name, type="scorer"):
-                                    score_result = (
-                                        await scorer(state, Target(sample.target))
-                                        if scorer
-                                        else None
+                            async with span(name="scorers"):
+                                for scorer in scorers or []:
+                                    scorer_name = unique_scorer_name(
+                                        scorer, list(results.keys())
                                     )
-                                    if score_result is not None:
-                                        sample_score = SampleScore(
-                                            score=score_result,
-                                            sample_id=sample.id,
-                                            sample_metadata=sample.metadata,
-                                            scorer=registry_unqualified_name(scorer),
+                                    async with span(name=scorer_name, type="scorer"):
+                                        score_result = (
+                                            await scorer(state, Target(sample.target))
+                                            if scorer
+                                            else None
+                                        )
+                                        if score_result is not None:
+                                            sample_score = SampleScore(
+                                                score=score_result,
+                                                sample_id=sample.id,
+                                                sample_metadata=sample.metadata,
+                                                scorer=registry_unqualified_name(
+                                                    scorer
+                                                ),
+                                            )
+                                            transcript()._event(
+                                                ScoreEvent(
+                                                    score=score_result,
+                                                    target=sample.target,
+                                                )
+                                            )
+                                            results[scorer_name] = sample_score
+                                # add scores returned by solvers
+                                if state.scores is not None:
+                                    for name, score in state.scores.items():
+                                        results[name] = SampleScore(
+                                            score=score,
+                                            sample_id=state.sample_id,
+                                            sample_metadata=state.metadata,
                                         )
                                         transcript()._event(
                                             ScoreEvent(
-                                                score=score_result, target=sample.target
+                                                score=score, target=sample.target
                                             )
                                         )
-                                        results[scorer_name] = sample_score
-                            # add scores returned by solvers
-                            if state.scores is not None:
-                                for name, score in state.scores.items():
-                                    results[name] = SampleScore(
-                                        score=score,
-                                        sample_id=state.sample_id,
-                                        sample_metadata=state.metadata,
-                                    )
-                                    transcript()._event(
-                                        ScoreEvent(score=score, target=sample.target)
-                                    )
-                            # propagate results into scores
-                            state.scores = {k: v.score for k, v in results.items()}
+                                # propagate results into scores
+                                state.scores = {k: v.score for k, v in results.items()}
                 except anyio.get_cancelled_exc_class():
                     if active.interrupt_action:
@@ -798,17 +786,7 @@ async def task_run_sample(
                     raise
                 except BaseException as ex:
-                    # note timeout
-                    if isinstance(ex, TimeoutError):
-                        transcript()._event(
-                            SampleLimitEvent(
-                                type="time",
-                                message=f"Unable to score sample due to exceeded time limit ({time_limit:,} seconds)",
-                                limit=time_limit,
-                            )
-                        )
-                    # handle error (this will throw if we've exceeded the limit)
+                    # handle error
                     error, raise_error = handle_error(ex)
         except Exception as ex:

inspect_ai/_eval/task/task.py CHANGED Viewed

@@ -64,7 +64,7 @@ class Task:
         time_limit: int | None = None,
         working_limit: int | None = None,
         name: str | None = None,
-        version: int = 0,
+        version: int | str = 0,
         metadata: dict[str, Any] | None = None,
         **kwargs: Unpack[TaskDeprecatedArgs],
     ) -> None:

inspect_ai/_util/_async.py CHANGED Viewed

@@ -136,7 +136,7 @@ def current_async_backend() -> Literal["asyncio", "trio"] | None:
 def configured_async_backend() -> Literal["asyncio", "trio"]:
-    backend = os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower()
+    backend = os.environ.get("INSPECT_ASYNC_BACKEND", "asyncio").lower() or "asyncio"
     return _validate_backend(backend)

inspect_ai/_util/content.py CHANGED Viewed

@@ -1,9 +1,14 @@
 from typing import Literal, Union
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, JsonValue
-class ContentText(BaseModel):
+class ContentBase(BaseModel):
+    internal: JsonValue | None = Field(default=None)
+    """Model provider specific payload - typically used to aid transformation back to model types."""
+class ContentText(ContentBase):
     """Text content."""
     type: Literal["text"] = Field(default="text")
@@ -16,7 +21,7 @@ class ContentText(BaseModel):
     """Was this a refusal message?"""
-class ContentReasoning(BaseModel):
+class ContentReasoning(ContentBase):
     """Reasoning content.
     See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
@@ -35,7 +40,7 @@ class ContentReasoning(BaseModel):
     """Indicates that the explicit content of this reasoning block has been redacted."""
-class ContentImage(BaseModel):
+class ContentImage(ContentBase):
     """Image content."""
     type: Literal["image"] = Field(default="image")
@@ -51,7 +56,7 @@ class ContentImage(BaseModel):
     """
-class ContentAudio(BaseModel):
+class ContentAudio(ContentBase):
     """Audio content."""
     type: Literal["audio"] = Field(default="audio")
@@ -64,7 +69,7 @@ class ContentAudio(BaseModel):
     """Format of audio data ('mp3' or 'wav')"""
-class ContentVideo(BaseModel):
+class ContentVideo(ContentBase):
     """Video content."""
     type: Literal["video"] = Field(default="video")

inspect_ai/_util/interrupt.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import anyio
-from .working import check_sample_working_limit
+from inspect_ai.util._limit import check_working_limit
 def check_sample_interrupt() -> None:
@@ -12,4 +12,4 @@ def check_sample_interrupt() -> None:
         raise anyio.get_cancelled_exc_class()
     # check for working_limit
-    check_sample_working_limit()
+    check_working_limit()

inspect_ai/_util/text.py CHANGED Viewed

@@ -1,12 +1,19 @@
 import random
 import re
 import string
+import textwrap
 from logging import getLogger
 from typing import List, NamedTuple
 logger = getLogger(__name__)
+def truncate_text(text: str, max_length: int) -> str:
+    if len(text) <= max_length:
+        return text
+    return textwrap.shorten(text, width=max_length, placeholder="...")
 def strip_punctuation(s: str) -> str:
     return s.strip(string.whitespace + string.punctuation)

inspect-ai 0.3.98__py3-none-any.whl → 0.3.100__py3-none-any.whl

inspect-ai 0.3.98__py3-none-any.whl → 0.3.100__py3-none-any.whl