inspect-ai 0.3.102__py3-none-any.whl → 0.3.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/common.py +2 -1
- inspect_ai/_cli/eval.py +2 -1
- inspect_ai/_display/core/active.py +3 -0
- inspect_ai/_display/core/config.py +1 -0
- inspect_ai/_display/core/panel.py +21 -13
- inspect_ai/_display/core/results.py +3 -7
- inspect_ai/_display/core/rich.py +3 -5
- inspect_ai/_display/log/__init__.py +0 -0
- inspect_ai/_display/log/display.py +173 -0
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_display/rich/display.py +2 -4
- inspect_ai/_display/textual/app.py +1 -6
- inspect_ai/_display/textual/widgets/task_detail.py +3 -14
- inspect_ai/_display/textual/widgets/tasks.py +1 -1
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +3 -2
- inspect_ai/_eval/registry.py +6 -1
- inspect_ai/_eval/run.py +7 -1
- inspect_ai/_eval/task/constants.py +1 -0
- inspect_ai/_eval/task/log.py +5 -1
- inspect_ai/_eval/task/run.py +1 -1
- inspect_ai/_util/citation.py +88 -0
- inspect_ai/_util/content.py +24 -2
- inspect_ai/_util/json.py +17 -2
- inspect_ai/_util/registry.py +19 -4
- inspect_ai/_view/schema.py +0 -6
- inspect_ai/_view/www/dist/assets/index.css +82 -24
- inspect_ai/_view/www/dist/assets/index.js +10124 -9808
- inspect_ai/_view/www/log-schema.json +418 -1
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
- inspect_ai/_view/www/node_modules/katex/src/fonts/generate_fonts.py +58 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/extract_tfms.py +114 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/extract_ttfs.py +122 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/format_json.py +28 -0
- inspect_ai/_view/www/node_modules/katex/src/metrics/parse_tfm.py +211 -0
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/@types/log.d.ts +140 -39
- inspect_ai/_view/www/src/app/content/RecordTree.tsx +13 -0
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +1 -1
- inspect_ai/_view/www/src/app/routing/logNavigation.ts +31 -0
- inspect_ai/_view/www/src/app/routing/{navigationHooks.ts → sampleNavigation.ts} +39 -86
- inspect_ai/_view/www/src/app/samples/SampleDialog.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/SampleDisplay.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/chat/MessageCitations.module.css +16 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageCitations.tsx +63 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +6 -0
- inspect_ai/_view/www/src/app/samples/chat/MessageContent.tsx +174 -25
- inspect_ai/_view/www/src/app/samples/chat/MessageContents.tsx +21 -3
- inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.module.css +7 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/ContentDataView.tsx +111 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.module.css +10 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearch.tsx +14 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.module.css +19 -0
- inspect_ai/_view/www/src/app/samples/chat/content-data/WebSearchResults.tsx +49 -0
- inspect_ai/_view/www/src/app/samples/chat/messages.ts +7 -1
- inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/chat/types.ts +4 -0
- inspect_ai/_view/www/src/app/samples/list/SampleList.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/sampleLimit.ts +2 -2
- inspect_ai/_view/www/src/app/samples/transcript/ModelEventView.tsx +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/SampleLimitEventView.tsx +4 -4
- inspect_ai/_view/www/src/app/samples/transcript/outline/TranscriptOutline.tsx +1 -1
- inspect_ai/_view/www/src/components/MarkdownDiv.tsx +15 -2
- inspect_ai/_view/www/src/tests/README.md +2 -2
- inspect_ai/_view/www/src/utils/git.ts +3 -1
- inspect_ai/_view/www/src/utils/html.ts +6 -0
- inspect_ai/agent/_handoff.py +3 -3
- inspect_ai/log/_condense.py +5 -0
- inspect_ai/log/_file.py +4 -1
- inspect_ai/log/_log.py +9 -4
- inspect_ai/log/_recorders/eval.py +4 -3
- inspect_ai/log/_recorders/json.py +5 -2
- inspect_ai/log/_recorders/recorder.py +1 -0
- inspect_ai/log/_util.py +2 -0
- inspect_ai/model/__init__.py +14 -0
- inspect_ai/model/_call_tools.py +13 -4
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_openai_responses.py +80 -34
- inspect_ai/model/_providers/_anthropic_citations.py +158 -0
- inspect_ai/model/_providers/_google_citations.py +100 -0
- inspect_ai/model/_providers/anthropic.py +196 -34
- inspect_ai/model/_providers/google.py +94 -22
- inspect_ai/model/_providers/mistral.py +20 -7
- inspect_ai/model/_providers/openai.py +11 -10
- inspect_ai/model/_providers/openai_compatible.py +3 -2
- inspect_ai/model/_providers/openai_responses.py +2 -5
- inspect_ai/model/_providers/perplexity.py +123 -0
- inspect_ai/model/_providers/providers.py +13 -2
- inspect_ai/model/_providers/vertex.py +3 -0
- inspect_ai/model/_trim.py +5 -0
- inspect_ai/tool/__init__.py +14 -0
- inspect_ai/tool/_mcp/_mcp.py +5 -2
- inspect_ai/tool/_mcp/sampling.py +19 -3
- inspect_ai/tool/_mcp/server.py +1 -1
- inspect_ai/tool/_tool.py +10 -1
- inspect_ai/tool/_tools/_web_search/_base_http_provider.py +104 -0
- inspect_ai/tool/_tools/_web_search/_exa.py +78 -0
- inspect_ai/tool/_tools/_web_search/_google.py +22 -25
- inspect_ai/tool/_tools/_web_search/_tavily.py +47 -65
- inspect_ai/tool/_tools/_web_search/_web_search.py +83 -36
- inspect_ai/tool/_tools/_web_search/_web_search_provider.py +7 -0
- inspect_ai/util/_display.py +11 -2
- inspect_ai/util/_sandbox/docker/compose.py +2 -2
- inspect_ai/util/_span.py +12 -1
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/RECORD +112 -88
- /inspect_ai/model/{_openai_computer_use.py → _providers/_openai_computer_use.py} +0 -0
- /inspect_ai/model/{_openai_web_search.py → _providers/_openai_web_search.py} +0 -0
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.102.dist-info → inspect_ai-0.3.104.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/common.py
CHANGED
@@ -60,7 +60,8 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
     @click.option(
         "--display",
         type=click.Choice(
-            ["full", "conversation", "rich", "plain", "none"],
+            ["full", "conversation", "rich", "plain", "log", "none"],
+            case_sensitive=False,
         ),
         default=DEFAULT_DISPLAY,
         envvar="INSPECT_DISPLAY",
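
The new "log" display mode routes progress output through Python logging rather than the interactive UI, and display values are now matched case-insensitively. A minimal sketch of selecting it from Python, assuming a hypothetical registered task named "example_task" (the same mode can be chosen on the CLI with `inspect eval ... --display log`):

import os

from inspect_ai import eval

# Select the "log" display added in this release; because matching is now
# case-insensitive, "LOG" would also work.
os.environ["INSPECT_DISPLAY"] = "log"

eval("example_task", model="openai/gpt-4o")  # "example_task" is a placeholder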
inspect_ai/_cli/eval.py
CHANGED
@@ -641,7 +641,7 @@ def eval_command(
 @click.option(
     "--retry-connections",
     type=float,
-    help="Reduce max_connections at this rate with each retry (defaults to 0.
+    help="Reduce max_connections at this rate with each retry (defaults to 1.0, which results in no reduction).",
     envvar="INSPECT_EVAL_RETRY_CONNECTIONS",
 )
 @click.option(
@@ -966,6 +966,7 @@ def eval_exec(
         success, _ = eval_set(**params)
         return success
     else:
+        params["log_header_only"] = True  # cli invocation doesn't need full log
         eval(**params)
         return True

inspect_ai/_display/core/active.py
CHANGED
@@ -5,6 +5,7 @@ import rich

 from inspect_ai.util._display import display_type

+from ..log.display import LogDisplay
 from ..plain.display import PlainDisplay
 from ..rich.display import RichDisplay
 from ..textual.display import TextualDisplay
@@ -24,6 +25,8 @@ def display() -> Display:
         and not rich.get_console().is_jupyter
     ):
         _active_display = TextualDisplay()
+    elif display_type() == "log":
+        _active_display = LogDisplay()
     else:
         _active_display = RichDisplay()

inspect_ai/_display/core/config.py
CHANGED
@@ -30,6 +30,7 @@ def task_config(
     config = dict(profile.generate_config.model_dump(exclude_none=True)) | config
     if profile.tags:
         config["tags"] = ",".join(profile.tags)
+    config["dataset"] = profile.dataset
     config_print: list[str] = []
     for name, value in config.items():
         if name == "approval" and isinstance(value, dict):
inspect_ai/_display/core/panel.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import Tuple

 import rich
-from rich.console import RenderableType
+from rich.console import Group, RenderableType
 from rich.panel import Panel
 from rich.table import Table
 from rich.text import Text
@@ -9,7 +9,7 @@ from rich.text import Text
 from inspect_ai._util.constants import CONSOLE_DISPLAY_WIDTH
 from inspect_ai._util.path import cwd_relative_path
 from inspect_ai._util.registry import registry_unqualified_name
-from inspect_ai.util._display import
+from inspect_ai.util._display import display_type_plain

 from .display import TaskProfile
 from .rich import is_vscode_notebook, rich_theme
@@ -27,7 +27,7 @@ def task_panel(
     log_location: str | None,
 ) -> RenderableType:
     # dispatch to plain handler if we are in plain mode
-    if
+    if display_type_plain():
         return task_panel_plain(
             profile, show_model, body, subtitle, footer, log_location
         )
@@ -89,23 +89,31 @@ def task_panel(
     log_location_relative = log_location

     root = Table.grid(expand=True)
-    root.add_column()
+    root.add_column(overflow="fold")
     root.add_row(table)
     root.add_row()
     root.add_row(
         f"[bold][{theme.light}]Log:[/{theme.light}][/bold] "
         + f"[{theme.link}]{log_location_relative}[/{theme.link}]"
     )
+    root.add_row()

-
-
-
-
-
-
-
-
-
+        panel = Panel(
+            task_panel_title(profile, show_model),
+            padding=(0, 0),
+            width=width,
+            height=3,
+            expand=True,
+        )
+        return Group(panel, root)
+    else:
+        return Panel(
+            root,
+            title=task_panel_title(profile, show_model),
+            title_align="left",
+            width=width,
+            expand=True,
+        )


 def task_panel_plain(
inspect_ai/_display/core/results.py
CHANGED
@@ -18,7 +18,7 @@ from .display import (
     TaskSuccess,
     TaskWithResult,
 )
-from .panel import task_panel
+from .panel import task_panel
 from .rich import rich_theme


@@ -41,8 +41,6 @@ def task_result_cancelled(
 ) -> RenderableType:
     # The contents of the panel
     config = task_config(profile)
-    targets = task_targets(profile)
-    subtitle = config, targets
     body = task_stats(cancelled.stats)

     # The panel
@@ -50,7 +48,7 @@ def task_result_cancelled(
         profile=profile,
         show_model=True,
         body=body,
-        subtitle=
+        subtitle=config,
         footer=task_interrupted(profile, cancelled.samples_completed),
         log_location=profile.log_location,
     )
@@ -76,8 +74,6 @@ def task_results(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
 def task_result_summary(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
     # The contents of the panel
     config = task_config(profile)
-    targets = task_targets(profile)
-    subtitle = config, targets
     body = task_stats(success.stats)

     # the panel
@@ -85,7 +81,7 @@ def task_result_summary(profile: TaskProfile, success: TaskSuccess) -> Renderabl
         profile=profile,
         show_model=True,
         body=body,
-        subtitle=
+        subtitle=config,
         footer=task_results(profile, success),
         log_location=profile.log_location,
     )
inspect_ai/_display/core/rich.py
CHANGED
@@ -11,7 +11,7 @@ from typing_extensions import override

 from inspect_ai._util.platform import is_running_in_jupyterlab, is_running_in_vscode
 from inspect_ai._util.transcript import transcript_code_theme
-from inspect_ai.util._display import display_type
+from inspect_ai.util._display import display_type, display_type_plain


 def is_vscode_notebook(console: Console) -> bool:
@@ -20,15 +20,13 @@ def is_vscode_notebook(console: Console) -> bool:

 def rich_no_color() -> bool:
     return (
-
-        or not is_running_in_vscode()
-        or is_running_in_jupyterlab()
+        display_type_plain() or not is_running_in_vscode() or is_running_in_jupyterlab()
     )


 def rich_initialise() -> None:
     # reflect ansi prefs
-    if
+    if display_type_plain():
         rich.reconfigure(no_color=True, force_terminal=False, force_interactive=False)
     elif rich_no_color():
         rich.reconfigure(no_color=True)
inspect_ai/_display/log/__init__.py
ADDED
File without changes
inspect_ai/_display/log/display.py
ADDED
@@ -0,0 +1,173 @@
+import contextlib
+import logging
+from typing import AsyncIterator, Callable, Coroutine, Iterator
+
+import anyio
+from rich.console import Console
+
+from inspect_ai._util._async import configured_async_backend, run_coroutine
+from inspect_ai._util.platform import running_in_notebook
+
+from ...util import throttle
+from ...util._concurrency import concurrency_status_display
+from ..core.display import (
+    TR,
+    Display,
+    Progress,
+    TaskDisplay,
+    TaskDisplayMetric,
+    TaskProfile,
+    TaskResult,
+    TaskScreen,
+    TaskSpec,
+    TaskWithResult,
+)
+from ..core.footer import task_http_retries_str
+from ..core.results import task_metric, tasks_results
+
+
+class LogDisplay(Display):
+    def __init__(self) -> None:
+        self.total_tasks: int = 0
+        self.tasks: list[TaskWithResult] = []
+        self.parallel = False
+
+    def print(self, message: str) -> None:
+        logging.info(message, stacklevel=2)
+
+    @contextlib.contextmanager
+    def progress(self, total: int) -> Iterator[Progress]:
+        yield LogProgress(total)
+
+    def run_task_app(self, main: Callable[[], Coroutine[None, None, TR]]) -> TR:
+        if running_in_notebook():
+            return run_coroutine(main())
+        else:
+            return anyio.run(main, backend=configured_async_backend())
+
+    @contextlib.contextmanager
+    def suspend_task_app(self) -> Iterator[None]:
+        yield
+
+    @contextlib.asynccontextmanager
+    async def task_screen(
+        self, tasks: list[TaskSpec], parallel: bool
+    ) -> AsyncIterator[TaskScreen]:
+        self.total_tasks = len(tasks)
+        self.tasks = []
+        self.parallel = parallel
+        try:
+            logging.info(f"Running {self.total_tasks} tasks...", stacklevel=3)
+            yield TaskScreen()
+        finally:
+            # Log final results
+            if self.tasks:
+                self._log_results()
+
+    @contextlib.contextmanager
+    def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]:
+        # Create and yield task display
+        task = TaskWithResult(profile, None)
+        self.tasks.append(task)
+        yield LogTaskDisplay(task)
+        self._log_status()
+
+    def display_counter(self, caption: str, value: str) -> None:
+        logging.info(f"{caption}: {value}", stacklevel=2)
+
+    def _log_status(self) -> None:
+        """Log status updates for all tasks"""
+        completed_tasks = sum(1 for task in self.tasks if task.result is not None)
+        total_tasks = len(self.tasks)
+        logging.info(f"{completed_tasks}/{total_tasks} tasks complete", stacklevel=4)
+
+    def _log_results(self) -> None:
+        """Log final results"""
+        results = tasks_results(self.tasks)
+        console = Console(width=120)
+        console.log(results, _stack_offset=4)
+
+
+class LogProgress(Progress):
+    def __init__(self, total: int):
+        self.total = total
+        self.current = 0
+
+    def update(self, n: int = 1) -> None:
+        self.current += n
+
+    def complete(self) -> None:
+        self.current = self.total
+
+
+class LogTaskDisplay(TaskDisplay):
+    def __init__(self, task: TaskWithResult):
+        self.task = task
+        self.progress_display: LogProgress | None = None
+        self.samples_complete = 0
+        self.samples_total = 0
+        self.current_metrics: list[TaskDisplayMetric] | None = None
+
+    @contextlib.contextmanager
+    def progress(self) -> Iterator[Progress]:
+        self.progress_display = LogProgress(self.task.profile.steps)
+        yield self.progress_display
+
+    @throttle(5)
+    def _log_status_throttled(self, stacklevel: int) -> None:
+        self._log_status(stacklevel=stacklevel + 2)
+
+    def _log_status(self, stacklevel: int) -> None:
+        """Log status updates"""
+        status_parts: list[str] = []
+
+        # Add task name and model
+        status_parts.append(f"Task: {self.task.profile.name}")
+        status_parts.append(f"Model: {self.task.profile.model}")
+
+        # Add step progress
+        if self.progress_display:
+            progress_percent = int(
+                self.progress_display.current / self.progress_display.total * 100
+            )
+            status_parts.append(
+                f"Steps: {self.progress_display.current}/{self.progress_display.total} {progress_percent}%"
+            )
+
+        # Add sample progress
+        status_parts.append(f"Samples: {self.samples_complete}/{self.samples_total}")
+
+        # Add metrics
+        if self.current_metrics:
+            metric_str = task_metric(self.current_metrics)
+            status_parts.append(metric_str)
+
+        # Add resource usage
+        resources_dict: dict[str, str] = {}
+        for model, resource in concurrency_status_display().items():
+            resources_dict[model] = f"{resource[0]}/{resource[1]}"
+        resources = ", ".join(
+            [f"{key}: {value}" for key, value in resources_dict.items()]
+        )
+        status_parts.append(resources)
+
+        # Add rate limits
+        rate_limits = task_http_retries_str()
+        if rate_limits:
+            status_parts.append(rate_limits)
+
+        # Print on new line
+        logging.info(", ".join(status_parts), stacklevel=stacklevel)
+
+    def sample_complete(self, complete: int, total: int) -> None:
+        self.samples_complete = complete
+        self.samples_total = total
+        self._log_status_throttled(stacklevel=3)
+
+    def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
+        self.current_metrics = metrics
+        self._log_status_throttled(stacklevel=3)
+
+    def complete(self, result: TaskResult) -> None:
+        self.task.result = result
+        self._log_status(stacklevel=3)
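
Because the LogDisplay above reports everything via logging.info(), a handler at INFO level is needed for the status lines to appear anywhere. A minimal sketch, assuming a hypothetical "example_task"; the exact record format is whatever your logging configuration produces:

import logging
import os

from inspect_ai import eval

# Surface LogDisplay's status records ("Task: ...", "Samples: 3/10", metrics,
# concurrency and rate-limit counters) on stderr.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")

os.environ["INSPECT_DISPLAY"] = "log"
eval("example_task", model="openai/gpt-4o")  # placeholder task name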
inspect_ai/_display/plain/display.py
CHANGED
@@ -25,7 +25,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_http_retries_str
-from ..core.panel import task_panel
+from ..core.panel import task_panel
 from ..core.results import task_metric, tasks_results


@@ -79,7 +79,7 @@ class PlainDisplay(Display):
             profile=profile,
             show_model=True,
             body="",  # Empty body since we haven't started yet
-            subtitle=
+            subtitle=task_config(profile),
             footer=None,
             log_location=None,
         )
inspect_ai/_display/rich/display.py
CHANGED
@@ -32,7 +32,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_footer
-from ..core.panel import task_panel,
+from ..core.panel import task_panel, task_title, tasks_title
 from ..core.progress import (
     RichProgress,
     progress_description,
@@ -311,15 +311,13 @@ def task_live_status(

     # the panel contents
     config = task_config(tasks[0].profile, style=theme.light)
-    targets = task_targets(tasks[0].profile)
-    subtitle = config, targets

     # the panel
     return task_panel(
         profile=tasks[0].profile,
         show_model=len(tasks) == 1,
         body=Group("", progress),
-        subtitle=
+        subtitle=config,
         footer=task_footer(counters, theme.light),
         log_location=None,
     )
inspect_ai/_display/textual/app.py
CHANGED
@@ -42,7 +42,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_footer
-from ..core.panel import
+from ..core.panel import task_title, tasks_title
 from ..core.rich import record_console_input, rich_initialise, rich_theme
 from .theme import inspect_dark, inspect_light
 from .widgets.console import ConsoleView
@@ -296,13 +296,8 @@ class TaskScreenApp(App[TR]):
             tasks.config = task_config(
                 self._tasks[0].profile, generate_config=not self._parallel
             )
-            if not self._parallel:
-                tasks.targets = task_targets(self._tasks[0].profile)
-            else:
-                tasks.targets = " \n "
         else:
             tasks.config = ""
-            tasks.targets = ""

     def update_samples(self) -> None:
         samples_view = self.query_one(SamplesView)
inspect_ai/_display/textual/widgets/task_detail.py
CHANGED
@@ -30,6 +30,8 @@ class TaskDetail(Widget):
         width: 100%;
         height: auto;
         grid-gutter: 1 3;
+        grid-size-columns: 3;
+        grid-columns: 1fr 1fr 1fr;
     }
     """

@@ -92,20 +94,6 @@ class TaskDetail(Widget):
         if len(self.by_reducer) == 0:
             return

-        # Compute the row and column count
-        row_count = len(self.by_reducer)
-        col_count = len(next(iter(self.by_reducer.values())))
-
-        # If this can fit in a single row, make it fit
-        # otherwise place each reducer on their own row
-        self.grid.styles.grid_columns = "auto"
-        if row_count * col_count < 4:
-            self.grid.styles.grid_size_columns = row_count * col_count
-            self.grid.styles.grid_size_rows = 1
-        else:
-            self.grid.styles.grid_size_columns = col_count
-            self.grid.styles.grid_size_rows = row_count
-
         # In order to reduce flashing the below tracks use of widgets
         # and updates them when possible (removing and adding them as needed)
         # Makes keys for tracking Task Metric widgets
@@ -142,6 +130,7 @@ class TaskMetrics(Widget):
     TaskMetrics {
         width: auto;
        height: auto;
+        border: solid $foreground 20%;
     }
     TaskMetrics Grid {
         width: auto;
inspect_ai/_eval/eval.py
CHANGED
@@ -105,6 +105,7 @@ def eval(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -181,6 +182,8 @@ def eval(
         log_shared: Sync sample events to log directory so that users on other systems
             can see log updates in realtime (defaults to no syncing). Specify `True`
             to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
+        log_header_only: If `True`, the function should return only log headers rather
+            than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -234,6 +237,7 @@ def eval(
         log_images=log_images,
         log_buffer=log_buffer,
         log_shared=log_shared,
+        log_header_only=log_header_only,
         score=score,
         score_display=score_display,
         **kwargs,
@@ -288,6 +292,7 @@ async def eval_async(
     log_images: bool | None = None,
     log_buffer: int | None = None,
     log_shared: bool | int | None = None,
+    log_header_only: bool | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -344,7 +349,9 @@ async def eval_async(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
-        log_shared: Indicate that the log directory is shared, which results in additional
+        log_shared: Indicate that the log directory is shared, which results in additional
+            syncing of realtime log data for Inspect View.
+        log_header_only: If `True`, the function should return only log headers rather than full logs with samples (defaults to `False`).
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -432,6 +439,9 @@ async def eval_async(
     # resolve log_shared
     log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared

+    # resolve header only
+    log_header_only = log_header_only is True
+
     # validate that --log-shared can't use used with 'json' format
     if log_shared and log_format == JSON_LOG_FORMAT:
         raise PrerequisiteError(
@@ -507,6 +517,7 @@ async def eval_async(
             eval_config=eval_config,
             eval_sandbox=sandbox,
             recorder=recorder,
+            header_only=log_header_only,
             epochs_reducer=epochs_reducer,
             solver=solver,
             tags=tags,
@@ -532,6 +543,7 @@ async def eval_async(
                 eval_config=eval_config,
                 eval_sandbox=sandbox,
                 recorder=recorder,
+                header_only=log_header_only,
                 epochs_reducer=epochs_reducer,
                 solver=solver,
                 tags=tags,
@@ -800,7 +812,7 @@ async def eval_retry_async(
         model_roles = model_roles_config_to_model_roles(eval_log.eval.model_roles)

         # collect the rest of the params we need for the eval
-        task_args = eval_log.eval.
+        task_args = eval_log.eval.task_args_passed
         tags = eval_log.eval.tags
         limit = eval_log.eval.config.limit
         sample_id = eval_log.eval.config.sample_id
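
The new log_header_only option (set automatically by the CLI and by eval_set, as shown above and below) makes eval() return header-only logs. A sketch of opting in directly from Python, assuming a hypothetical "example_task"; attribute names follow the public EvalLog API:

from inspect_ai import eval

# Header-only logs carry the eval spec, results and stats but omit sample
# bodies, which keeps memory use low for large runs; samples can still be
# read later from the log files on disk.
logs = eval("example_task", model="openai/gpt-4o", log_header_only=True)
for log in logs:
    print(log.status, log.location)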
inspect_ai/_eval/evalset.py
CHANGED
@@ -114,7 +114,7 @@ def eval_set(
            (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
            per-retry will in no case by longer than 1 hour.
         retry_connections: Reduce max_connections at this rate with each retry
-            (defaults to 0.
+            (defaults to 1.0, which results in no reduction).
         retry_cleanup: Cleanup failed log files after retries
            (defaults to True)
         model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
@@ -235,6 +235,7 @@ def eval_set(
             log_images=log_images,
             log_buffer=log_buffer,
             log_shared=log_shared,
+            log_header_only=True,
             score=score,
             **kwargs,
         )
@@ -274,7 +275,7 @@ def eval_set(
     fs.mkdir(log_dir, exist_ok=True)

     # resolve some parameters
-    retry_connections = retry_connections or 0
+    retry_connections = retry_connections or 1.0
     retry_cleanup = retry_cleanup is not False
     max_connections = starting_max_connections(models, GenerateConfig(**kwargs))
     max_tasks = max_tasks if max_tasks is not None else max(len(models), 4)
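
Per the option help above, retry_connections scales max_connections on each retry pass, and the default is now 1.0 (no reduction) rather than a reducing factor. A sketch with an explicit reduction, assuming a hypothetical task and log directory:

from inspect_ai import eval_set

# With max_connections=32 and retry_connections=0.5, successive retry passes
# run with 32 -> 16 -> 8 -> ... connections; retry_connections=1.0 (the new
# default) keeps 32 throughout.
success, logs = eval_set(
    tasks="example_task",        # placeholder task name
    log_dir="logs/example-set",  # placeholder log directory
    model="openai/gpt-4o",
    max_connections=32,
    retry_connections=0.5,
)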
inspect_ai/_eval/registry.py
CHANGED
@@ -8,6 +8,7 @@ from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.package import get_installed_package_name
 from inspect_ai._util.registry import (
     RegistryInfo,
+    extract_named_params,
     registry_add,
     registry_create,
     registry_info,
@@ -17,7 +18,7 @@ from inspect_ai._util.registry import (
 )

 from .task import Task
-from .task.constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR
+from .task.constants import TASK_ALL_PARAMS_ATTR, TASK_FILE_ATTR, TASK_RUN_DIR_ATTR

 MODEL_PARAM = "model"

@@ -133,6 +134,10 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
                 **w_kwargs,
             )

+            # extract all task parameters including defaults
+            named_params = extract_named_params(task_type, True, *w_args, **w_kwargs)
+            setattr(task_instance, TASK_ALL_PARAMS_ATTR, named_params)
+
             # if its not from an installed package then it is a "local"
             # module import, so set its task file and run dir
             if get_installed_package_name(task_type) is None:
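
With the registry change above, the @task decorator records every parameter of the task function (defaults included), and the eval_run change below writes that full set to the log as task_args while the values actually passed are preserved separately as task_args_passed (used by eval retry). A sketch with a hypothetical task:

from inspect_ai import Task, task
from inspect_ai.dataset import example_dataset

@task
def example_task(difficulty: str = "easy", epochs: int = 1) -> Task:
    # hypothetical task: both parameters are captured, even when the caller
    # relies on their defaults
    return Task(dataset=example_dataset("theory_of_mind"), epochs=epochs)

# Only `difficulty` is passed explicitly; the resulting log should record
# roughly task_args={"difficulty": "hard", "epochs": 1} and
# task_args_passed={"difficulty": "hard"}.
my_task = example_task(difficulty="hard")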
inspect_ai/_eval/run.py
CHANGED
@@ -3,6 +3,7 @@ import os
 import sys
 from typing import Any, Awaitable, Callable, Set, cast

+from inspect_ai._eval.task.constants import TASK_ALL_PARAMS_ATTR
 from inspect_ai._eval.task.task import Task
 from inspect_ai._util.environ import environ_vars
 from inspect_ai._util.trace import trace_action
@@ -63,6 +64,7 @@ async def eval_run(
     eval_config: EvalConfig,
     eval_sandbox: SandboxEnvironmentType | None,
     recorder: Recorder,
+    header_only: bool,
     epochs_reducer: list[ScoreReducer] | None = None,
     solver: Solver | SolverSpec | None = None,
     tags: list[str] | None = None,
@@ -207,11 +209,15 @@ async def eval_run(
                 metrics=eval_metrics,
                 sandbox=resolved_task.sandbox,
                 task_attribs=task.attribs,
-                task_args=
+                task_args=getattr(
+                    task, TASK_ALL_PARAMS_ATTR, resolved_task.task_args
+                ),
+                task_args_passed=resolved_task.task_args,
                 model_args=resolved_task.model.model_args,
                 eval_config=task_eval_config,
                 metadata=((metadata or {}) | (task.metadata or {})) or None,
                 recorder=recorder,
+                header_only=header_only,
             )
             await logger.init()
