inspect-ai 0.3.49__py3-none-any.whl → 0.3.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/info.py +2 -2
- inspect_ai/_cli/log.py +2 -2
- inspect_ai/_cli/score.py +2 -2
- inspect_ai/_display/core/display.py +19 -0
- inspect_ai/_display/core/panel.py +37 -7
- inspect_ai/_display/core/progress.py +29 -2
- inspect_ai/_display/core/results.py +79 -40
- inspect_ai/_display/core/textual.py +21 -0
- inspect_ai/_display/rich/display.py +28 -8
- inspect_ai/_display/textual/app.py +107 -1
- inspect_ai/_display/textual/display.py +1 -1
- inspect_ai/_display/textual/widgets/samples.py +132 -91
- inspect_ai/_display/textual/widgets/task_detail.py +236 -0
- inspect_ai/_display/textual/widgets/tasks.py +74 -6
- inspect_ai/_display/textual/widgets/toggle.py +32 -0
- inspect_ai/_eval/context.py +2 -0
- inspect_ai/_eval/eval.py +4 -3
- inspect_ai/_eval/loader.py +1 -1
- inspect_ai/_eval/run.py +35 -2
- inspect_ai/_eval/task/log.py +13 -11
- inspect_ai/_eval/task/results.py +12 -3
- inspect_ai/_eval/task/run.py +139 -36
- inspect_ai/_eval/task/sandbox.py +2 -1
- inspect_ai/_util/_async.py +30 -1
- inspect_ai/_util/file.py +31 -4
- inspect_ai/_util/html.py +3 -0
- inspect_ai/_util/logger.py +6 -5
- inspect_ai/_util/platform.py +5 -6
- inspect_ai/_util/registry.py +1 -1
- inspect_ai/_view/server.py +9 -9
- inspect_ai/_view/www/App.css +2 -2
- inspect_ai/_view/www/dist/assets/index.css +2 -2
- inspect_ai/_view/www/dist/assets/index.js +352 -294
- inspect_ai/_view/www/log-schema.json +13 -0
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/components/MessageBand.mjs +1 -1
- inspect_ai/_view/www/src/components/Tools.mjs +16 -13
- inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -3
- inspect_ai/_view/www/src/samples/SampleScoreView.mjs +52 -77
- inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +38 -13
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +15 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +4 -2
- inspect_ai/_view/www/src/types/log.d.ts +2 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.mjs +2 -0
- inspect_ai/_view/www/yarn.lock +9 -4
- inspect_ai/approval/__init__.py +1 -1
- inspect_ai/approval/_human/approver.py +35 -0
- inspect_ai/approval/_human/console.py +62 -0
- inspect_ai/approval/_human/manager.py +108 -0
- inspect_ai/approval/_human/panel.py +233 -0
- inspect_ai/approval/_human/util.py +51 -0
- inspect_ai/dataset/_sources/hf.py +2 -2
- inspect_ai/dataset/_sources/util.py +1 -1
- inspect_ai/log/_file.py +106 -36
- inspect_ai/log/_recorders/eval.py +226 -158
- inspect_ai/log/_recorders/file.py +9 -6
- inspect_ai/log/_recorders/json.py +35 -12
- inspect_ai/log/_recorders/recorder.py +15 -15
- inspect_ai/log/_samples.py +52 -0
- inspect_ai/model/_model.py +14 -0
- inspect_ai/model/_model_output.py +4 -0
- inspect_ai/model/_providers/azureai.py +1 -1
- inspect_ai/model/_providers/hf.py +106 -4
- inspect_ai/model/_providers/util/__init__.py +2 -0
- inspect_ai/model/_providers/util/hf_handler.py +200 -0
- inspect_ai/scorer/_common.py +1 -1
- inspect_ai/solver/_plan.py +0 -8
- inspect_ai/solver/_task_state.py +18 -1
- inspect_ai/solver/_use_tools.py +9 -1
- inspect_ai/tool/_tool_def.py +2 -2
- inspect_ai/tool/_tool_info.py +14 -2
- inspect_ai/tool/_tool_params.py +2 -1
- inspect_ai/tool/_tools/_execute.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +6 -0
- inspect_ai/util/__init__.py +5 -6
- inspect_ai/util/_panel.py +91 -0
- inspect_ai/util/_sandbox/__init__.py +2 -6
- inspect_ai/util/_sandbox/context.py +4 -3
- inspect_ai/util/_sandbox/docker/compose.py +12 -2
- inspect_ai/util/_sandbox/docker/docker.py +19 -9
- inspect_ai/util/_sandbox/docker/util.py +10 -2
- inspect_ai/util/_sandbox/environment.py +47 -41
- inspect_ai/util/_sandbox/local.py +15 -10
- inspect_ai/util/_subprocess.py +43 -3
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/METADATA +2 -2
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/RECORD +90 -82
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- inspect_ai/_view/www/node_modules/flatted/python/test.py +0 -63
- inspect_ai/approval/_human.py +0 -123
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.49.dist-info → inspect_ai-0.3.51.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/info.py
CHANGED
@@ -5,7 +5,7 @@ import click
 from inspect_ai import __version__
 from inspect_ai._util.constants import PKG_PATH
 from inspect_ai._view.server import resolve_header_only
-from inspect_ai.log._file import
+from inspect_ai.log._file import eval_log_json_str, read_eval_log

 from .log import headers, schema, types

@@ -46,7 +46,7 @@ def log(path: str, header_only: int) -> None:
     header_only = resolve_header_only(path, header_only)

     log = read_eval_log(path, header_only=header_only)
-    print(
+    print(eval_log_json_str(log))


 @info_command.command("log-file-headers", hidden=True)
inspect_ai/_cli/log.py
CHANGED
@@ -14,7 +14,7 @@ from inspect_ai._util.constants import PKG_PATH
 from inspect_ai.log import list_eval_logs
 from inspect_ai.log._convert import convert_eval_logs
 from inspect_ai.log._file import (
-
+    eval_log_json_str,
     read_eval_log,
     read_eval_log_headers,
 )
@@ -127,7 +127,7 @@ def list_command(
 def dump_command(path: str, header_only: bool) -> None:
     """Print log file contents as JSON."""
     log = read_eval_log(path, header_only=header_only)
-    print(
+    print(eval_log_json_str(log))


 @log_command.command("convert")
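Note: both `inspect info log` and `inspect log dump` now serialize through `eval_log_json_str`, which (as the `print(...)` calls above imply) returns the log as a JSON string. A minimal sketch of the same flow outside the CLI; the log path below is a placeholder:

    from inspect_ai.log._file import eval_log_json_str, read_eval_log

    # read just the log header for speed, then print the log as JSON
    log = read_eval_log("logs/example-log.eval", header_only=True)
    print(eval_log_json_str(log))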
inspect_ai/_cli/score.py
CHANGED
@@ -61,7 +61,7 @@ async def score(

     # read the eval log
     recorder = create_recorder_for_location(log_file, log_dir)
-    eval_log = recorder.read_log(log_file)
+    eval_log = await recorder.read_log(log_file)

     # check that there are samples therein
     if eval_log.samples is None or len(eval_log.samples) == 0:
@@ -88,7 +88,7 @@ async def score(
     scored = f"{SCORED_SUFFIX}{ext}"
     if not overwrite and not log_file.endswith(scored):
         log_file = log_file.removesuffix(ext) + scored
-    recorder.write_log(log_file, eval_log)
+    await recorder.write_log(log_file, eval_log)

     # print results
     display().print(f"\n{eval_log.eval.task}")
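The recorder interface used by `score` is now async: `read_log` and `write_log` return coroutines and must be awaited (matching the `log/_recorders/recorder.py` changes counted above). A rough sketch of the pattern, where `recorder` is assumed to be whatever `create_recorder_for_location(...)` returns:

    # re-score flow with the async recorder interface (sketch only)
    async def rescore(recorder, log_file: str) -> None:
        eval_log = await recorder.read_log(log_file)   # was: recorder.read_log(...)
        # ... apply scorers to eval_log.samples here ...
        await recorder.write_log(log_file, eval_log)   # was: recorder.write_log(...)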
inspect_ai/_display/core/display.py
CHANGED
@@ -19,6 +19,8 @@ from rich.console import Console
 from inspect_ai.log import EvalConfig, EvalResults, EvalStats
 from inspect_ai.model import GenerateConfig, ModelName

+from ...util._panel import InputPanel
+

 @runtime_checkable
 class Progress(Protocol):
@@ -81,6 +83,8 @@ class TaskWithResult:

 TR = TypeVar("TR")

+TP = TypeVar("TP", bound=InputPanel)
+

 class TaskScreen(contextlib.AbstractContextManager["TaskScreen"]):
     def __exit__(self, *excinfo: Any) -> None:
@@ -95,12 +99,27 @@ class TaskScreen(contextlib.AbstractContextManager["TaskScreen"]):
     ) -> Iterator[Console]:
         yield rich.get_console()

+    async def input_panel(self, title: str, panel: type[TP]) -> TP:
+        raise NotImplementedError("input_panel not implemented by current display")
+
+
+@dataclass
+class TaskDisplayMetric:
+    scorer: str
+    name: str
+    value: float | int
+    reducer: str | None
+

 @runtime_checkable
 class TaskDisplay(Protocol):
     @contextlib.contextmanager
     def progress(self) -> Iterator[Progress]: ...

+    def sample_complete(self, complete: int, total: int) -> None: ...
+
+    def update_metrics(self, scores: list[TaskDisplayMetric]) -> None: ...
+
     def complete(self, result: TaskResult) -> None: ...

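`TaskDisplayMetric` plus the new `sample_complete` and `update_metrics` protocol members are what let displays show per-sample progress and realtime scores. A partial sketch of a display-side consumer, illustrative only since these names live in the internal `inspect_ai._display.core.display` module:

    from inspect_ai._display.core.display import TaskDisplayMetric

    class MetricsLogger:
        """Implements only the new TaskDisplay hooks (not the full protocol)."""

        def sample_complete(self, complete: int, total: int) -> None:
            print(f"samples completed: {complete}/{total}")

        def update_metrics(self, scores: list[TaskDisplayMetric]) -> None:
            for m in scores:
                reducer = f"/{m.reducer}" if m.reducer else ""
                print(f"{m.scorer}: {m.name}{reducer} = {m.value}")

    display = MetricsLogger()
    display.sample_complete(3, 10)
    display.update_metrics(
        [TaskDisplayMetric(scorer="match", name="accuracy", value=0.67, reducer=None)]
    )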
inspect_ai/_display/core/panel.py
CHANGED
@@ -1,3 +1,5 @@
+from typing import Tuple
+
 import rich
 from rich.console import RenderableType
 from rich.panel import Panel
@@ -16,6 +18,10 @@ def task_panel(
     profile: TaskProfile,
     show_model: bool,
     body: RenderableType,
+    subtitle: RenderableType
+    | str
+    | Tuple[RenderableType | str, RenderableType | str]
+    | None,
     footer: RenderableType | tuple[RenderableType, RenderableType] | None,
     log_location: str | None,
 ) -> Panel:
@@ -25,22 +31,39 @@ def task_panel(
     width = CONSOLE_DISPLAY_WIDTH if is_vscode_notebook(console) else None
     jupyter = console.is_jupyter

-    #
+    # root table
     table = Table.grid(expand=True)
     table.add_column()
-
+
+    # setup table
+    if subtitle is not None:
+        subtitle_table = Table.grid(expand=True)
+        subtitle_table.add_column()
+        if isinstance(subtitle, tuple):
+            subtitle_table.add_column(justify="right")
+            subtitle_table.add_row(
+                to_renderable(subtitle[0]), to_renderable(subtitle[1], style=theme.meta)
+            )
+        else:
+            subtitle_table.add_row(to_renderable(subtitle))
+
+        table.add_row(subtitle_table)

     # main progress and task info
-
-    table.add_row(body
+    table.add_row()
+    table.add_row(body)
+    table.add_row()

     # footer if specified
     if footer:
-
+        footer_table = Table.grid(expand=True)
+        footer_table.add_column()
         if isinstance(footer, tuple):
-
+            footer_table.add_column(justify="right")
+            footer_table.add_row(footer[0], footer[1])
         else:
-
+            footer_table.add_row(footer)
+        table.add_row(footer_table)

     # enclose in outer table for log link footer
     root = table
@@ -75,6 +98,13 @@ def task_panel(
     return panel


+def to_renderable(item: RenderableType | str, style: str = "") -> RenderableType:
+    if isinstance(item, str):
+        return Text.from_markup(item, style=style)
+    else:
+        return item
+
+
 def tasks_title(completed: int, total: int) -> str:
     return f"{completed}/{total} tasks complete"

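The new `subtitle` argument accepts a plain renderable/string or a `(left, right)` pair; the tuple form becomes a two-column grid with the right cell styled and right-justified, and strings are converted with `Text.from_markup` via `to_renderable`. A standalone rich sketch of the same layout (the strings are made up for illustration):

    import rich
    from rich.table import Table
    from rich.text import Text

    subtitle = ("dataset: example, epochs: 1", "[dim]10 samples[/dim]")

    grid = Table.grid(expand=True)
    grid.add_column()
    grid.add_column(justify="right")  # right cell is right-justified
    grid.add_row(Text.from_markup(subtitle[0]), Text.from_markup(subtitle[1]))

    rich.print(grid)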
inspect_ai/_display/core/progress.py
CHANGED
@@ -32,13 +32,20 @@ class RichProgress(Progress):
         model: str = "",
         status: Callable[[], str] | None = None,
         on_update: Callable[[], None] | None = None,
+        count: str = "",
+        score: str = "",
     ) -> None:
         self.total = total
         self.progress = progress
         self.status = status if status else lambda: ""
         self.on_update = on_update
         self.task_id = progress.add_task(
-            description,
+            description,
+            total=PROGRESS_TOTAL,
+            model=model,
+            status=self.status(),
+            count=count,
+            score=score,
         )

     @override
@@ -56,6 +63,16 @@ class RichProgress(Progress):
             task_id=self.task_id, completed=PROGRESS_TOTAL, status=self.status()
         )

+    def update_count(self, complete: int, total: int) -> None:
+        self.progress.update(
+            task_id=self.task_id, count=progress_count(complete, total), refresh=True
+        )
+        if self.on_update:
+            self.on_update()
+
+    def update_score(self, score: str) -> None:
+        self.progress.update(task_id=self.task_id, score=score)
+

 def rich_progress() -> RProgress:
     console = rich.get_console()
@@ -65,10 +82,12 @@ def rich_progress() -> RProgress:
         TextColumn("{task.fields[model]}"),
         BarColumn(bar_width=40 if is_vscode_notebook(console) else None),
         TaskProgressColumn(),
+        TextColumn("{task.fields[count]}"),
+        TextColumn("{task.fields[score]}"),
         TimeElapsedColumn(),
         transient=True,
         console=console,
-        expand=
+        expand=True,
     )


@@ -109,3 +128,11 @@ def progress_time(time: float) -> str:
     minutes, seconds = divmod(time, 60)
     hours, minutes = divmod(minutes, 60)
     return f"{hours:2.0f}:{minutes:02.0f}:{seconds:02.0f}"
+
+
+def progress_count(complete: int, total: int) -> str:
+    # Pad the display to keep it stable
+    total_str = f"{total:,}"
+    complete_str = f"{complete:,}"
+    padding = max(0, len(total_str) - len(complete_str))
+    return " " * padding + f"[{complete_str}/{total_str}]"
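`progress_count` keeps the new count column a stable width by left-padding the completed value to the width of the comma-formatted total. For example, importing from the internal module changed above:

    from inspect_ai._display.core.progress import progress_count

    print(repr(progress_count(7, 1000)))     # '    [7/1,000]'
    print(repr(progress_count(1000, 1000)))  # '[1,000/1,000]'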
inspect_ai/_display/core/results.py
CHANGED
@@ -1,22 +1,24 @@
 from datetime import datetime
 from typing import Sequence, Set

+import numpy as np
 from rich.console import Group, RenderableType
 from rich.table import Table
 from rich.text import Text

 from inspect_ai.log import EvalStats
-from inspect_ai.log._log import rich_traceback
+from inspect_ai.log._log import EvalScore, rich_traceback

 from .config import task_config, task_dict
 from .display import (
     TaskCancelled,
+    TaskDisplayMetric,
     TaskError,
     TaskProfile,
     TaskSuccess,
     TaskWithResult,
 )
-from .panel import task_panel
+from .panel import task_panel, task_targets
 from .rich import rich_theme


@@ -37,10 +39,18 @@ def tasks_results(tasks: Sequence[TaskWithResult]) -> RenderableType:
 def task_result_cancelled(
     profile: TaskProfile, cancelled: TaskCancelled
 ) -> RenderableType:
+    # The contents of the panel
+    config = task_config(profile)
+    targets = task_targets(profile)
+    subtitle = config, targets
+    body = task_stats(cancelled.stats)
+
+    # The panel
     return task_panel(
         profile=profile,
         show_model=True,
-        body=
+        body=body,
+        subtitle=subtitle,
         footer=task_interrupted(profile, cancelled.samples_completed),
         log_location=profile.log_location,
     )
@@ -50,36 +60,7 @@ def task_results(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
     theme = rich_theme()

     # do we have more than one scorer name?
-
-    scorer_names: Set[str] = {score.name for score in results.scores}
-    reducer_names: Set[str] = {
-        score.reducer for score in results.scores if score.reducer is not None
-    }
-    show_reducer = len(reducer_names) > 1 or "avg" not in reducer_names
-    output: dict[str, str] = {}
-    for score in results.scores:
-        for name, metric in score.metrics.items():
-            value = (
-                "1.0"
-                if metric.value == 1
-                else (
-                    str(metric.value)
-                    if isinstance(metric.value, int)
-                    else f"{metric.value:.3g}"
-                )
-            )
-            name = (
-                rf"{name}\[{score.reducer}]"
-                if show_reducer and score.reducer is not None
-                else name
-            )
-            key = f"{score.name}/{name}" if (len(scorer_names) > 1) else name
-            output[key] = value
-
-    if output:
-        message = f"[{theme.metric}]{task_dict(output, True)}[/{theme.metric}]"
-    else:
-        message = ""
+    message = task_metrics(success.results.scores)

     # note if some of our samples had errors
     if success.samples_completed < profile.samples:
@@ -93,10 +74,18 @@ def task_results(profile: TaskProfile, success: TaskSuccess) -> RenderableType:


 def task_result_summary(profile: TaskProfile, success: TaskSuccess) -> RenderableType:
+    # The contents of the panel
+    config = task_config(profile)
+    targets = task_targets(profile)
+    subtitle = config, targets
+    body = task_stats(success.stats)
+
+    # the panel
     return task_panel(
         profile=profile,
         show_model=True,
-        body=
+        body=body,
+        subtitle=subtitle,
         footer=task_results(profile, success),
         log_location=profile.log_location,
     )
@@ -107,20 +96,17 @@ def task_result_error(profile: TaskProfile, error: TaskError) -> RenderableType:
         profile=profile,
         show_model=True,
         body=rich_traceback(error.exc_type, error.exc_value, error.traceback),
+        subtitle=None,
         footer=task_interrupted(profile, error.samples_completed),
         log_location=profile.log_location,
     )


-def task_stats(
+def task_stats(stats: EvalStats) -> RenderableType:
     theme = rich_theme()
     panel = Table.grid(expand=True)
     panel.add_column()
-
-    if config:
-        panel.add_row(config)
-        panel.add_row()
-    elif len(stats.model_usage) < 2:
+    if len(stats.model_usage) < 2:
         panel.add_row()

     table = Table.grid(expand=True)
@@ -178,3 +164,56 @@ def task_interrupted(profile: TaskProfile, samples_completed: int) -> Renderable
     )

     return message
+
+
+def task_metric(metrics: list[TaskDisplayMetric]) -> str:
+    reducer_names: Set[str] = {
+        metric.reducer for metric in metrics if metric.reducer is not None
+    }
+    show_reducer = len(reducer_names) > 1 or (
+        len(reducer_names) == 1 and "avg" not in reducer_names
+    )
+
+    metric = metrics[0]
+    if np.isnan(metric.value):
+        value = " n/a"
+    else:
+        value = f"{metric.value:.2f}"
+
+    if show_reducer:
+        return f"{metric.name}/{metric.reducer}: {value}"
+    else:
+        return f"{metric.name}: {value}"
+
+
+def task_metrics(scores: list[EvalScore]) -> str:
+    theme = rich_theme()
+    scorer_names: Set[str] = {score.name for score in scores}
+    reducer_names: Set[str] = {
+        score.reducer for score in scores if score.reducer is not None
+    }
+    show_reducer = len(reducer_names) > 1 or "avg" not in reducer_names
+    output: dict[str, str] = {}
+    for score in scores:
+        for name, metric in score.metrics.items():
+            value = (
+                "1.0"
+                if metric.value == 1
+                else (
+                    str(metric.value)
+                    if isinstance(metric.value, int)
+                    else f"{metric.value:.3g}"
+                )
+            )
+            name = (
+                rf"{name}\[{score.reducer}]"
+                if show_reducer and score.reducer is not None
+                else name
+            )
+            key = f"{score.name}/{name}" if (len(scorer_names) > 1) else name
+            output[key] = value
+
+    if output:
+        return f"[{theme.metric}]{task_dict(output, True)}[/{theme.metric}]"
+    else:
+        return ""
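`task_metric` builds the realtime score label shown next to the progress bar: it formats only the first metric in the list to two decimals and hides the reducer when it is just the default "avg". Using the internal modules changed above:

    from inspect_ai._display.core.display import TaskDisplayMetric
    from inspect_ai._display.core.results import task_metric

    m = TaskDisplayMetric(scorer="match", name="accuracy", value=0.8214, reducer=None)
    print(task_metric([m]))    # accuracy: 0.82

    m = TaskDisplayMetric(scorer="match", name="accuracy", value=0.5, reducer="pass_at_2")
    print(task_metric([m]))    # accuracy/pass_at_2: 0.50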
inspect_ai/_display/core/textual.py
ADDED
@@ -0,0 +1,21 @@
+from logging import getLogger
+
+from textual.driver import Driver
+
+logger = getLogger(__name__)
+
+
+# force mouse support for textual -- this works around an issue where
+# mouse events are disabled after a reload of the vs code ide, see:
+# https://github.com/Textualize/textual/issues/5380
+# ansi codes for enabling mouse support are idempotent so it is fine
+# to do this even in cases where mouse support is already enabled.
+# we try/catch since we aren't 100% sure there aren't cases where doing
+# this won't raise and we'd rather not fail hard in in these case
+def textual_enable_mouse_support(driver: Driver) -> None:
+    enable_mouse_support = getattr(driver, "_enable_mouse_support", None)
+    if enable_mouse_support:
+        try:
+            enable_mouse_support()
+        except Exception as ex:
+            logger.warning(f"Error enabling mouse support: {ex}")
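`textual_enable_mouse_support` needs the running app's `Driver`. Textual keeps that in the private `_driver` attribute, so the sketch below is illustrative wiring only; where inspect_ai actually calls it is in the `textual/app.py` changes listed above:

    from textual.app import App

    from inspect_ai._display.core.textual import textual_enable_mouse_support

    class MyApp(App[None]):
        def on_mount(self) -> None:
            # re-send the ANSI enable-mouse sequence (idempotent per the comment above)
            if self._driver is not None:
                textual_enable_mouse_support(self._driver)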
inspect_ai/_display/rich/display.py
CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from typing import Any, AsyncIterator, Callable, Coroutine, Iterator

 import rich
-from rich.console import Console,
+from rich.console import Console, RenderableType
 from rich.live import Live
 from rich.panel import Panel
 from rich.progress import Progress as RProgress
@@ -23,6 +23,7 @@ from ..core.display import (
     Display,
     Progress,
     TaskDisplay,
+    TaskDisplayMetric,
     TaskProfile,
     TaskResult,
     TaskScreen,
@@ -30,7 +31,7 @@ from ..core.display import (
     TaskWithResult,
 )
 from ..core.footer import task_footer
-from ..core.panel import task_panel, task_title, tasks_title
+from ..core.panel import task_panel, task_targets, task_title, tasks_title
 from ..core.progress import (
     RichProgress,
     progress_description,
@@ -38,7 +39,7 @@ from ..core.progress import (
     progress_status_icon,
     rich_progress,
 )
-from ..core.results import tasks_results
+from ..core.results import task_metric, tasks_results
 from ..core.rich import (
     is_vscode_notebook,
     record_console_input,
@@ -275,6 +276,15 @@ class RichTaskDisplay(TaskDisplay):
     def progress(self) -> Iterator[Progress]:
         yield self.p

+    @override
+    def sample_complete(self, complete: int, total: int) -> None:
+        self.p.update_count(complete, total)
+
+    @override
+    def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
+        if len(metrics) > 0:
+            self.p.update_score(task_metric(metrics))
+
     @override
     def complete(self, result: TaskResult) -> None:
         self.status.result = result
@@ -283,15 +293,18 @@ class RichTaskDisplay(TaskDisplay):

 def task_live_status(tasks: list[TaskStatus], progress: RProgress) -> RenderableType:
     theme = rich_theme()
-
+
+    # the panel contents
     config = task_config(tasks[0].profile, style=theme.light)
-
-
+    targets = task_targets(tasks[0].profile)
+    subtitle = config, targets

+    # the panel
     return task_panel(
         profile=tasks[0].profile,
         show_model=len(tasks) == 1,
-        body=
+        body=progress,
+        subtitle=subtitle,
         footer=task_footer(theme.light),
         log_location=None,
     )
@@ -321,9 +334,16 @@ def tasks_live_status(
     footer_table.add_row()
     footer_table.add_row(footer[0], footer[1])

+    # build a layout table
+    layout_table = Table.grid(expand=True)
+    layout_table.add_column()
+    layout_table.add_row(config)
+    layout_table.add_row(progress)
+    layout_table.add_row(footer_table)
+
     # create panel w/ title
     panel = Panel(
-
+        layout_table,
         title=f"[bold][{theme.meta}]{tasks_title(completed, total_tasks)}[/{theme.meta}][/bold]",
         title_align="left",
         width=width,