PyPI - inspect-ai - Versions diffs - 0.3.54__py3-none-any.whl → 0.3.56__py3-none-any.whl - Mend

inspect-ai 0.3.54__py3-none-any.whl → 0.3.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98)

inspect_ai/__init__.py +1 -0
inspect_ai/_cli/common.py +1 -1
inspect_ai/_cli/trace.py +33 -20
inspect_ai/_display/core/active.py +1 -1
inspect_ai/_display/core/display.py +1 -1
inspect_ai/_display/core/footer.py +1 -1
inspect_ai/_display/core/progress.py +0 -6
inspect_ai/_display/core/rich.py +1 -1
inspect_ai/_display/rich/display.py +2 -2
inspect_ai/_display/textual/app.py +15 -17
inspect_ai/_display/textual/widgets/clock.py +3 -3
inspect_ai/_display/textual/widgets/samples.py +6 -13
inspect_ai/_eval/context.py +9 -1
inspect_ai/_eval/score.py +4 -10
inspect_ai/_eval/task/log.py +2 -1
inspect_ai/_eval/task/results.py +5 -4
inspect_ai/_eval/task/run.py +6 -12
inspect_ai/_eval/task/task.py +10 -0
inspect_ai/_util/ansi.py +31 -0
inspect_ai/_util/format.py +7 -0
inspect_ai/_util/logger.py +12 -12
inspect_ai/_util/throttle.py +10 -1
inspect_ai/_util/trace.py +43 -47
inspect_ai/_util/transcript.py +4 -0
inspect_ai/_util/vscode.py +51 -0
inspect_ai/_view/notify.py +2 -1
inspect_ai/_view/www/App.css +22 -1
inspect_ai/_view/www/dist/assets/index.css +2374 -2
inspect_ai/_view/www/dist/assets/index.js +29622 -24424
inspect_ai/_view/www/log-schema.json +138 -90
inspect_ai/_view/www/package.json +1 -0
inspect_ai/_view/www/src/App.mjs +1 -0
inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
inspect_ai/_view/www/src/components/Tools.mjs +11 -3
inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
inspect_ai/_view/www/src/types/log.d.ts +26 -12
inspect_ai/_view/www/yarn.lock +44 -0
inspect_ai/approval/_apply.py +4 -0
inspect_ai/approval/_human/panel.py +5 -8
inspect_ai/dataset/_dataset.py +51 -10
inspect_ai/dataset/_util.py +31 -3
inspect_ai/log/__init__.py +2 -0
inspect_ai/log/_log.py +5 -2
inspect_ai/model/_cache.py +1 -1
inspect_ai/model/_call_tools.py +4 -2
inspect_ai/model/_chat_message.py +3 -0
inspect_ai/model/_model.py +42 -1
inspect_ai/model/_providers/anthropic.py +4 -0
inspect_ai/model/_providers/openai.py +11 -1
inspect_ai/model/_render.py +9 -2
inspect_ai/scorer/_metric.py +12 -1
inspect_ai/solver/__init__.py +2 -0
inspect_ai/solver/_human_agent/agent.py +83 -0
inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
inspect_ai/solver/_human_agent/commands/clock.py +70 -0
inspect_ai/solver/_human_agent/commands/command.py +59 -0
inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
inspect_ai/solver/_human_agent/commands/note.py +42 -0
inspect_ai/solver/_human_agent/commands/score.py +80 -0
inspect_ai/solver/_human_agent/commands/status.py +62 -0
inspect_ai/solver/_human_agent/commands/submit.py +151 -0
inspect_ai/solver/_human_agent/install.py +222 -0
inspect_ai/solver/_human_agent/panel.py +252 -0
inspect_ai/solver/_human_agent/service.py +45 -0
inspect_ai/solver/_human_agent/state.py +55 -0
inspect_ai/solver/_human_agent/view.py +24 -0
inspect_ai/solver/_task_state.py +28 -2
inspect_ai/tool/_tool.py +10 -2
inspect_ai/tool/_tools/_web_browser/_web_browser.py +13 -10
inspect_ai/util/__init__.py +8 -4
inspect_ai/{_util/display.py → util/_display.py} +6 -0
inspect_ai/util/_panel.py +31 -9
inspect_ai/util/_sandbox/__init__.py +0 -3
inspect_ai/util/_sandbox/context.py +5 -1
inspect_ai/util/_sandbox/docker/compose.py +16 -10
inspect_ai/util/_sandbox/docker/docker.py +9 -6
inspect_ai/util/_sandbox/docker/internal.py +1 -1
inspect_ai/util/_sandbox/docker/util.py +2 -2
inspect_ai/util/_sandbox/environment.py +6 -5
inspect_ai/util/_sandbox/local.py +1 -1
inspect_ai/util/_sandbox/service.py +22 -7
inspect_ai/util/_store.py +5 -6
inspect_ai/util/_store_model.py +110 -0
inspect_ai/util/_throttle.py +32 -0
{inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/METADATA +2 -2
{inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/RECORD +98 -76
{inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/WHEEL +0 -0
{inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.54.dist-info → inspect_ai-0.3.56.dist-info}/top_level.txt +0 -0

inspect_ai/model/_render.py CHANGED Viewed

@@ -3,13 +3,20 @@ from rich.console import RenderableType
 from inspect_ai.tool._tool_call import ToolCall
 from inspect_ai.tool._tool_transcript import transcript_tool_call
-from ._chat_message import ChatMessage, ChatMessageAssistant, ChatMessageTool
+from ._chat_message import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ChatMessageTool,
+    ChatMessageUser,
+)
 def messages_preceding_assistant(messages: list[ChatMessage]) -> list[ChatMessage]:
     preceding: list[ChatMessage] = []
     for m in reversed(messages):
-        if not isinstance(m, ChatMessageTool | ChatMessageAssistant):
+        if not isinstance(m, ChatMessageTool | ChatMessageAssistant) and not (
+            isinstance(m, ChatMessageUser) and m.tool_call_id
+        ):
             preceding.append(m)
         else:
             break

inspect_ai/scorer/_metric.py CHANGED Viewed

@@ -90,6 +90,13 @@ class Score(BaseModel):
         """Read the score as a boolean."""
         return bool(self._as_scalar())
+    def as_list(self) -> list[str | int | float | bool]:
+        """Read the score as a list."""
+        if isinstance(self.value, list):
+            return self.value
+        else:
+            raise ValueError("This score is not a list")
     def as_dict(self) -> dict[str, str | int | float | bool | None]:
         """Read the score as a dictionary."""
         if isinstance(self.value, dict):
@@ -104,13 +111,17 @@ class Score(BaseModel):
             raise ValueError("This score is not a scalar")
-class SampleScore(Score):
+class SampleScore(BaseModel):
     """Score for a Sample
     Args:
+       score: Score
        sample_id: (str | int | None) Unique id of a sample
     """
+    score: Score
+    """A score"""
     sample_id: str | int | None = Field(default=None)
     """A sample id"""

inspect_ai/solver/__init__.py CHANGED Viewed

@@ -4,6 +4,7 @@ from ._basic_agent import basic_agent
 from ._chain import chain
 from ._critique import self_critique
 from ._fork import fork
+from ._human_agent.agent import human_agent
 from ._multiple_choice import MultipleChoiceTemplate, multiple_choice
 from ._plan import Plan, plan
 from ._prompt import (
@@ -17,6 +18,7 @@ from ._use_tools import use_tools
 __all__ = [
     "basic_agent",
+    "human_agent",
     "chain",
     "fork",
     "generate",

inspect_ai/solver/_human_agent/agent.py ADDED Viewed

@@ -0,0 +1,83 @@
+import asyncio
+from inspect_ai.util import display_type, input_panel, sandbox
+from .._solver import Generate, Solver, solver
+from .._task_state import TaskState
+from .commands import human_agent_commands
+from .install import install_human_agent
+from .panel import HumanAgentPanel
+from .service import run_human_agent_service
+from .view import ConsoleView, HumanAgentView
+@solver
+def human_agent(
+    answer: bool | str = True,
+    intermediate_scoring: bool = False,
+    record_session: bool = True,
+) -> Solver:
+    """Human solver for agentic tasks that run in a Linux environment.
+    The Human agent solver installs agent task tools in the default
+    sandbox and presents the user with both task instructions and
+    documentation for the various tools (e.g. `task submit`,
+    `task start`, `task stop` `task instructions`, etc.). A human agent panel
+    is displayed with instructions for logging in to the sandbox.
+    If the user is running in VS Code with the Inspect extension,
+    they will also be presented with links to login to the sandbox
+    using a VS Code Window or Terminal.
+    Args:
+       answer (bool | str): Is an explicit answer required for this
+          task or is it scored based on files in the container? Pass a
+          `str` with a regex to validate that the answer matches
+          the expected format.
+       intermediate_scoring (bool): Allow the human agent to
+          check their score while working.
+       record_session (bool): Record all user commands and outputs in
+          the sandbox bash session.
+    Returns:
+       Solver: Human agent solver.
+    """
+    # we can only run one human agent interaction at a time (use lock to enforce)
+    agent_lock = asyncio.Lock()
+    async def solve(state: TaskState, generate: Generate) -> TaskState:
+        async with agent_lock:
+            # ensure that we have a sandbox to work with
+            try:
+                connection = await sandbox().connection()
+            except ProcessLookupError:
+                raise RuntimeError("Human agent must run in a task with a sandbox.")
+            except NotImplementedError:
+                raise RuntimeError(
+                    "Human agent must run with a sandbox that supports connections."
+                )
+            # helper function to run the agent (called for fullscreen vs. fallback below)
+            async def run_human_agent(view: HumanAgentView) -> TaskState:
+                # create agent commands
+                commands = human_agent_commands(
+                    state, answer, intermediate_scoring, record_session
+                )
+                # install agent tools
+                await install_human_agent(state, commands, record_session)
+                # hookup the view ui
+                view.connect(connection)
+                # run sandbox service
+                return await run_human_agent_service(state, commands, view)
+            # support both fullscreen ui and fallback
+            if display_type() == "full":
+                async with await input_panel(HumanAgentPanel) as panel:
+                    return await run_human_agent(panel)
+            else:
+                return await run_human_agent(ConsoleView())
+    return solve

inspect_ai/solver/_human_agent/commands/__init__.py ADDED Viewed

@@ -0,0 +1,36 @@
+from inspect_ai.solver._task_state import TaskState
+from .clock import StartCommand, StopCommand
+from .command import HumanAgentCommand
+from .instructions import InstructionsCommand
+from .note import NoteCommand
+from .score import ScoreCommand
+from .status import StatusCommand
+from .submit import SubmitCommand, ValidateCommand
+def human_agent_commands(
+    state: TaskState,
+    answer: bool | str,
+    intermediate_scoring: bool,
+    record_session: bool,
+) -> list[HumanAgentCommand]:
+    # base submit and validate
+    commands = [SubmitCommand(record_session), ValidateCommand(answer)]
+    # optional intermediate scoring
+    if intermediate_scoring:
+        commands.append(ScoreCommand(state))
+    # remaining commands
+    commands.extend(
+        [
+            NoteCommand(),
+            StatusCommand(),
+            StartCommand(),
+            StopCommand(),
+        ]
+    )
+    # with instructions (letting it see the other commands)
+    return commands + [InstructionsCommand(commands)]

inspect_ai/solver/_human_agent/commands/clock.py ADDED Viewed

@@ -0,0 +1,70 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from inspect_ai._util.format import format_progress_time
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+from .status import render_status
+class StartCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "start"
+    @property
+    def description(self) -> str:
+        return "Start the task clock (resume working)."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("start"))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+        async def start() -> str:
+            if not state.running:
+                state.running = True
+                transcript().info(
+                    f"Task started (total time: {format_progress_time(state.time)})"
+                )
+            return render_status(state)
+        return start
+class StopCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "stop"
+    @property
+    def description(self) -> str:
+        return "Stop the task clock (pause working)."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("stop"))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+        async def stop() -> str:
+            if state.running:
+                state.running = False
+                transcript().info(
+                    f"Task stopped (total time: {format_progress_time(state.time)})"
+                )
+            return render_status(state)
+        return stop

inspect_ai/solver/_human_agent/commands/command.py ADDED Viewed

@@ -0,0 +1,59 @@
+import abc
+from argparse import Namespace
+from typing import Any, Awaitable, Callable, Literal, NamedTuple
+from pydantic import JsonValue
+from ..state import HumanAgentState
+class HumanAgentCommand:
+    @property
+    @abc.abstractmethod
+    def name(self) -> str:
+        """Command name (e.g. 'submit')"""
+        ...
+    @property
+    @abc.abstractmethod
+    def description(self) -> str:
+        """Command description."""
+        ...
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+    @property
+    def contexts(self) -> list[Literal["cli", "service"]]:
+        """Contexts where this command runs (defaults to both cli and service)."""
+        return ["cli", "service"]
+    class CLIArg(NamedTuple):
+        name: str
+        description: str
+        required: bool = False
+    @property
+    def cli_args(self) -> list[CLIArg]:
+        """Positional command line arguments."""
+        return []
+    def cli(self, args: Namespace) -> None:
+        """CLI command (runs in container). Required for context "cli"."""
+        pass
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        """Service handler (runs in solver). Required for context "service"."""
+        async def no_handler() -> None:
+            pass
+        return no_handler
+# Dummy functions for implementation of call methods
+def call_human_agent(method: str, **params: Any) -> Any:
+    return None

inspect_ai/solver/_human_agent/commands/instructions.py ADDED Viewed

@@ -0,0 +1,74 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from rich.console import Group
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+from inspect_ai._util.ansi import render_text
+from inspect_ai._util.transcript import DOUBLE_LINE
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+class InstructionsCommand(HumanAgentCommand):
+    def __init__(self, commands: list[HumanAgentCommand]) -> None:
+        self._commands = commands.copy() + [self]
+    @property
+    def name(self) -> str:
+        return "instructions"
+    @property
+    def description(self) -> str:
+        return "Display task commands and instructions."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 3
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("instructions", **vars(args)))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def instructions() -> str:
+            intro = "\nYou will be completing a task based on the instructions presented below. You can use the following commands as you work on the task:\n"
+            commands_table = Table(box=None, show_header=False)
+            commands_table.add_column("", justify="left")
+            commands_table.add_column("", justify="left")
+            def add_command_group(group: int) -> None:
+                for command in filter(
+                    lambda c: "cli" in c.contexts and c.group == group, self._commands
+                ):
+                    commands_table.add_row(f"task {command.name}", command.description)
+                if group != 3:
+                    commands_table.add_row("", "")
+            for i in range(1, 4):
+                add_command_group(i)
+            header_panel = Panel(
+                Group(intro, commands_table),
+                title=Text.from_markup("[bold]Human Agent Task[/bold]"),
+                box=DOUBLE_LINE,
+                padding=(0, 0),
+            )
+            instructions_panel = Panel(
+                f"{state.instructions.strip()}",
+                title="Task Instructions",
+                padding=(1, 1),
+            )
+            return render_text(
+                ["", header_panel, instructions_panel],
+                styles=False,
+                no_color=True,
+                width=90,
+            )
+        return instructions

inspect_ai/solver/_human_agent/commands/note.py ADDED Viewed

@@ -0,0 +1,42 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+class NoteCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "note"
+    @property
+    def description(self) -> str:
+        return "Record a note in the task transcript."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+    def cli(self, args: Namespace) -> None:
+        print(
+            "Enter a multiline markdown note (Press Ctrl+D on a new line to finish):\n"
+        )
+        lines = ["## Human Agent Note"]
+        try:
+            while True:
+                line = input()
+                lines.append(line)
+        except EOFError:
+            pass
+        call_human_agent("note", content="\n".join(lines))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+        async def note(content: str) -> None:
+            transcript().info(content)
+        return note

inspect_ai/solver/_human_agent/commands/score.py ADDED Viewed

@@ -0,0 +1,80 @@
+from argparse import Namespace
+from copy import deepcopy
+from textwrap import dedent
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from inspect_ai._util.ansi import render_text
+from inspect_ai.model._model_output import ModelOutput
+from inspect_ai.scorer._score import score
+from ..._task_state import TaskState
+from ..state import HumanAgentState, IntermediateScoring
+from .command import HumanAgentCommand, call_human_agent
+class ScoreCommand(HumanAgentCommand):
+    def __init__(self, state: TaskState):
+        self._state = state
+    @property
+    def name(self) -> str:
+        return "score"
+    @property
+    def description(self) -> str:
+        return "Score the task to check progress."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+    @property
+    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
+        return [
+            HumanAgentCommand.CLIArg(
+                name="answer",
+                description="Answer to submit for scoring (optional, not required for all tasks)",
+            )
+        ]
+    def cli(self, args: Namespace) -> None:
+        # first validate (print and exit if we get a str back)
+        call_args = vars(args)
+        error = call_human_agent("validate", **call_args)
+        if error:
+            print(error)
+            return
+        print(call_human_agent("score", **call_args))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def score_task(answer: str | None) -> str:
+            from inspect_ai.log._transcript import transcript
+            # make a copy of TaskState, add the answer, then score
+            if answer:
+                task_state = deepcopy(self._state)
+                task_state.output = ModelOutput.from_content("human_agent", answer)
+                result = await score(task_state)
+            else:
+                result = await score(self._state)
+            # record the scoring action in our state
+            state.scorings.append(IntermediateScoring(time=state.time, scores=result))
+            # record to transcript
+            transcript().info(
+                dedent(f"""
+            ### Intermediate Score
+            **Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
+            """)
+            )
+            # notify user
+            return render_text(
+                f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
+            )
+        return score_task

inspect_ai/solver/_human_agent/commands/status.py ADDED Viewed

@@ -0,0 +1,62 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from rich.console import RenderableType
+from rich.table import Table
+from rich.text import Text
+from inspect_ai._util.ansi import render_text
+from inspect_ai._util.format import format_progress_time
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+class StatusCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "status"
+    @property
+    def description(self) -> str:
+        return "Print task status (clock, scoring, etc.)"
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("status"))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def status() -> str:
+            return render_status(state)
+        return status
+def render_status(state: HumanAgentState) -> str:
+    content: list[RenderableType] = [""]
+    content.append(
+        f"[bold]Status:[/bold] {'Running' if state.running else 'Stopped'}  "
+        + f"[bold]Time:[/bold] {format_progress_time(state.time, pad_hours=False)}"
+    )
+    if len(state.scorings) > 0:
+        content.append("")
+        content.append(Text.from_markup("[italic]Intermediate Scores[/italic]"))
+        scores_table = Table(box=None, min_width=35, padding=(0, 0))
+        scores_table.add_column("Answer", justify="left")
+        scores_table.add_column("Score", justify="center")
+        scores_table.add_column("Time", justify="right")
+        for score in state.scorings:
+            scores_table.add_row(
+                score.scores[0].answer,
+                score.scores[0].as_str(),
+                format_progress_time(score.time),
+            )
+        content.append(scores_table)
+    return render_text(content, highlight=False)

inspect-ai 0.3.54__py3-none-any.whl → 0.3.56__py3-none-any.whl

inspect-ai 0.3.54__py3-none-any.whl → 0.3.56__py3-none-any.whl