PyPI - inspect-ai - Versions diffs - 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl - Mend

inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131) hide show

inspect_ai/__init__.py +1 -0
inspect_ai/_cli/common.py +1 -1
inspect_ai/_cli/trace.py +33 -20
inspect_ai/_display/core/active.py +1 -1
inspect_ai/_display/core/display.py +1 -1
inspect_ai/_display/core/footer.py +1 -1
inspect_ai/_display/core/panel.py +1 -1
inspect_ai/_display/core/progress.py +0 -6
inspect_ai/_display/core/rich.py +1 -1
inspect_ai/_display/rich/display.py +2 -2
inspect_ai/_display/textual/app.py +15 -17
inspect_ai/_display/textual/widgets/clock.py +3 -3
inspect_ai/_display/textual/widgets/samples.py +6 -13
inspect_ai/_eval/context.py +9 -1
inspect_ai/_eval/run.py +16 -11
inspect_ai/_eval/score.py +4 -10
inspect_ai/_eval/task/results.py +5 -4
inspect_ai/_eval/task/run.py +6 -12
inspect_ai/_eval/task/task.py +10 -0
inspect_ai/_util/ansi.py +31 -0
inspect_ai/_util/datetime.py +1 -1
inspect_ai/_util/deprecation.py +1 -1
inspect_ai/_util/format.py +7 -0
inspect_ai/_util/json.py +11 -1
inspect_ai/_util/logger.py +14 -13
inspect_ai/_util/throttle.py +10 -1
inspect_ai/_util/trace.py +79 -47
inspect_ai/_util/transcript.py +37 -4
inspect_ai/_util/vscode.py +51 -0
inspect_ai/_view/notify.py +2 -1
inspect_ai/_view/www/.prettierrc.js +12 -0
inspect_ai/_view/www/App.css +22 -1
inspect_ai/_view/www/dist/assets/index.css +2374 -2
inspect_ai/_view/www/dist/assets/index.js +29752 -24492
inspect_ai/_view/www/log-schema.json +262 -215
inspect_ai/_view/www/package.json +1 -0
inspect_ai/_view/www/src/App.mjs +19 -9
inspect_ai/_view/www/src/Types.mjs +0 -1
inspect_ai/_view/www/src/api/Types.mjs +15 -4
inspect_ai/_view/www/src/api/api-http.mjs +2 -0
inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
inspect_ai/_view/www/src/components/MessageContent.mjs +1 -1
inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
inspect_ai/_view/www/src/components/Tools.mjs +28 -5
inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +238 -178
inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
inspect_ai/_view/www/src/types/log.d.ts +28 -20
inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
inspect_ai/_view/www/yarn.lock +44 -0
inspect_ai/approval/_apply.py +4 -0
inspect_ai/approval/_human/panel.py +5 -8
inspect_ai/dataset/_dataset.py +51 -10
inspect_ai/dataset/_util.py +31 -3
inspect_ai/log/__init__.py +2 -0
inspect_ai/log/_log.py +30 -2
inspect_ai/log/_recorders/eval.py +2 -0
inspect_ai/model/_call_tools.py +31 -7
inspect_ai/model/_chat_message.py +3 -0
inspect_ai/model/_model.py +42 -1
inspect_ai/model/_providers/anthropic.py +4 -0
inspect_ai/model/_providers/google.py +24 -6
inspect_ai/model/_providers/openai.py +17 -3
inspect_ai/model/_providers/openai_o1.py +10 -12
inspect_ai/model/_render.py +9 -2
inspect_ai/scorer/_metric.py +12 -1
inspect_ai/solver/__init__.py +2 -0
inspect_ai/solver/_human_agent/agent.py +83 -0
inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
inspect_ai/solver/_human_agent/commands/clock.py +70 -0
inspect_ai/solver/_human_agent/commands/command.py +59 -0
inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
inspect_ai/solver/_human_agent/commands/note.py +42 -0
inspect_ai/solver/_human_agent/commands/score.py +80 -0
inspect_ai/solver/_human_agent/commands/status.py +62 -0
inspect_ai/solver/_human_agent/commands/submit.py +151 -0
inspect_ai/solver/_human_agent/install.py +222 -0
inspect_ai/solver/_human_agent/panel.py +252 -0
inspect_ai/solver/_human_agent/service.py +45 -0
inspect_ai/solver/_human_agent/state.py +55 -0
inspect_ai/solver/_human_agent/view.py +24 -0
inspect_ai/solver/_task_state.py +28 -2
inspect_ai/tool/_tool.py +10 -2
inspect_ai/tool/_tool_info.py +2 -1
inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
inspect_ai/tool/_tools/_web_browser/_web_browser.py +16 -13
inspect_ai/util/__init__.py +12 -4
inspect_ai/{_util/display.py → util/_display.py} +6 -0
inspect_ai/util/_panel.py +31 -9
inspect_ai/util/_sandbox/__init__.py +0 -3
inspect_ai/util/_sandbox/context.py +5 -1
inspect_ai/util/_sandbox/docker/compose.py +17 -13
inspect_ai/util/_sandbox/docker/docker.py +9 -6
inspect_ai/util/_sandbox/docker/internal.py +1 -1
inspect_ai/util/_sandbox/docker/util.py +3 -2
inspect_ai/util/_sandbox/environment.py +6 -5
inspect_ai/util/_sandbox/local.py +1 -1
inspect_ai/util/_sandbox/self_check.py +18 -18
inspect_ai/util/_sandbox/service.py +22 -7
inspect_ai/util/_store.py +7 -8
inspect_ai/util/_store_model.py +110 -0
inspect_ai/util/_subprocess.py +3 -3
inspect_ai/util/_throttle.py +32 -0
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/METADATA +3 -3
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/RECORD +131 -108
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/WHEEL +1 -1
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/LICENSE +0 -0
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/entry_points.txt +0 -0
{inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/top_level.txt +0 -0

inspect_ai/solver/_human_agent/commands/clock.py ADDED Viewed

@@ -0,0 +1,70 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from inspect_ai._util.format import format_progress_time
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+from .status import render_status
+class StartCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "start"
+    @property
+    def description(self) -> str:
+        return "Start the task clock (resume working)."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("start"))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+        async def start() -> str:
+            if not state.running:
+                state.running = True
+                transcript().info(
+                    f"Task started (total time: {format_progress_time(state.time)})"
+                )
+            return render_status(state)
+        return start
+class StopCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "stop"
+    @property
+    def description(self) -> str:
+        return "Stop the task clock (pause working)."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("stop"))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+        async def stop() -> str:
+            if state.running:
+                state.running = False
+                transcript().info(
+                    f"Task stopped (total time: {format_progress_time(state.time)})"
+                )
+            return render_status(state)
+        return stop

inspect_ai/solver/_human_agent/commands/command.py ADDED Viewed

@@ -0,0 +1,59 @@
+import abc
+from argparse import Namespace
+from typing import Any, Awaitable, Callable, Literal, NamedTuple
+from pydantic import JsonValue
+from ..state import HumanAgentState
+class HumanAgentCommand:
+    @property
+    @abc.abstractmethod
+    def name(self) -> str:
+        """Command name (e.g. 'submit')"""
+        ...
+    @property
+    @abc.abstractmethod
+    def description(self) -> str:
+        """Command description."""
+        ...
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+    @property
+    def contexts(self) -> list[Literal["cli", "service"]]:
+        """Contexts where this command runs (defaults to both cli and service)."""
+        return ["cli", "service"]
+    class CLIArg(NamedTuple):
+        name: str
+        description: str
+        required: bool = False
+    @property
+    def cli_args(self) -> list[CLIArg]:
+        """Positional command line arguments."""
+        return []
+    def cli(self, args: Namespace) -> None:
+        """CLI command (runs in container). Required for context "cli"."""
+        pass
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        """Service handler (runs in solver). Required for context "service"."""
+        async def no_handler() -> None:
+            pass
+        return no_handler
+# Dummy functions for implementation of call methods
+def call_human_agent(method: str, **params: Any) -> Any:
+    """Placeholder for calling a human agent service method by name.
+    This stub ignores `method` and `params` and always returns None.
+    NOTE(review): presumably replaced by a real implementation where the CLI
+    actually runs -- confirm against the code that installs the commands.
+    """
+    return None

inspect_ai/solver/_human_agent/commands/instructions.py ADDED Viewed

@@ -0,0 +1,74 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from rich.console import Group
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+from inspect_ai._util.ansi import render_text
+from inspect_ai._util.transcript import DOUBLE_LINE
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+class InstructionsCommand(HumanAgentCommand):
+    def __init__(self, commands: list[HumanAgentCommand]) -> None:
+        self._commands = commands.copy() + [self]
+    @property
+    def name(self) -> str:
+        return "instructions"
+    @property
+    def description(self) -> str:
+        return "Display task commands and instructions."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 3
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("instructions", **vars(args)))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def instructions() -> str:
+            intro = "\nYou will be completing a task based on the instructions presented below. You can use the following commands as you work on the task:\n"
+            commands_table = Table(box=None, show_header=False)
+            commands_table.add_column("", justify="left")
+            commands_table.add_column("", justify="left")
+            def add_command_group(group: int) -> None:
+                for command in filter(
+                    lambda c: "cli" in c.contexts and c.group == group, self._commands
+                ):
+                    commands_table.add_row(f"task {command.name}", command.description)
+                if group != 3:
+                    commands_table.add_row("", "")
+            for i in range(1, 4):
+                add_command_group(i)
+            header_panel = Panel(
+                Group(intro, commands_table),
+                title=Text.from_markup("[bold]Human Agent Task[/bold]"),
+                box=DOUBLE_LINE,
+                padding=(0, 0),
+            )
+            instructions_panel = Panel(
+                f"{state.instructions.strip()}",
+                title="Task Instructions",
+                padding=(1, 1),
+            )
+            return render_text(
+                ["", header_panel, instructions_panel],
+                styles=False,
+                no_color=True,
+                width=90,
+            )
+        return instructions

inspect_ai/solver/_human_agent/commands/note.py ADDED Viewed

@@ -0,0 +1,42 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+class NoteCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "note"
+    @property
+    def description(self) -> str:
+        return "Record a note in the task transcript."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+    def cli(self, args: Namespace) -> None:
+        print(
+            "Enter a multiline markdown note (Press Ctrl+D on a new line to finish):\n"
+        )
+        lines = ["## Human Agent Note"]
+        try:
+            while True:
+                line = input()
+                lines.append(line)
+        except EOFError:
+            pass
+        call_human_agent("note", content="\n".join(lines))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+        async def note(content: str) -> None:
+            transcript().info(content)
+        return note

inspect_ai/solver/_human_agent/commands/score.py ADDED Viewed

@@ -0,0 +1,80 @@
+from argparse import Namespace
+from copy import deepcopy
+from textwrap import dedent
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from inspect_ai._util.ansi import render_text
+from inspect_ai.model._model_output import ModelOutput
+from inspect_ai.scorer._score import score
+from ..._task_state import TaskState
+from ..state import HumanAgentState, IntermediateScoring
+from .command import HumanAgentCommand, call_human_agent
+class ScoreCommand(HumanAgentCommand):
+    def __init__(self, state: TaskState):
+        self._state = state
+    @property
+    def name(self) -> str:
+        return "score"
+    @property
+    def description(self) -> str:
+        return "Score the task to check progress."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+    @property
+    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
+        return [
+            HumanAgentCommand.CLIArg(
+                name="answer",
+                description="Answer to submit for scoring (optional, not required for all tasks)",
+            )
+        ]
+    def cli(self, args: Namespace) -> None:
+        # first validate (print and exit if we get a str back)
+        call_args = vars(args)
+        error = call_human_agent("validate", **call_args)
+        if error:
+            print(error)
+            return
+        print(call_human_agent("score", **call_args))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def score_task(answer: str | None) -> str:
+            from inspect_ai.log._transcript import transcript
+            # make a copy of TaskState, add the answer, then score
+            if answer:
+                task_state = deepcopy(self._state)
+                task_state.output = ModelOutput.from_content("human_agent", answer)
+                result = await score(task_state)
+            else:
+                result = await score(self._state)
+            # record the scoring action in our state
+            state.scorings.append(IntermediateScoring(time=state.time, scores=result))
+            # record to transcript
+            transcript().info(
+                dedent(f"""
+            ### Intermediate Score
+            **Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
+            """)
+            )
+            # notify user
+            return render_text(
+                f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
+            )
+        return score_task

inspect_ai/solver/_human_agent/commands/status.py ADDED Viewed

@@ -0,0 +1,62 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from rich.console import RenderableType
+from rich.table import Table
+from rich.text import Text
+from inspect_ai._util.ansi import render_text
+from inspect_ai._util.format import format_progress_time
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+class StatusCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "status"
+    @property
+    def description(self) -> str:
+        return "Print task status (clock, scoring, etc.)"
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("status"))
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def status() -> str:
+            return render_status(state)
+        return status
+def render_status(state: HumanAgentState) -> str:
+    content: list[RenderableType] = [""]
+    content.append(
+        f"[bold]Status:[/bold] {'Running' if state.running else 'Stopped'}  "
+        + f"[bold]Time:[/bold] {format_progress_time(state.time, pad_hours=False)}"
+    )
+    if len(state.scorings) > 0:
+        content.append("")
+        content.append(Text.from_markup("[italic]Intermediate Scores[/italic]"))
+        scores_table = Table(box=None, min_width=35, padding=(0, 0))
+        scores_table.add_column("Answer", justify="left")
+        scores_table.add_column("Score", justify="center")
+        scores_table.add_column("Time", justify="right")
+        for score in state.scorings:
+            scores_table.add_row(
+                score.scores[0].answer,
+                score.scores[0].as_str(),
+                format_progress_time(score.time),
+            )
+        content.append(scores_table)
+    return render_text(content, highlight=False)

inspect_ai/solver/_human_agent/commands/submit.py ADDED Viewed

@@ -0,0 +1,151 @@
+from argparse import Namespace
+from logging import getLogger
+from pathlib import PurePosixPath
+from re import Pattern, compile, match
+from typing import Awaitable, Callable, Literal
+from pydantic import JsonValue
+from inspect_ai._util.ansi import render_text
+from inspect_ai.util._sandbox import sandbox
+from ..install import RECORD_SESSION_DIR
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+logger = getLogger(__name__)
+class SubmitCommand(HumanAgentCommand):
+    def __init__(self, record_session: bool):
+        super().__init__()
+        self._record_session = record_session
+    @property
+    def name(self) -> str:
+        return "submit"
+    @property
+    def description(self) -> str:
+        return "Submit your final answer for the task."
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+    @property
+    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
+        return [
+            HumanAgentCommand.CLIArg(
+                name="answer",
+                description="Answer to submit for scoring (optional, not required for all tasks)",
+            )
+        ]
+    def cli(self, args: Namespace) -> None:
+        # read cli args
+        call_args = vars(args)
+        # first validate (print and exit if we get a str back)
+        error = call_human_agent("validate", **call_args)
+        if error:
+            print(error)
+            return
+        # verify that the user wants to proceed
+        answer = call_args.get("answer", None)
+        answer_text = f" '{answer}'" if answer else ""
+        while True:
+            response = (
+                input(
+                    f"\nDo you definitely want to end the task and submit{answer_text}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
+                )
+                .lower()
+                .strip()
+            )
+            if response in ["yes", "y"]:
+                break
+            elif response in ["no", "n"]:
+                return
+            else:
+                print("Please enter yes or no.")
+        # thank the user!
+        print(
+            "\nThank you for working on this task!\n\n"
+            + "Your task will now be scored and you will be disconnected from this container.\n"
+        )
+        # submit the task
+        call_human_agent("submit", **call_args)
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def submit(
+            answer: str | None, session_logs: dict[str, str] | None = None
+        ) -> None:
+            if self._record_session:
+                state.logs = await self._read_session_logs()
+            state.running = False
+            state.answer = answer
+        return submit
+    async def _read_session_logs(self) -> dict[str, str]:
+        # retreive session logs (don't fail)
+        sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
+        result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
+        if not result.success:
+            logger.warning(f"Error listing human agent session logs: {result.stderr}")
+            return {}
+        # read logs
+        session_logs: dict[str, str] = {}
+        for session_log in result.stdout.strip().splitlines():
+            try:
+                session_logs[session_log] = await sandbox().read_file(
+                    (sessions_dir / session_log).as_posix()
+                )
+            except Exception as ex:
+                logger.warning(f"Error reading human agent session log: {ex}")
+        return session_logs
+class ValidateCommand(HumanAgentCommand):
+    def __init__(self, answer: bool | str) -> None:
+        self._answer = compile(answer) if isinstance(answer, str) else answer
+    @property
+    def name(self) -> str:
+        return "validate"
+    @property
+    def description(self) -> str:
+        return "Validate a task submission."
+    @property
+    def contexts(self) -> list[Literal["cli", "service"]]:
+        return ["service"]
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def validate(answer: str | None) -> str | None:
+            def failed(reason: str) -> str:
+                return render_text(f"[bold]FAILED:[/bold] {reason}")
+            if not state.running:
+                return failed("Task is stopped (use 'task start' to start)")
+            if self._answer:
+                answer = answer.strip() if isinstance(answer, str) else answer
+                if not answer:
+                    return failed(
+                        "An explicit answer is required for scoring this task."
+                    )
+                elif isinstance(self._answer, Pattern) and not match(
+                    self._answer, answer
+                ):
+                    return failed(
+                        "Your answer was not in the required format (please review the task instructions)"
+                    )
+            return None  # made it through verification
+        return validate

inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl

inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl