inspect-ai 0.3.55__py3-none-any.whl → 0.3.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +1 -0
- inspect_ai/_cli/common.py +1 -1
- inspect_ai/_cli/trace.py +33 -20
- inspect_ai/_display/core/active.py +1 -1
- inspect_ai/_display/core/display.py +1 -1
- inspect_ai/_display/core/footer.py +1 -1
- inspect_ai/_display/core/progress.py +0 -6
- inspect_ai/_display/core/rich.py +1 -1
- inspect_ai/_display/rich/display.py +2 -2
- inspect_ai/_display/textual/app.py +15 -17
- inspect_ai/_display/textual/widgets/clock.py +3 -3
- inspect_ai/_display/textual/widgets/samples.py +6 -13
- inspect_ai/_eval/context.py +9 -1
- inspect_ai/_eval/score.py +4 -10
- inspect_ai/_eval/task/results.py +5 -4
- inspect_ai/_eval/task/run.py +6 -12
- inspect_ai/_eval/task/task.py +10 -0
- inspect_ai/_util/ansi.py +31 -0
- inspect_ai/_util/format.py +7 -0
- inspect_ai/_util/logger.py +12 -12
- inspect_ai/_util/throttle.py +10 -1
- inspect_ai/_util/trace.py +43 -47
- inspect_ai/_util/transcript.py +4 -0
- inspect_ai/_util/vscode.py +51 -0
- inspect_ai/_view/notify.py +2 -1
- inspect_ai/_view/www/App.css +22 -1
- inspect_ai/_view/www/dist/assets/index.css +2374 -2
- inspect_ai/_view/www/dist/assets/index.js +29622 -24424
- inspect_ai/_view/www/log-schema.json +138 -90
- inspect_ai/_view/www/package.json +1 -0
- inspect_ai/_view/www/src/App.mjs +1 -0
- inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
- inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
- inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
- inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
- inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
- inspect_ai/_view/www/src/components/Tools.mjs +11 -3
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
- inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
- inspect_ai/_view/www/src/types/log.d.ts +26 -12
- inspect_ai/_view/www/yarn.lock +44 -0
- inspect_ai/approval/_apply.py +4 -0
- inspect_ai/approval/_human/panel.py +5 -8
- inspect_ai/dataset/_dataset.py +51 -10
- inspect_ai/dataset/_util.py +31 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +5 -2
- inspect_ai/model/_call_tools.py +4 -2
- inspect_ai/model/_chat_message.py +3 -0
- inspect_ai/model/_model.py +42 -1
- inspect_ai/model/_providers/anthropic.py +4 -0
- inspect_ai/model/_render.py +9 -2
- inspect_ai/scorer/_metric.py +12 -1
- inspect_ai/solver/__init__.py +2 -0
- inspect_ai/solver/_human_agent/agent.py +83 -0
- inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
- inspect_ai/solver/_human_agent/commands/clock.py +70 -0
- inspect_ai/solver/_human_agent/commands/command.py +59 -0
- inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
- inspect_ai/solver/_human_agent/commands/note.py +42 -0
- inspect_ai/solver/_human_agent/commands/score.py +80 -0
- inspect_ai/solver/_human_agent/commands/status.py +62 -0
- inspect_ai/solver/_human_agent/commands/submit.py +151 -0
- inspect_ai/solver/_human_agent/install.py +222 -0
- inspect_ai/solver/_human_agent/panel.py +252 -0
- inspect_ai/solver/_human_agent/service.py +45 -0
- inspect_ai/solver/_human_agent/state.py +55 -0
- inspect_ai/solver/_human_agent/view.py +24 -0
- inspect_ai/solver/_task_state.py +28 -2
- inspect_ai/tool/_tool.py +10 -2
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +13 -10
- inspect_ai/util/__init__.py +8 -4
- inspect_ai/{_util/display.py → util/_display.py} +6 -0
- inspect_ai/util/_panel.py +31 -9
- inspect_ai/util/_sandbox/__init__.py +0 -3
- inspect_ai/util/_sandbox/context.py +5 -1
- inspect_ai/util/_sandbox/docker/compose.py +16 -10
- inspect_ai/util/_sandbox/docker/docker.py +9 -6
- inspect_ai/util/_sandbox/docker/internal.py +1 -1
- inspect_ai/util/_sandbox/docker/util.py +2 -2
- inspect_ai/util/_sandbox/environment.py +6 -5
- inspect_ai/util/_sandbox/local.py +1 -1
- inspect_ai/util/_sandbox/service.py +22 -7
- inspect_ai/util/_store.py +5 -6
- inspect_ai/util/_store_model.py +110 -0
- inspect_ai/util/_throttle.py +32 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/RECORD +95 -73
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/WHEEL +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/top_level.txt +0 -0
inspect_ai/solver/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from ._basic_agent import basic_agent
 from ._chain import chain
 from ._critique import self_critique
 from ._fork import fork
+from ._human_agent.agent import human_agent
 from ._multiple_choice import MultipleChoiceTemplate, multiple_choice
 from ._plan import Plan, plan
 from ._prompt import (
@@ -17,6 +18,7 @@ from ._use_tools import use_tools

 __all__ = [
     "basic_agent",
+    "human_agent",
     "chain",
     "fork",
     "generate",
inspect_ai/solver/_human_agent/agent.py
ADDED
@@ -0,0 +1,83 @@
+import asyncio
+
+from inspect_ai.util import display_type, input_panel, sandbox
+
+from .._solver import Generate, Solver, solver
+from .._task_state import TaskState
+from .commands import human_agent_commands
+from .install import install_human_agent
+from .panel import HumanAgentPanel
+from .service import run_human_agent_service
+from .view import ConsoleView, HumanAgentView
+
+
+@solver
+def human_agent(
+    answer: bool | str = True,
+    intermediate_scoring: bool = False,
+    record_session: bool = True,
+) -> Solver:
+    """Human solver for agentic tasks that run in a Linux environment.
+
+    The Human agent solver installs agent task tools in the default
+    sandbox and presents the user with both task instructions and
+    documentation for the various tools (e.g. `task submit`,
+    `task start`, `task stop`, `task instructions`, etc.). A human agent panel
+    is displayed with instructions for logging in to the sandbox.
+
+    If the user is running in VS Code with the Inspect extension,
+    they will also be presented with links to login to the sandbox
+    using a VS Code Window or Terminal.
+
+    Args:
+        answer (bool | str): Is an explicit answer required for this
+            task or is it scored based on files in the container? Pass a
+            `str` with a regex to validate that the answer matches
+            the expected format.
+        intermediate_scoring (bool): Allow the human agent to
+            check their score while working.
+        record_session (bool): Record all user commands and outputs in
+            the sandbox bash session.
+
+    Returns:
+        Solver: Human agent solver.
+    """
+    # we can only run one human agent interaction at a time (use lock to enforce)
+    agent_lock = asyncio.Lock()
+
+    async def solve(state: TaskState, generate: Generate) -> TaskState:
+        async with agent_lock:
+            # ensure that we have a sandbox to work with
+            try:
+                connection = await sandbox().connection()
+            except ProcessLookupError:
+                raise RuntimeError("Human agent must run in a task with a sandbox.")
+            except NotImplementedError:
+                raise RuntimeError(
+                    "Human agent must run with a sandbox that supports connections."
+                )
+
+            # helper function to run the agent (called for fullscreen vs. fallback below)
+            async def run_human_agent(view: HumanAgentView) -> TaskState:
+                # create agent commands
+                commands = human_agent_commands(
+                    state, answer, intermediate_scoring, record_session
+                )
+
+                # install agent tools
+                await install_human_agent(state, commands, record_session)
+
+                # hookup the view ui
+                view.connect(connection)
+
+                # run sandbox service
+                return await run_human_agent_service(state, commands, view)
+
+            # support both fullscreen ui and fallback
+            if display_type() == "full":
+                async with await input_panel(HumanAgentPanel) as panel:
+                    return await run_human_agent(panel)
+            else:
+                return await run_human_agent(ConsoleView())
+
+    return solve
inspect_ai/solver/_human_agent/commands/__init__.py
ADDED
@@ -0,0 +1,36 @@
+from inspect_ai.solver._task_state import TaskState
+
+from .clock import StartCommand, StopCommand
+from .command import HumanAgentCommand
+from .instructions import InstructionsCommand
+from .note import NoteCommand
+from .score import ScoreCommand
+from .status import StatusCommand
+from .submit import SubmitCommand, ValidateCommand
+
+
+def human_agent_commands(
+    state: TaskState,
+    answer: bool | str,
+    intermediate_scoring: bool,
+    record_session: bool,
+) -> list[HumanAgentCommand]:
+    # base submit and validate
+    commands = [SubmitCommand(record_session), ValidateCommand(answer)]
+
+    # optional intermediate scoring
+    if intermediate_scoring:
+        commands.append(ScoreCommand(state))
+
+    # remaining commands
+    commands.extend(
+        [
+            NoteCommand(),
+            StatusCommand(),
+            StartCommand(),
+            StopCommand(),
+        ]
+    )
+
+    # with instructions (letting it see the other commands)
+    return commands + [InstructionsCommand(commands)]
inspect_ai/solver/_human_agent/commands/clock.py
ADDED
@@ -0,0 +1,70 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+
+from inspect_ai._util.format import format_progress_time
+
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+from .status import render_status
+
+
+class StartCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "start"
+
+    @property
+    def description(self) -> str:
+        return "Start the task clock (resume working)."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("start"))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+
+        async def start() -> str:
+            if not state.running:
+                state.running = True
+                transcript().info(
+                    f"Task started (total time: {format_progress_time(state.time)})"
+                )
+            return render_status(state)
+
+        return start
+
+
+class StopCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "stop"
+
+    @property
+    def description(self) -> str:
+        return "Stop the task clock (pause working)."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("stop"))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+
+        async def stop() -> str:
+            if state.running:
+                state.running = False
+                transcript().info(
+                    f"Task stopped (total time: {format_progress_time(state.time)})"
+                )
+            return render_status(state)
+
+        return stop
inspect_ai/solver/_human_agent/commands/command.py
ADDED
@@ -0,0 +1,59 @@
+import abc
+from argparse import Namespace
+from typing import Any, Awaitable, Callable, Literal, NamedTuple
+
+from pydantic import JsonValue
+
+from ..state import HumanAgentState
+
+
+class HumanAgentCommand:
+    @property
+    @abc.abstractmethod
+    def name(self) -> str:
+        """Command name (e.g. 'submit')"""
+        ...
+
+    @property
+    @abc.abstractmethod
+    def description(self) -> str:
+        """Command description."""
+        ...
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+
+    @property
+    def contexts(self) -> list[Literal["cli", "service"]]:
+        """Contexts where this command runs (defaults to both cli and service)."""
+        return ["cli", "service"]
+
+    class CLIArg(NamedTuple):
+        name: str
+        description: str
+        required: bool = False
+
+    @property
+    def cli_args(self) -> list[CLIArg]:
+        """Positional command line arguments."""
+        return []
+
+    def cli(self, args: Namespace) -> None:
+        """CLI command (runs in container). Required for context "cli"."""
+        pass
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        """Service handler (runs in solver). Required for context "service"."""
+
+        async def no_handler() -> None:
+            pass
+
+        return no_handler
+
+
+# Dummy functions for implementation of call methods
+
+
+def call_human_agent(method: str, **params: Any) -> Any:
+    return None
inspect_ai/solver/_human_agent/commands/instructions.py
ADDED
@@ -0,0 +1,74 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+from rich.console import Group
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+
+from inspect_ai._util.ansi import render_text
+from inspect_ai._util.transcript import DOUBLE_LINE
+
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+
+
+class InstructionsCommand(HumanAgentCommand):
+    def __init__(self, commands: list[HumanAgentCommand]) -> None:
+        self._commands = commands.copy() + [self]
+
+    @property
+    def name(self) -> str:
+        return "instructions"
+
+    @property
+    def description(self) -> str:
+        return "Display task commands and instructions."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 3
+
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("instructions", **vars(args)))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def instructions() -> str:
+            intro = "\nYou will be completing a task based on the instructions presented below. You can use the following commands as you work on the task:\n"
+            commands_table = Table(box=None, show_header=False)
+            commands_table.add_column("", justify="left")
+            commands_table.add_column("", justify="left")
+
+            def add_command_group(group: int) -> None:
+                for command in filter(
+                    lambda c: "cli" in c.contexts and c.group == group, self._commands
+                ):
+                    commands_table.add_row(f"task {command.name}", command.description)
+                if group != 3:
+                    commands_table.add_row("", "")
+
+            for i in range(1, 4):
+                add_command_group(i)
+
+            header_panel = Panel(
+                Group(intro, commands_table),
+                title=Text.from_markup("[bold]Human Agent Task[/bold]"),
+                box=DOUBLE_LINE,
+                padding=(0, 0),
+            )
+
+            instructions_panel = Panel(
+                f"{state.instructions.strip()}",
+                title="Task Instructions",
+                padding=(1, 1),
+            )
+
+            return render_text(
+                ["", header_panel, instructions_panel],
+                styles=False,
+                no_color=True,
+                width=90,
+            )
+
+        return instructions
inspect_ai/solver/_human_agent/commands/note.py
ADDED
@@ -0,0 +1,42 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+
+
+class NoteCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "note"
+
+    @property
+    def description(self) -> str:
+        return "Record a note in the task transcript."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+
+    def cli(self, args: Namespace) -> None:
+        print(
+            "Enter a multiline markdown note (Press Ctrl+D on a new line to finish):\n"
+        )
+        lines = ["## Human Agent Note"]
+        try:
+            while True:
+                line = input()
+                lines.append(line)
+        except EOFError:
+            pass
+        call_human_agent("note", content="\n".join(lines))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+
+        async def note(content: str) -> None:
+            transcript().info(content)
+
+        return note
inspect_ai/solver/_human_agent/commands/score.py
ADDED
@@ -0,0 +1,80 @@
+from argparse import Namespace
+from copy import deepcopy
+from textwrap import dedent
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+
+from inspect_ai._util.ansi import render_text
+from inspect_ai.model._model_output import ModelOutput
+from inspect_ai.scorer._score import score
+
+from ..._task_state import TaskState
+from ..state import HumanAgentState, IntermediateScoring
+from .command import HumanAgentCommand, call_human_agent
+
+
+class ScoreCommand(HumanAgentCommand):
+    def __init__(self, state: TaskState):
+        self._state = state
+
+    @property
+    def name(self) -> str:
+        return "score"
+
+    @property
+    def description(self) -> str:
+        return "Score the task to check progress."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+
+    @property
+    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
+        return [
+            HumanAgentCommand.CLIArg(
+                name="answer",
+                description="Answer to submit for scoring (optional, not required for all tasks)",
+            )
+        ]
+
+    def cli(self, args: Namespace) -> None:
+        # first validate (print and exit if we get a str back)
+        call_args = vars(args)
+        error = call_human_agent("validate", **call_args)
+        if error:
+            print(error)
+            return
+
+        print(call_human_agent("score", **call_args))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def score_task(answer: str | None) -> str:
+            from inspect_ai.log._transcript import transcript
+
+            # make a copy of TaskState, add the answer, then score
+            if answer:
+                task_state = deepcopy(self._state)
+                task_state.output = ModelOutput.from_content("human_agent", answer)
+                result = await score(task_state)
+            else:
+                result = await score(self._state)
+
+            # record the scoring action in our state
+            state.scorings.append(IntermediateScoring(time=state.time, scores=result))
+
+            # record to transcript
+            transcript().info(
+                dedent(f"""
+                ### Intermediate Score
+                **Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
+                """)
+            )
+
+            # notify user
+            return render_text(
+                f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
+            )
+
+        return score_task
inspect_ai/solver/_human_agent/commands/status.py
ADDED
@@ -0,0 +1,62 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+from rich.console import RenderableType
+from rich.table import Table
+from rich.text import Text
+
+from inspect_ai._util.ansi import render_text
+from inspect_ai._util.format import format_progress_time
+
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+
+
+class StatusCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "status"
+
+    @property
+    def description(self) -> str:
+        return "Print task status (clock, scoring, etc.)"
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("status"))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def status() -> str:
+            return render_status(state)
+
+        return status
+
+
+def render_status(state: HumanAgentState) -> str:
+    content: list[RenderableType] = [""]
+    content.append(
+        f"[bold]Status:[/bold] {'Running' if state.running else 'Stopped'} "
+        + f"[bold]Time:[/bold] {format_progress_time(state.time, pad_hours=False)}"
+    )
+
+    if len(state.scorings) > 0:
+        content.append("")
+        content.append(Text.from_markup("[italic]Intermediate Scores[/italic]"))
+        scores_table = Table(box=None, min_width=35, padding=(0, 0))
+        scores_table.add_column("Answer", justify="left")
+        scores_table.add_column("Score", justify="center")
+        scores_table.add_column("Time", justify="right")
+
+        for score in state.scorings:
+            scores_table.add_row(
+                score.scores[0].answer,
+                score.scores[0].as_str(),
+                format_progress_time(score.time),
+            )
+        content.append(scores_table)
+
+    return render_text(content, highlight=False)
inspect_ai/solver/_human_agent/commands/submit.py
ADDED
@@ -0,0 +1,151 @@
+from argparse import Namespace
+from logging import getLogger
+from pathlib import PurePosixPath
+from re import Pattern, compile, match
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+
+from inspect_ai._util.ansi import render_text
+from inspect_ai.util._sandbox import sandbox
+
+from ..install import RECORD_SESSION_DIR
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+
+logger = getLogger(__name__)
+
+
+class SubmitCommand(HumanAgentCommand):
+    def __init__(self, record_session: bool):
+        super().__init__()
+        self._record_session = record_session
+
+    @property
+    def name(self) -> str:
+        return "submit"
+
+    @property
+    def description(self) -> str:
+        return "Submit your final answer for the task."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+
+    @property
+    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
+        return [
+            HumanAgentCommand.CLIArg(
+                name="answer",
+                description="Answer to submit for scoring (optional, not required for all tasks)",
+            )
+        ]
+
+    def cli(self, args: Namespace) -> None:
+        # read cli args
+        call_args = vars(args)
+
+        # first validate (print and exit if we get a str back)
+        error = call_human_agent("validate", **call_args)
+        if error:
+            print(error)
+            return
+
+        # verify that the user wants to proceed
+        answer = call_args.get("answer", None)
+        answer_text = f" '{answer}'" if answer else ""
+        while True:
+            response = (
+                input(
+                    f"\nDo you definitely want to end the task and submit{answer_text}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
+                )
+                .lower()
+                .strip()
+            )
+            if response in ["yes", "y"]:
+                break
+            elif response in ["no", "n"]:
+                return
+            else:
+                print("Please enter yes or no.")
+
+        # thank the user!
+        print(
+            "\nThank you for working on this task!\n\n"
+            + "Your task will now be scored and you will be disconnected from this container.\n"
+        )
+
+        # submit the task
+        call_human_agent("submit", **call_args)
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def submit(
+            answer: str | None, session_logs: dict[str, str] | None = None
+        ) -> None:
+            if self._record_session:
+                state.logs = await self._read_session_logs()
+            state.running = False
+            state.answer = answer
+
+        return submit
+
+    async def _read_session_logs(self) -> dict[str, str]:
+        # retrieve session logs (don't fail)
+        sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
+        result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
+        if not result.success:
+            logger.warning(f"Error listing human agent session logs: {result.stderr}")
+            return {}
+
+        # read logs
+        session_logs: dict[str, str] = {}
+        for session_log in result.stdout.strip().splitlines():
+            try:
+                session_logs[session_log] = await sandbox().read_file(
+                    (sessions_dir / session_log).as_posix()
+                )
+            except Exception as ex:
+                logger.warning(f"Error reading human agent session log: {ex}")
+
+        return session_logs
+
+
+class ValidateCommand(HumanAgentCommand):
+    def __init__(self, answer: bool | str) -> None:
+        self._answer = compile(answer) if isinstance(answer, str) else answer
+
+    @property
+    def name(self) -> str:
+        return "validate"
+
+    @property
+    def description(self) -> str:
+        return "Validate a task submission."
+
+    @property
+    def contexts(self) -> list[Literal["cli", "service"]]:
+        return ["service"]
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def validate(answer: str | None) -> str | None:
+            def failed(reason: str) -> str:
+                return render_text(f"[bold]FAILED:[/bold] {reason}")
+
+            if not state.running:
+                return failed("Task is stopped (use 'task start' to start)")
+            if self._answer:
+                answer = answer.strip() if isinstance(answer, str) else answer
+                if not answer:
+                    return failed(
+                        "An explicit answer is required for scoring this task."
+                    )
+                elif isinstance(self._answer, Pattern) and not match(
+                    self._answer, answer
+                ):
+                    return failed(
+                        "Your answer was not in the required format (please review the task instructions)"
+                    )
+            return None  # made it through verification
+
+        return validate