inspect-ai 0.3.55__py3-none-any.whl → 0.3.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. inspect_ai/__init__.py +1 -0
  2. inspect_ai/_cli/common.py +1 -1
  3. inspect_ai/_cli/trace.py +33 -20
  4. inspect_ai/_display/core/active.py +1 -1
  5. inspect_ai/_display/core/display.py +1 -1
  6. inspect_ai/_display/core/footer.py +1 -1
  7. inspect_ai/_display/core/progress.py +0 -6
  8. inspect_ai/_display/core/rich.py +1 -1
  9. inspect_ai/_display/rich/display.py +2 -2
  10. inspect_ai/_display/textual/app.py +15 -17
  11. inspect_ai/_display/textual/widgets/clock.py +3 -3
  12. inspect_ai/_display/textual/widgets/samples.py +6 -13
  13. inspect_ai/_eval/context.py +9 -1
  14. inspect_ai/_eval/score.py +4 -10
  15. inspect_ai/_eval/task/results.py +5 -4
  16. inspect_ai/_eval/task/run.py +6 -12
  17. inspect_ai/_eval/task/task.py +10 -0
  18. inspect_ai/_util/ansi.py +31 -0
  19. inspect_ai/_util/format.py +7 -0
  20. inspect_ai/_util/logger.py +12 -12
  21. inspect_ai/_util/throttle.py +10 -1
  22. inspect_ai/_util/trace.py +43 -47
  23. inspect_ai/_util/transcript.py +4 -0
  24. inspect_ai/_util/vscode.py +51 -0
  25. inspect_ai/_view/notify.py +2 -1
  26. inspect_ai/_view/www/App.css +22 -1
  27. inspect_ai/_view/www/dist/assets/index.css +2374 -2
  28. inspect_ai/_view/www/dist/assets/index.js +29622 -24424
  29. inspect_ai/_view/www/log-schema.json +138 -90
  30. inspect_ai/_view/www/package.json +1 -0
  31. inspect_ai/_view/www/src/App.mjs +1 -0
  32. inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
  33. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
  34. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
  35. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
  36. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
  37. inspect_ai/_view/www/src/components/Tools.mjs +11 -3
  38. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
  39. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
  40. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
  41. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
  42. inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
  43. inspect_ai/_view/www/src/types/log.d.ts +26 -12
  44. inspect_ai/_view/www/yarn.lock +44 -0
  45. inspect_ai/approval/_apply.py +4 -0
  46. inspect_ai/approval/_human/panel.py +5 -8
  47. inspect_ai/dataset/_dataset.py +51 -10
  48. inspect_ai/dataset/_util.py +31 -3
  49. inspect_ai/log/__init__.py +2 -0
  50. inspect_ai/log/_log.py +5 -2
  51. inspect_ai/model/_call_tools.py +4 -2
  52. inspect_ai/model/_chat_message.py +3 -0
  53. inspect_ai/model/_model.py +42 -1
  54. inspect_ai/model/_providers/anthropic.py +4 -0
  55. inspect_ai/model/_render.py +9 -2
  56. inspect_ai/scorer/_metric.py +12 -1
  57. inspect_ai/solver/__init__.py +2 -0
  58. inspect_ai/solver/_human_agent/agent.py +83 -0
  59. inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
  60. inspect_ai/solver/_human_agent/commands/clock.py +70 -0
  61. inspect_ai/solver/_human_agent/commands/command.py +59 -0
  62. inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
  63. inspect_ai/solver/_human_agent/commands/note.py +42 -0
  64. inspect_ai/solver/_human_agent/commands/score.py +80 -0
  65. inspect_ai/solver/_human_agent/commands/status.py +62 -0
  66. inspect_ai/solver/_human_agent/commands/submit.py +151 -0
  67. inspect_ai/solver/_human_agent/install.py +222 -0
  68. inspect_ai/solver/_human_agent/panel.py +252 -0
  69. inspect_ai/solver/_human_agent/service.py +45 -0
  70. inspect_ai/solver/_human_agent/state.py +55 -0
  71. inspect_ai/solver/_human_agent/view.py +24 -0
  72. inspect_ai/solver/_task_state.py +28 -2
  73. inspect_ai/tool/_tool.py +10 -2
  74. inspect_ai/tool/_tools/_web_browser/_web_browser.py +13 -10
  75. inspect_ai/util/__init__.py +8 -4
  76. inspect_ai/{_util/display.py → util/_display.py} +6 -0
  77. inspect_ai/util/_panel.py +31 -9
  78. inspect_ai/util/_sandbox/__init__.py +0 -3
  79. inspect_ai/util/_sandbox/context.py +5 -1
  80. inspect_ai/util/_sandbox/docker/compose.py +16 -10
  81. inspect_ai/util/_sandbox/docker/docker.py +9 -6
  82. inspect_ai/util/_sandbox/docker/internal.py +1 -1
  83. inspect_ai/util/_sandbox/docker/util.py +2 -2
  84. inspect_ai/util/_sandbox/environment.py +6 -5
  85. inspect_ai/util/_sandbox/local.py +1 -1
  86. inspect_ai/util/_sandbox/service.py +22 -7
  87. inspect_ai/util/_store.py +5 -6
  88. inspect_ai/util/_store_model.py +110 -0
  89. inspect_ai/util/_throttle.py +32 -0
  90. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/METADATA +1 -1
  91. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/RECORD +95 -73
  92. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/LICENSE +0 -0
  93. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/WHEEL +0 -0
  94. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/entry_points.txt +0 -0
  95. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.56.dist-info}/top_level.txt +0 -0
inspect_ai/solver/__init__.py
@@ -4,6 +4,7 @@ from ._basic_agent import basic_agent
 from ._chain import chain
 from ._critique import self_critique
 from ._fork import fork
+from ._human_agent.agent import human_agent
 from ._multiple_choice import MultipleChoiceTemplate, multiple_choice
 from ._plan import Plan, plan
 from ._prompt import (
@@ -17,6 +18,7 @@ from ._use_tools import use_tools
 
 __all__ = [
     "basic_agent",
+    "human_agent",
     "chain",
     "fork",
     "generate",
inspect_ai/solver/_human_agent/agent.py (new file)
@@ -0,0 +1,83 @@
+import asyncio
+
+from inspect_ai.util import display_type, input_panel, sandbox
+
+from .._solver import Generate, Solver, solver
+from .._task_state import TaskState
+from .commands import human_agent_commands
+from .install import install_human_agent
+from .panel import HumanAgentPanel
+from .service import run_human_agent_service
+from .view import ConsoleView, HumanAgentView
+
+
+@solver
+def human_agent(
+    answer: bool | str = True,
+    intermediate_scoring: bool = False,
+    record_session: bool = True,
+) -> Solver:
+    """Human solver for agentic tasks that run in a Linux environment.
+
+    The human agent solver installs agent task tools in the default
+    sandbox and presents the user with both task instructions and
+    documentation for the various tools (e.g. `task submit`,
+    `task start`, `task stop`, `task instructions`, etc.). A human agent panel
+    is displayed with instructions for logging in to the sandbox.
+
+    If the user is running in VS Code with the Inspect extension,
+    they will also be presented with links to log in to the sandbox
+    using a VS Code Window or Terminal.
+
+    Args:
+        answer (bool | str): Is an explicit answer required for this
+            task or is it scored based on files in the container? Pass a
+            `str` with a regex to validate that the answer matches
+            the expected format.
+        intermediate_scoring (bool): Allow the human agent to
+            check their score while working.
+        record_session (bool): Record all user commands and outputs in
+            the sandbox bash session.
+
+    Returns:
+        Solver: Human agent solver.
+    """
+    # we can only run one human agent interaction at a time (use lock to enforce)
+    agent_lock = asyncio.Lock()
+
+    async def solve(state: TaskState, generate: Generate) -> TaskState:
+        async with agent_lock:
+            # ensure that we have a sandbox to work with
+            try:
+                connection = await sandbox().connection()
+            except ProcessLookupError:
+                raise RuntimeError("Human agent must run in a task with a sandbox.")
+            except NotImplementedError:
+                raise RuntimeError(
+                    "Human agent must run with a sandbox that supports connections."
+                )
+
+            # helper function to run the agent (called for fullscreen vs. fallback below)
+            async def run_human_agent(view: HumanAgentView) -> TaskState:
+                # create agent commands
+                commands = human_agent_commands(
+                    state, answer, intermediate_scoring, record_session
+                )
+
+                # install agent tools
+                await install_human_agent(state, commands, record_session)
+
+                # hook up the view ui
+                view.connect(connection)
+
+                # run sandbox service
+                return await run_human_agent_service(state, commands, view)
+
+            # support both fullscreen ui and fallback
+            if display_type() == "full":
+                async with await input_panel(HumanAgentPanel) as panel:
+                    return await run_human_agent(panel)
+            else:
+                return await run_human_agent(ConsoleView())
+
+    return solve
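
For orientation, here is a minimal sketch of how the new `human_agent()` solver might be wired into an eval. This is illustrative only (not part of the diff): it assumes the standard `Task`/`Sample`/`includes()` APIs and a Docker sandbox, and the task name, dataset sample, and answer regex are made up.

```python
# Illustrative only: assumes the standard inspect_ai Task/Sample/scorer APIs
# and a Docker sandbox; the task, dataset, and answer regex are invented.
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import includes
from inspect_ai.solver import human_agent


@task
def intrusion_challenge() -> Task:
    return Task(
        dataset=[Sample(input="Find the flag hidden on the server.", target="flag{demo}")],
        solver=human_agent(
            answer=r"flag\{.+\}",       # require answers matching this regex
            intermediate_scoring=True,  # allow `task score` while working
            record_session=True,        # capture the bash session in the log
        ),
        scorer=includes(),
        sandbox="docker",
    )
```

As the `solve` function above shows, when Inspect's display is set to full the solver opens the `HumanAgentPanel` input panel; otherwise it falls back to the `ConsoleView`.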
inspect_ai/solver/_human_agent/commands/__init__.py (new file)
@@ -0,0 +1,36 @@
+from inspect_ai.solver._task_state import TaskState
+
+from .clock import StartCommand, StopCommand
+from .command import HumanAgentCommand
+from .instructions import InstructionsCommand
+from .note import NoteCommand
+from .score import ScoreCommand
+from .status import StatusCommand
+from .submit import SubmitCommand, ValidateCommand
+
+
+def human_agent_commands(
+    state: TaskState,
+    answer: bool | str,
+    intermediate_scoring: bool,
+    record_session: bool,
+) -> list[HumanAgentCommand]:
+    # base submit and validate
+    commands = [SubmitCommand(record_session), ValidateCommand(answer)]
+
+    # optional intermediate scoring
+    if intermediate_scoring:
+        commands.append(ScoreCommand(state))
+
+    # remaining commands
+    commands.extend(
+        [
+            NoteCommand(),
+            StatusCommand(),
+            StartCommand(),
+            StopCommand(),
+        ]
+    )
+
+    # with instructions (letting it see the other commands)
+    return commands + [InstructionsCommand(commands)]
inspect_ai/solver/_human_agent/commands/clock.py (new file)
@@ -0,0 +1,70 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+
+from inspect_ai._util.format import format_progress_time
+
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+from .status import render_status
+
+
+class StartCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "start"
+
+    @property
+    def description(self) -> str:
+        return "Start the task clock (resume working)."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("start"))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+
+        async def start() -> str:
+            if not state.running:
+                state.running = True
+                transcript().info(
+                    f"Task started (total time: {format_progress_time(state.time)})"
+                )
+            return render_status(state)
+
+        return start
+
+
+class StopCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "stop"
+
+    @property
+    def description(self) -> str:
+        return "Stop the task clock (pause working)."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("stop"))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+
+        async def stop() -> str:
+            if state.running:
+                state.running = False
+                transcript().info(
+                    f"Task stopped (total time: {format_progress_time(state.time)})"
+                )
+            return render_status(state)
+
+        return stop
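
`StartCommand` and `StopCommand` only flip `state.running` and report `state.time`; the time accounting itself lives in `HumanAgentState` (state.py, not shown in this hunk). The following is purely an illustrative model of the accumulate-while-running behaviour these commands assume, not the package's implementation:

```python
# Hypothetical model of the clock semantics the start/stop commands rely on;
# the real HumanAgentState is defined in the (unshown) state.py.
import time


class ClockState:
    def __init__(self) -> None:
        self._accumulated = 0.0            # seconds accrued while running
        self._started: float | None = None

    @property
    def running(self) -> bool:
        return self._started is not None

    @running.setter
    def running(self, value: bool) -> None:
        if value and self._started is None:
            self._started = time.monotonic()  # start the clock
        elif not value and self._started is not None:
            self._accumulated += time.monotonic() - self._started
            self._started = None              # pause the clock

    @property
    def time(self) -> float:
        elapsed = time.monotonic() - self._started if self._started else 0.0
        return self._accumulated + elapsed
```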
inspect_ai/solver/_human_agent/commands/command.py (new file)
@@ -0,0 +1,59 @@
+import abc
+from argparse import Namespace
+from typing import Any, Awaitable, Callable, Literal, NamedTuple
+
+from pydantic import JsonValue
+
+from ..state import HumanAgentState
+
+
+class HumanAgentCommand:
+    @property
+    @abc.abstractmethod
+    def name(self) -> str:
+        """Command name (e.g. 'submit')"""
+        ...
+
+    @property
+    @abc.abstractmethod
+    def description(self) -> str:
+        """Command description."""
+        ...
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+
+    @property
+    def contexts(self) -> list[Literal["cli", "service"]]:
+        """Contexts where this command runs (defaults to both cli and service)."""
+        return ["cli", "service"]
+
+    class CLIArg(NamedTuple):
+        name: str
+        description: str
+        required: bool = False
+
+    @property
+    def cli_args(self) -> list[CLIArg]:
+        """Positional command line arguments."""
+        return []
+
+    def cli(self, args: Namespace) -> None:
+        """CLI command (runs in container). Required for context "cli"."""
+        pass
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        """Service handler (runs in solver). Required for context "service"."""
+
+        async def no_handler() -> None:
+            pass
+
+        return no_handler
+
+
+# Dummy functions for implementation of call methods
+
+
+def call_human_agent(method: str, **params: Any) -> Any:
+    return None
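
The base class above defines the full command protocol: `name`/`description`/`group` drive the instructions listing, `cli_args`/`cli` back the in-container `task <name>` entry point, and `service` returns the handler that runs in the solver (reached from the CLI via `call_human_agent`). A hypothetical extra command, shown only to illustrate the protocol and not part of this release, might look like:

```python
# Hypothetical command illustrating the HumanAgentCommand protocol;
# it is not part of the 0.3.56 release.
from argparse import Namespace
from typing import Awaitable, Callable

from pydantic import JsonValue

from ..state import HumanAgentState
from .command import HumanAgentCommand, call_human_agent


class EchoCommand(HumanAgentCommand):
    @property
    def name(self) -> str:
        return "echo"

    @property
    def description(self) -> str:
        return "Echo a message back from the solver-side service."

    @property
    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
        return [
            HumanAgentCommand.CLIArg(
                name="message",
                description="Message to echo back",
                required=True,
            )
        ]

    def cli(self, args: Namespace) -> None:
        # runs inside the container as `task echo <message>`
        print(call_human_agent("echo", **vars(args)))

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        # runs in the solver process; the return value is sent back to the CLI
        async def echo(message: str) -> str:
            return f"echo: {message}"

        return echo
```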
inspect_ai/solver/_human_agent/commands/instructions.py (new file)
@@ -0,0 +1,74 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+from rich.console import Group
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+
+from inspect_ai._util.ansi import render_text
+from inspect_ai._util.transcript import DOUBLE_LINE
+
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+
+
+class InstructionsCommand(HumanAgentCommand):
+    def __init__(self, commands: list[HumanAgentCommand]) -> None:
+        self._commands = commands.copy() + [self]
+
+    @property
+    def name(self) -> str:
+        return "instructions"
+
+    @property
+    def description(self) -> str:
+        return "Display task commands and instructions."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 3
+
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("instructions", **vars(args)))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def instructions() -> str:
+            intro = "\nYou will be completing a task based on the instructions presented below. You can use the following commands as you work on the task:\n"
+            commands_table = Table(box=None, show_header=False)
+            commands_table.add_column("", justify="left")
+            commands_table.add_column("", justify="left")
+
+            def add_command_group(group: int) -> None:
+                for command in filter(
+                    lambda c: "cli" in c.contexts and c.group == group, self._commands
+                ):
+                    commands_table.add_row(f"task {command.name}", command.description)
+                if group != 3:
+                    commands_table.add_row("", "")
+
+            for i in range(1, 4):
+                add_command_group(i)
+
+            header_panel = Panel(
+                Group(intro, commands_table),
+                title=Text.from_markup("[bold]Human Agent Task[/bold]"),
+                box=DOUBLE_LINE,
+                padding=(0, 0),
+            )
+
+            instructions_panel = Panel(
+                f"{state.instructions.strip()}",
+                title="Task Instructions",
+                padding=(1, 1),
+            )
+
+            return render_text(
+                ["", header_panel, instructions_panel],
+                styles=False,
+                no_color=True,
+                width=90,
+            )
+
+        return instructions
inspect_ai/solver/_human_agent/commands/note.py (new file)
@@ -0,0 +1,42 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+
+
+class NoteCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "note"
+
+    @property
+    def description(self) -> str:
+        return "Record a note in the task transcript."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+
+    def cli(self, args: Namespace) -> None:
+        print(
+            "Enter a multiline markdown note (Press Ctrl+D on a new line to finish):\n"
+        )
+        lines = ["## Human Agent Note"]
+        try:
+            while True:
+                line = input()
+                lines.append(line)
+        except EOFError:
+            pass
+        call_human_agent("note", content="\n".join(lines))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        from inspect_ai.log._transcript import transcript
+
+        async def note(content: str) -> None:
+            transcript().info(content)
+
+        return note
inspect_ai/solver/_human_agent/commands/score.py (new file)
@@ -0,0 +1,80 @@
+from argparse import Namespace
+from copy import deepcopy
+from textwrap import dedent
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+
+from inspect_ai._util.ansi import render_text
+from inspect_ai.model._model_output import ModelOutput
+from inspect_ai.scorer._score import score
+
+from ..._task_state import TaskState
+from ..state import HumanAgentState, IntermediateScoring
+from .command import HumanAgentCommand, call_human_agent
+
+
+class ScoreCommand(HumanAgentCommand):
+    def __init__(self, state: TaskState):
+        self._state = state
+
+    @property
+    def name(self) -> str:
+        return "score"
+
+    @property
+    def description(self) -> str:
+        return "Score the task to check progress."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+
+    @property
+    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
+        return [
+            HumanAgentCommand.CLIArg(
+                name="answer",
+                description="Answer to submit for scoring (optional, not required for all tasks)",
+            )
+        ]
+
+    def cli(self, args: Namespace) -> None:
+        # first validate (print and exit if we get a str back)
+        call_args = vars(args)
+        error = call_human_agent("validate", **call_args)
+        if error:
+            print(error)
+            return
+
+        print(call_human_agent("score", **call_args))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def score_task(answer: str | None) -> str:
+            from inspect_ai.log._transcript import transcript
+
+            # make a copy of TaskState, add the answer, then score
+            if answer:
+                task_state = deepcopy(self._state)
+                task_state.output = ModelOutput.from_content("human_agent", answer)
+                result = await score(task_state)
+            else:
+                result = await score(self._state)
+
+            # record the scoring action in our state
+            state.scorings.append(IntermediateScoring(time=state.time, scores=result))
+
+            # record to transcript
+            transcript().info(
+                dedent(f"""
+                ### Intermediate Score
+                **Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
+                """)
+            )
+
+            # notify user
+            return render_text(
+                f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
+            )
+
+        return score_task
inspect_ai/solver/_human_agent/commands/status.py (new file)
@@ -0,0 +1,62 @@
+from argparse import Namespace
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+from rich.console import RenderableType
+from rich.table import Table
+from rich.text import Text
+
+from inspect_ai._util.ansi import render_text
+from inspect_ai._util.format import format_progress_time
+
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+
+
+class StatusCommand(HumanAgentCommand):
+    @property
+    def name(self) -> str:
+        return "status"
+
+    @property
+    def description(self) -> str:
+        return "Print task status (clock, scoring, etc.)"
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 2
+
+    def cli(self, args: Namespace) -> None:
+        print(call_human_agent("status"))
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def status() -> str:
+            return render_status(state)
+
+        return status
+
+
+def render_status(state: HumanAgentState) -> str:
+    content: list[RenderableType] = [""]
+    content.append(
+        f"[bold]Status:[/bold] {'Running' if state.running else 'Stopped'} "
+        f"[bold]Time:[/bold] {format_progress_time(state.time, pad_hours=False)}"
+    )
+
+    if len(state.scorings) > 0:
+        content.append("")
+        content.append(Text.from_markup("[italic]Intermediate Scores[/italic]"))
+        scores_table = Table(box=None, min_width=35, padding=(0, 0))
+        scores_table.add_column("Answer", justify="left")
+        scores_table.add_column("Score", justify="center")
+        scores_table.add_column("Time", justify="right")
+
+        for score in state.scorings:
+            scores_table.add_row(
+                score.scores[0].answer,
+                score.scores[0].as_str(),
+                format_progress_time(score.time),
+            )
+        content.append(scores_table)
+
+    return render_text(content, highlight=False)
inspect_ai/solver/_human_agent/commands/submit.py (new file)
@@ -0,0 +1,151 @@
+from argparse import Namespace
+from logging import getLogger
+from pathlib import PurePosixPath
+from re import Pattern, compile, match
+from typing import Awaitable, Callable, Literal
+
+from pydantic import JsonValue
+
+from inspect_ai._util.ansi import render_text
+from inspect_ai.util._sandbox import sandbox
+
+from ..install import RECORD_SESSION_DIR
+from ..state import HumanAgentState
+from .command import HumanAgentCommand, call_human_agent
+
+logger = getLogger(__name__)
+
+
+class SubmitCommand(HumanAgentCommand):
+    def __init__(self, record_session: bool):
+        super().__init__()
+        self._record_session = record_session
+
+    @property
+    def name(self) -> str:
+        return "submit"
+
+    @property
+    def description(self) -> str:
+        return "Submit your final answer for the task."
+
+    @property
+    def group(self) -> Literal[1, 2, 3]:
+        return 1
+
+    @property
+    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
+        return [
+            HumanAgentCommand.CLIArg(
+                name="answer",
+                description="Answer to submit for scoring (optional, not required for all tasks)",
+            )
+        ]
+
+    def cli(self, args: Namespace) -> None:
+        # read cli args
+        call_args = vars(args)
+
+        # first validate (print and exit if we get a str back)
+        error = call_human_agent("validate", **call_args)
+        if error:
+            print(error)
+            return
+
+        # verify that the user wants to proceed
+        answer = call_args.get("answer", None)
+        answer_text = f" '{answer}'" if answer else ""
+        while True:
+            response = (
+                input(
+                    f"\nDo you definitely want to end the task and submit{answer_text}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
+                )
+                .lower()
+                .strip()
+            )
+            if response in ["yes", "y"]:
+                break
+            elif response in ["no", "n"]:
+                return
+            else:
+                print("Please enter yes or no.")
+
+        # thank the user!
+        print(
+            "\nThank you for working on this task!\n\n"
+            + "Your task will now be scored and you will be disconnected from this container.\n"
+        )
+
+        # submit the task
+        call_human_agent("submit", **call_args)
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def submit(
+            answer: str | None, session_logs: dict[str, str] | None = None
+        ) -> None:
+            if self._record_session:
+                state.logs = await self._read_session_logs()
+            state.running = False
+            state.answer = answer
+
+        return submit
+
+    async def _read_session_logs(self) -> dict[str, str]:
+        # retrieve session logs (don't fail)
+        sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
+        result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
+        if not result.success:
+            logger.warning(f"Error listing human agent session logs: {result.stderr}")
+            return {}
+
+        # read logs
+        session_logs: dict[str, str] = {}
+        for session_log in result.stdout.strip().splitlines():
+            try:
+                session_logs[session_log] = await sandbox().read_file(
+                    (sessions_dir / session_log).as_posix()
+                )
+            except Exception as ex:
+                logger.warning(f"Error reading human agent session log: {ex}")
+
+        return session_logs
+
+
+class ValidateCommand(HumanAgentCommand):
+    def __init__(self, answer: bool | str) -> None:
+        self._answer = compile(answer) if isinstance(answer, str) else answer
+
+    @property
+    def name(self) -> str:
+        return "validate"
+
+    @property
+    def description(self) -> str:
+        return "Validate a task submission."
+
+    @property
+    def contexts(self) -> list[Literal["cli", "service"]]:
+        return ["service"]
+
+    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
+        async def validate(answer: str | None) -> str | None:
+            def failed(reason: str) -> str:
+                return render_text(f"[bold]FAILED:[/bold] {reason}")
+
+            if not state.running:
+                return failed("Task is stopped (use 'task start' to start)")
+            if self._answer:
+                answer = answer.strip() if isinstance(answer, str) else answer
+                if not answer:
+                    return failed(
+                        "An explicit answer is required for scoring this task."
+                    )
+                elif isinstance(self._answer, Pattern) and not match(
+                    self._answer, answer
+                ):
+                    return failed(
+                        "Your answer was not in the required format (please review the task instructions)"
+                    )
+            return None  # made it through verification
+
+        return validate
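
Note that when `human_agent(answer=...)` is given a string, `ValidateCommand` compiles it with `re.compile` and checks submissions with `re.match`, so the pattern is anchored at the start of the (stripped) answer but not at the end. A quick illustration with an assumed pattern:

```python
# Assumed example pattern; demonstrates the match-at-start semantics used above.
from re import compile, match

answer_pattern = compile(r"flag\{[A-Za-z0-9_]+\}")

print(bool(match(answer_pattern, "flag{correct_horse}")))           # True
print(bool(match(answer_pattern, "the flag is flag{nope}")))        # False (must match from the start)
print(bool(match(answer_pattern, "flag{ok} plus trailing text")))   # True (no end anchor)
```

Validation also fails while the task clock is stopped, so `task start` must be run before `task submit` or `task score`.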