inspect-ai 0.3.55__py3-none-any.whl → 0.3.57__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. inspect_ai/__init__.py +1 -0
  2. inspect_ai/_cli/common.py +1 -1
  3. inspect_ai/_cli/trace.py +33 -20
  4. inspect_ai/_display/core/active.py +1 -1
  5. inspect_ai/_display/core/display.py +1 -1
  6. inspect_ai/_display/core/footer.py +1 -1
  7. inspect_ai/_display/core/panel.py +1 -1
  8. inspect_ai/_display/core/progress.py +0 -6
  9. inspect_ai/_display/core/rich.py +1 -1
  10. inspect_ai/_display/rich/display.py +2 -2
  11. inspect_ai/_display/textual/app.py +15 -17
  12. inspect_ai/_display/textual/widgets/clock.py +3 -3
  13. inspect_ai/_display/textual/widgets/samples.py +6 -13
  14. inspect_ai/_eval/context.py +9 -1
  15. inspect_ai/_eval/run.py +16 -11
  16. inspect_ai/_eval/score.py +4 -10
  17. inspect_ai/_eval/task/results.py +5 -4
  18. inspect_ai/_eval/task/run.py +6 -12
  19. inspect_ai/_eval/task/task.py +10 -0
  20. inspect_ai/_util/ansi.py +31 -0
  21. inspect_ai/_util/datetime.py +1 -1
  22. inspect_ai/_util/deprecation.py +1 -1
  23. inspect_ai/_util/format.py +7 -0
  24. inspect_ai/_util/json.py +11 -1
  25. inspect_ai/_util/logger.py +14 -13
  26. inspect_ai/_util/throttle.py +10 -1
  27. inspect_ai/_util/trace.py +79 -47
  28. inspect_ai/_util/transcript.py +37 -4
  29. inspect_ai/_util/vscode.py +51 -0
  30. inspect_ai/_view/notify.py +2 -1
  31. inspect_ai/_view/www/.prettierrc.js +12 -0
  32. inspect_ai/_view/www/App.css +22 -1
  33. inspect_ai/_view/www/dist/assets/index.css +2374 -2
  34. inspect_ai/_view/www/dist/assets/index.js +29752 -24492
  35. inspect_ai/_view/www/log-schema.json +262 -215
  36. inspect_ai/_view/www/package.json +1 -0
  37. inspect_ai/_view/www/src/App.mjs +19 -9
  38. inspect_ai/_view/www/src/Types.mjs +0 -1
  39. inspect_ai/_view/www/src/api/Types.mjs +15 -4
  40. inspect_ai/_view/www/src/api/api-http.mjs +2 -0
  41. inspect_ai/_view/www/src/appearance/Icons.mjs +2 -0
  42. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +74 -0
  43. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -1
  44. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +2 -2
  45. inspect_ai/_view/www/src/components/FindBand.mjs +5 -4
  46. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +168 -0
  47. inspect_ai/_view/www/src/components/LargeModal.mjs +1 -1
  48. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +217 -0
  49. inspect_ai/_view/www/src/components/MessageContent.mjs +1 -1
  50. inspect_ai/_view/www/src/components/TabSet.mjs +1 -1
  51. inspect_ai/_view/www/src/components/Tools.mjs +28 -5
  52. inspect_ai/_view/www/src/components/VirtualList.mjs +15 -17
  53. inspect_ai/_view/www/src/log/remoteLogFile.mjs +2 -1
  54. inspect_ai/_view/www/src/navbar/Navbar.mjs +44 -32
  55. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +1 -2
  56. inspect_ai/_view/www/src/samples/SampleList.mjs +35 -4
  57. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +13 -2
  58. inspect_ai/_view/www/src/samples/SampleScores.mjs +11 -2
  59. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +238 -178
  60. inspect_ai/_view/www/src/samples/SamplesTab.mjs +4 -2
  61. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +5 -5
  62. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +7 -0
  63. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +3 -3
  64. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +3 -2
  65. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +1 -1
  66. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +1 -0
  67. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +56 -0
  68. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +17 -5
  69. inspect_ai/_view/www/src/types/asciicinema-player.d.ts +26 -0
  70. inspect_ai/_view/www/src/types/log.d.ts +28 -20
  71. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +1 -1
  72. inspect_ai/_view/www/yarn.lock +44 -0
  73. inspect_ai/approval/_apply.py +4 -0
  74. inspect_ai/approval/_human/panel.py +5 -8
  75. inspect_ai/dataset/_dataset.py +51 -10
  76. inspect_ai/dataset/_util.py +31 -3
  77. inspect_ai/log/__init__.py +2 -0
  78. inspect_ai/log/_log.py +30 -2
  79. inspect_ai/log/_recorders/eval.py +2 -0
  80. inspect_ai/model/_call_tools.py +31 -7
  81. inspect_ai/model/_chat_message.py +3 -0
  82. inspect_ai/model/_model.py +42 -1
  83. inspect_ai/model/_providers/anthropic.py +4 -0
  84. inspect_ai/model/_providers/google.py +24 -6
  85. inspect_ai/model/_providers/openai.py +17 -3
  86. inspect_ai/model/_providers/openai_o1.py +10 -12
  87. inspect_ai/model/_render.py +9 -2
  88. inspect_ai/scorer/_metric.py +12 -1
  89. inspect_ai/solver/__init__.py +2 -0
  90. inspect_ai/solver/_human_agent/agent.py +83 -0
  91. inspect_ai/solver/_human_agent/commands/__init__.py +36 -0
  92. inspect_ai/solver/_human_agent/commands/clock.py +70 -0
  93. inspect_ai/solver/_human_agent/commands/command.py +59 -0
  94. inspect_ai/solver/_human_agent/commands/instructions.py +74 -0
  95. inspect_ai/solver/_human_agent/commands/note.py +42 -0
  96. inspect_ai/solver/_human_agent/commands/score.py +80 -0
  97. inspect_ai/solver/_human_agent/commands/status.py +62 -0
  98. inspect_ai/solver/_human_agent/commands/submit.py +151 -0
  99. inspect_ai/solver/_human_agent/install.py +222 -0
  100. inspect_ai/solver/_human_agent/panel.py +252 -0
  101. inspect_ai/solver/_human_agent/service.py +45 -0
  102. inspect_ai/solver/_human_agent/state.py +55 -0
  103. inspect_ai/solver/_human_agent/view.py +24 -0
  104. inspect_ai/solver/_task_state.py +28 -2
  105. inspect_ai/tool/_tool.py +10 -2
  106. inspect_ai/tool/_tool_info.py +2 -1
  107. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +9 -9
  108. inspect_ai/tool/_tools/_web_browser/_web_browser.py +16 -13
  109. inspect_ai/util/__init__.py +12 -4
  110. inspect_ai/{_util/display.py → util/_display.py} +6 -0
  111. inspect_ai/util/_panel.py +31 -9
  112. inspect_ai/util/_sandbox/__init__.py +0 -3
  113. inspect_ai/util/_sandbox/context.py +5 -1
  114. inspect_ai/util/_sandbox/docker/compose.py +17 -13
  115. inspect_ai/util/_sandbox/docker/docker.py +9 -6
  116. inspect_ai/util/_sandbox/docker/internal.py +1 -1
  117. inspect_ai/util/_sandbox/docker/util.py +3 -2
  118. inspect_ai/util/_sandbox/environment.py +6 -5
  119. inspect_ai/util/_sandbox/local.py +1 -1
  120. inspect_ai/util/_sandbox/self_check.py +18 -18
  121. inspect_ai/util/_sandbox/service.py +22 -7
  122. inspect_ai/util/_store.py +7 -8
  123. inspect_ai/util/_store_model.py +110 -0
  124. inspect_ai/util/_subprocess.py +3 -3
  125. inspect_ai/util/_throttle.py +32 -0
  126. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/METADATA +3 -3
  127. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/RECORD +131 -108
  128. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/WHEEL +1 -1
  129. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/LICENSE +0 -0
  130. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/entry_points.txt +0 -0
  131. {inspect_ai-0.3.55.dist-info → inspect_ai-0.3.57.dist-info}/top_level.txt +0 -0
inspect_ai/solver/_human_agent/commands/clock.py
@@ -0,0 +1,70 @@
1
+ from argparse import Namespace
2
+ from typing import Awaitable, Callable, Literal
3
+
4
+ from pydantic import JsonValue
5
+
6
+ from inspect_ai._util.format import format_progress_time
7
+
8
+ from ..state import HumanAgentState
9
+ from .command import HumanAgentCommand, call_human_agent
10
+ from .status import render_status
11
+
12
+
13
class StartCommand(HumanAgentCommand):
    """Human agent command that starts (resumes) the task clock."""

    @property
    def name(self) -> str:
        """Name of the command."""
        return "start"

    @property
    def description(self) -> str:
        """Short help text for the command."""
        return "Start the task clock (resume working)."

    @property
    def group(self) -> Literal[1, 2, 3]:
        """Group number this command is listed under."""
        return 2

    def cli(self, args: Namespace) -> None:
        """Call the "start" service method and print whatever it returns."""
        reply = call_human_agent("start")
        print(reply)

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async handler for "start", bound to ``state``."""
        from inspect_ai.log._transcript import transcript

        async def start() -> str:
            # Already running: nothing to change, just report status.
            if state.running:
                return render_status(state)

            # Flip the flag first, then read the time for the transcript entry.
            state.running = True
            elapsed = format_progress_time(state.time)
            transcript().info(f"Task started (total time: {elapsed})")
            return render_status(state)

        return start
41
+
42
+
43
class StopCommand(HumanAgentCommand):
    """Human agent command that stops (pauses) the task clock."""

    @property
    def name(self) -> str:
        """Name of the command."""
        return "stop"

    @property
    def description(self) -> str:
        """Short help text for the command."""
        return "Stop the task clock (pause working)."

    @property
    def group(self) -> Literal[1, 2, 3]:
        """Group number this command is listed under."""
        return 2

    def cli(self, args: Namespace) -> None:
        """Call the "stop" service method and print whatever it returns."""
        reply = call_human_agent("stop")
        print(reply)

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async handler for "stop", bound to ``state``."""
        from inspect_ai.log._transcript import transcript

        async def stop() -> str:
            # Already stopped: nothing to change, just report status.
            if not state.running:
                return render_status(state)

            # Flip the flag first, then read the time for the transcript entry.
            state.running = False
            elapsed = format_progress_time(state.time)
            transcript().info(f"Task stopped (total time: {elapsed})")
            return render_status(state)

        return stop
inspect_ai/solver/_human_agent/commands/command.py
@@ -0,0 +1,59 @@
1
+ import abc
2
+ from argparse import Namespace
3
+ from typing import Any, Awaitable, Callable, Literal, NamedTuple
4
+
5
+ from pydantic import JsonValue
6
+
7
+ from ..state import HumanAgentState
8
+
9
+
10
class HumanAgentCommand:
    """Base class describing a single human agent command.

    A command exposes a name, a description, a display group, the contexts
    it runs in, optional positional CLI arguments, a ``cli`` entry point and
    a factory for its ``service`` handler.

    Note: ``name`` and ``description`` are marked with ``abc.abstractmethod``,
    but this class does not derive from ``abc.ABC``, so the markers are not
    enforced when instantiating; subclasses are expected to override both.
    """

    class CLIArg(NamedTuple):
        """Description of one positional command line argument."""

        name: str
        description: str
        required: bool = False

    @property
    @abc.abstractmethod
    def name(self) -> str:
        """Command name (e.g. 'submit')"""
        ...

    @property
    @abc.abstractmethod
    def description(self) -> str:
        """Command description."""
        ...

    @property
    def group(self) -> Literal[1, 2, 3]:
        """Group number for the command (defaults to 1)."""
        return 1

    @property
    def contexts(self) -> list[Literal["cli", "service"]]:
        """Contexts where this command runs (defaults to both cli and service)."""
        return ["cli", "service"]

    @property
    def cli_args(self) -> list[CLIArg]:
        """Positional command line arguments (none by default)."""
        return []

    def cli(self, args: Namespace) -> None:
        """CLI command (runs in container). Required for context "cli"."""
        return None

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Service handler (runs in solver). Required for context "service"."""

        # Default handler: accepts no arguments and does nothing.
        async def no_handler() -> None:
            return None

        return no_handler
53
+
54
+
55
+ # Dummy functions for implementation of call methods
56
+
57
+
58
def call_human_agent(method: str, **params: Any) -> Any:
    """Placeholder for the call into the human agent service.

    This dummy implementation ignores ``method`` and ``params`` and always
    returns ``None``.
    """
    placeholder: Any = None
    return placeholder
inspect_ai/solver/_human_agent/commands/instructions.py
@@ -0,0 +1,74 @@
1
+ from argparse import Namespace
2
+ from typing import Awaitable, Callable, Literal
3
+
4
+ from pydantic import JsonValue
5
+ from rich.console import Group
6
+ from rich.panel import Panel
7
+ from rich.table import Table
8
+ from rich.text import Text
9
+
10
+ from inspect_ai._util.ansi import render_text
11
+ from inspect_ai._util.transcript import DOUBLE_LINE
12
+
13
+ from ..state import HumanAgentState
14
+ from .command import HumanAgentCommand, call_human_agent
15
+
16
+
17
class InstructionsCommand(HumanAgentCommand):
    """Human agent command that renders the command list and task instructions.

    Args:
        commands: Commands to list; a copy is kept and this command is
            appended to it so that it lists itself as well.
    """

    def __init__(self, commands: list[HumanAgentCommand]) -> None:
        self._commands = commands.copy() + [self]

    @property
    def name(self) -> str:
        """Name of the command."""
        return "instructions"

    @property
    def description(self) -> str:
        """Short help text for the command."""
        return "Display task commands and instructions."

    @property
    def group(self) -> Literal[1, 2, 3]:
        """Group number this command is listed under."""
        return 3

    def cli(self, args: Namespace) -> None:
        """Call the "instructions" service method with the parsed args and print the reply."""
        rendered = call_human_agent("instructions", **vars(args))
        print(rendered)

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async handler that renders instructions as plain text."""

        async def instructions() -> str:
            intro = "\nYou will be completing a task based on the instructions presented below. You can use the following commands as you work on the task:\n"

            # Two unlabeled, left-justified columns: invocation and description.
            commands_table = Table(box=None, show_header=False)
            for _ in range(2):
                commands_table.add_column("", justify="left")

            # List cli-visible commands group by group (1, 2, 3), with an
            # empty spacer row after every group except the last.
            for group in (1, 2, 3):
                members = [
                    c
                    for c in self._commands
                    if "cli" in c.contexts and c.group == group
                ]
                for command in members:
                    commands_table.add_row(f"task {command.name}", command.description)
                if group != 3:
                    commands_table.add_row("", "")

            header_panel = Panel(
                Group(intro, commands_table),
                title=Text.from_markup("[bold]Human Agent Task[/bold]"),
                box=DOUBLE_LINE,
                padding=(0, 0),
            )

            instructions_panel = Panel(
                f"{state.instructions.strip()}",
                title="Task Instructions",
                padding=(1, 1),
            )

            renderables = ["", header_panel, instructions_panel]
            return render_text(
                renderables,
                styles=False,
                no_color=True,
                width=90,
            )

        return instructions
inspect_ai/solver/_human_agent/commands/note.py
@@ -0,0 +1,42 @@
1
+ from argparse import Namespace
2
+ from typing import Awaitable, Callable, Literal
3
+
4
+ from pydantic import JsonValue
5
+
6
+ from ..state import HumanAgentState
7
+ from .command import HumanAgentCommand, call_human_agent
8
+
9
+
10
class NoteCommand(HumanAgentCommand):
    """Human agent command that records a free-form note in the transcript."""

    @property
    def name(self) -> str:
        """Name of the command."""
        return "note"

    @property
    def description(self) -> str:
        """Short help text for the command."""
        return "Record a note in the task transcript."

    @property
    def group(self) -> Literal[1, 2, 3]:
        """Group number this command is listed under."""
        return 1

    def cli(self, args: Namespace) -> None:
        """Read lines from stdin until EOF and send them to the "note" service method."""
        print(
            "Enter a multiline markdown note (Press Ctrl+D on a new line to finish):\n"
        )

        # The note always starts with a markdown heading; user lines follow.
        lines = ["## Human Agent Note"]
        while True:
            try:
                lines.append(input())
            except EOFError:
                # End of input (Ctrl+D) terminates the note.
                break

        note_text = "\n".join(lines)
        call_human_agent("note", content=note_text)

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async handler that writes the note to the transcript."""
        from inspect_ai.log._transcript import transcript

        async def note(content: str) -> None:
            transcript().info(content)

        return note
inspect_ai/solver/_human_agent/commands/score.py
@@ -0,0 +1,80 @@
1
+ from argparse import Namespace
2
+ from copy import deepcopy
3
+ from textwrap import dedent
4
+ from typing import Awaitable, Callable, Literal
5
+
6
+ from pydantic import JsonValue
7
+
8
+ from inspect_ai._util.ansi import render_text
9
+ from inspect_ai.model._model_output import ModelOutput
10
+ from inspect_ai.scorer._score import score
11
+
12
+ from ..._task_state import TaskState
13
+ from ..state import HumanAgentState, IntermediateScoring
14
+ from .command import HumanAgentCommand, call_human_agent
15
+
16
+
17
class ScoreCommand(HumanAgentCommand):
    """Human agent command that runs an intermediate scoring of the task.

    Args:
        state: Task state passed to ``score`` (copied first when an explicit
            answer is supplied).
    """

    def __init__(self, state: TaskState):
        self._state = state

    @property
    def name(self) -> str:
        """Name of the command."""
        return "score"

    @property
    def description(self) -> str:
        """Short help text for the command."""
        return "Score the task to check progress."

    @property
    def group(self) -> Literal[1, 2, 3]:
        """Group number this command is listed under."""
        return 1

    @property
    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
        """A single positional ``answer`` argument (not marked required)."""
        answer_arg = HumanAgentCommand.CLIArg(
            name="answer",
            description="Answer to submit for scoring (optional, not required for all tasks)",
        )
        return [answer_arg]

    def cli(self, args: Namespace) -> None:
        """Validate the arguments, then call the "score" service method and print the reply."""
        call_args = vars(args)

        # A truthy reply from "validate" is an error message: show it and stop.
        error = call_human_agent("validate", **call_args)
        if error:
            print(error)
            return

        print(call_human_agent("score", **call_args))

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async handler that scores the task and records the result."""

        async def score_task(answer: str | None) -> str:
            from inspect_ai.log._transcript import transcript

            # With an explicit answer, score a deep copy carrying that answer
            # as its output so the real task state is left untouched.
            target = self._state
            if answer:
                target = deepcopy(self._state)
                target.output = ModelOutput.from_content("human_agent", answer)
            result = await score(target)

            # Remember this scoring (with the time it happened) on the agent state.
            state.scorings.append(IntermediateScoring(time=state.time, scores=result))

            # Write a transcript entry for the first score.
            transcript().info(
                dedent(f"""
                ### Intermediate Score
                **Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
                """)
            )

            # Text shown to the user.
            return render_text(
                f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
            )

        return score_task
inspect_ai/solver/_human_agent/commands/status.py
@@ -0,0 +1,62 @@
1
+ from argparse import Namespace
2
+ from typing import Awaitable, Callable, Literal
3
+
4
+ from pydantic import JsonValue
5
+ from rich.console import RenderableType
6
+ from rich.table import Table
7
+ from rich.text import Text
8
+
9
+ from inspect_ai._util.ansi import render_text
10
+ from inspect_ai._util.format import format_progress_time
11
+
12
+ from ..state import HumanAgentState
13
+ from .command import HumanAgentCommand, call_human_agent
14
+
15
+
16
class StatusCommand(HumanAgentCommand):
    """Human agent command that reports the current task status."""

    @property
    def name(self) -> str:
        """Name of the command."""
        return "status"

    @property
    def description(self) -> str:
        """Short help text for the command."""
        return "Print task status (clock, scoring, etc.)"

    @property
    def group(self) -> Literal[1, 2, 3]:
        """Group number this command is listed under."""
        return 2

    def cli(self, args: Namespace) -> None:
        """Call the "status" service method and print whatever it returns."""
        reply = call_human_agent("status")
        print(reply)

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async handler that renders the status of ``state``."""

        async def status() -> str:
            rendered = render_status(state)
            return rendered

        return status
37
+
38
+
39
def render_status(state: HumanAgentState) -> str:
    """Render the running/stopped flag, elapsed time and any intermediate scores.

    Args:
        state: Human agent state supplying ``running``, ``time`` and ``scorings``.

    Returns:
        The text produced by ``render_text`` for the assembled content.
    """
    run_label = "Running" if state.running else "Stopped"
    elapsed = format_progress_time(state.time, pad_hours=False)

    # Leading empty string, then the status/time line.
    content: list[RenderableType] = [
        "",
        f"[bold]Status:[/bold] {run_label} " + f"[bold]Time:[/bold] {elapsed}",
    ]

    # The scores table is only added once at least one scoring was recorded.
    if state.scorings:
        content.append("")
        content.append(Text.from_markup("[italic]Intermediate Scores[/italic]"))

        scores_table = Table(box=None, min_width=35, padding=(0, 0))
        for heading, alignment in (
            ("Answer", "left"),
            ("Score", "center"),
            ("Time", "right"),
        ):
            scores_table.add_column(heading, justify=alignment)

        # One row per scoring, showing the first score of each.
        for scoring in state.scorings:
            first = scoring.scores[0]
            scores_table.add_row(
                first.answer,
                first.as_str(),
                format_progress_time(scoring.time),
            )
        content.append(scores_table)

    return render_text(content, highlight=False)
inspect_ai/solver/_human_agent/commands/submit.py
@@ -0,0 +1,151 @@
1
+ from argparse import Namespace
2
+ from logging import getLogger
3
+ from pathlib import PurePosixPath
4
+ from re import Pattern, compile, match
5
+ from typing import Awaitable, Callable, Literal
6
+
7
+ from pydantic import JsonValue
8
+
9
+ from inspect_ai._util.ansi import render_text
10
+ from inspect_ai.util._sandbox import sandbox
11
+
12
+ from ..install import RECORD_SESSION_DIR
13
+ from ..state import HumanAgentState
14
+ from .command import HumanAgentCommand, call_human_agent
15
+
16
+ logger = getLogger(__name__)
17
+
18
+
19
class SubmitCommand(HumanAgentCommand):
    """Human agent command that ends the task and submits the final answer.

    Args:
        record_session: When true, session logs are read from the sandbox
            directory ``RECORD_SESSION_DIR`` at submit time and stored on the
            agent state.
    """

    def __init__(self, record_session: bool):
        super().__init__()
        self._record_session = record_session

    @property
    def name(self) -> str:
        """Name of the command."""
        return "submit"

    @property
    def description(self) -> str:
        """Short help text for the command."""
        return "Submit your final answer for the task."

    @property
    def group(self) -> Literal[1, 2, 3]:
        """Group number this command is listed under."""
        return 1

    @property
    def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
        """A single positional ``answer`` argument (not marked required)."""
        return [
            HumanAgentCommand.CLIArg(
                name="answer",
                description="Answer to submit for scoring (optional, not required for all tasks)",
            )
        ]

    def cli(self, args: Namespace) -> None:
        """Validate, ask the user to confirm, then call the "submit" service method.

        Returns without submitting when validation reports an error, when the
        user answers no, or when stdin reaches end-of-file at the prompt.
        """
        # read cli args
        call_args = vars(args)

        # first validate (print and exit if we get a str back)
        error = call_human_agent("validate", **call_args)
        if error:
            print(error)
            return

        # verify that the user wants to proceed
        answer = call_args.get("answer", None)
        answer_text = f" '{answer}'" if answer else ""
        while True:
            try:
                response = (
                    input(
                        f"\nDo you definitely want to end the task and submit{answer_text}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
                    )
                    .lower()
                    .strip()
                )
            except EOFError:
                # stdin closed or Ctrl+D pressed at the prompt: submitting is
                # irreversible, so treat "no response" as declining instead of
                # crashing with a traceback.
                print("\nSubmission cancelled (no response received).")
                return
            if response in ["yes", "y"]:
                break
            elif response in ["no", "n"]:
                return
            else:
                print("Please enter yes or no.")

        # thank the user!
        print(
            "\nThank you for working on this task!\n\n"
            + "Your task will now be scored and you will be disconnected from this container.\n"
        )

        # submit the task
        call_human_agent("submit", **call_args)

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async handler that records the submission on ``state``."""

        async def submit(
            answer: str | None, session_logs: dict[str, str] | None = None
        ) -> None:
            # NOTE: ``session_logs`` is accepted but not used here; logs are
            # read from the sandbox when session recording is enabled.
            if self._record_session:
                state.logs = await self._read_session_logs()
            state.running = False
            state.answer = answer

        return submit

    async def _read_session_logs(self) -> dict[str, str]:
        """Read every file listed in ``RECORD_SESSION_DIR`` from the sandbox.

        Best effort: a failed listing yields an empty dict and a failed read
        skips that file; both are logged as warnings rather than raised.

        Returns:
            Mapping of listed file name to file contents.
        """
        # retrieve session logs (don't fail)
        sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
        result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
        if not result.success:
            logger.warning(f"Error listing human agent session logs: {result.stderr}")
            return {}

        # read logs
        session_logs: dict[str, str] = {}
        for session_log in result.stdout.strip().splitlines():
            try:
                session_logs[session_log] = await sandbox().read_file(
                    (sessions_dir / session_log).as_posix()
                )
            except Exception as ex:
                logger.warning(f"Error reading human agent session log: {ex}")

        return session_logs
112
+
113
+
114
class ValidateCommand(HumanAgentCommand):
    """Service-only command that checks a submission before it is accepted.

    Args:
        answer: ``False`` for no answer check, ``True`` to require a
            non-empty answer, or a regular expression string (compiled here)
            that the answer must additionally match.
    """

    def __init__(self, answer: bool | str) -> None:
        if isinstance(answer, str):
            self._answer = compile(answer)
        else:
            self._answer = answer

    @property
    def name(self) -> str:
        """Name of the command."""
        return "validate"

    @property
    def description(self) -> str:
        """Short help text for the command."""
        return "Validate a task submission."

    @property
    def contexts(self) -> list[Literal["cli", "service"]]:
        """This command runs in the service context only."""
        return ["service"]

    def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
        """Return the async handler; it yields an error message or ``None`` when valid."""

        async def validate(answer: str | None) -> str | None:
            def failed(reason: str) -> str:
                return render_text(f"[bold]FAILED:[/bold] {reason}")

            # Nothing can be validated while the task is stopped.
            if not state.running:
                return failed("Task is stopped (use 'task start' to start)")

            # No answer requirement configured: accept as-is.
            if not self._answer:
                return None

            # Ignore surrounding whitespace when an answer string was given.
            if isinstance(answer, str):
                answer = answer.strip()
            if not answer:
                return failed("An explicit answer is required for scoring this task.")

            # With a compiled pattern, the answer must match it from the start.
            wrong_format = isinstance(self._answer, Pattern) and not match(
                self._answer, answer
            )
            if wrong_format:
                return failed(
                    "Your answer was not in the required format (please review the task instructions)"
                )

            return None  # made it through verification

        return validate