inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. inspect_ai/_cli/eval.py +27 -9
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +9 -3
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/samples.py +4 -10
  9. inspect_ai/_display/textual/widgets/transcript.py +35 -18
  10. inspect_ai/_eval/eval.py +14 -2
  11. inspect_ai/_eval/evalset.py +6 -1
  12. inspect_ai/_eval/run.py +6 -0
  13. inspect_ai/_eval/task/run.py +49 -23
  14. inspect_ai/_eval/task/task.py +26 -3
  15. inspect_ai/_util/content.py +20 -1
  16. inspect_ai/_util/interrupt.py +6 -0
  17. inspect_ai/_util/logger.py +19 -0
  18. inspect_ai/_util/rich.py +7 -8
  19. inspect_ai/_util/text.py +13 -0
  20. inspect_ai/_util/transcript.py +20 -6
  21. inspect_ai/_util/working.py +50 -0
  22. inspect_ai/_view/www/App.css +6 -0
  23. inspect_ai/_view/www/dist/assets/index.css +171 -99
  24. inspect_ai/_view/www/dist/assets/index.js +5972 -2770
  25. inspect_ai/_view/www/eslint.config.mjs +24 -1
  26. inspect_ai/_view/www/log-schema.json +619 -21
  27. inspect_ai/_view/www/package.json +8 -3
  28. inspect_ai/_view/www/src/App.tsx +2 -2
  29. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  30. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  31. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  32. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  33. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  34. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  35. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  36. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  37. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  38. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  39. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  40. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  41. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  42. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  43. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  44. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  45. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  46. inspect_ai/_view/www/src/index.tsx +2 -2
  47. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  48. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  49. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  50. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
  51. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  52. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  53. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  54. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  55. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  56. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  57. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  58. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  59. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  60. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
  61. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  62. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
  63. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  64. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  65. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  66. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  67. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  68. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  69. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  70. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  72. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  73. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  74. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  75. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  76. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  77. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  78. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  79. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  80. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  83. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  84. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  85. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  86. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  87. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  88. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  89. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  90. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  91. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  92. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  93. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  94. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  95. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  96. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  97. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  98. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  99. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  100. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
  101. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  102. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  103. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
  104. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  105. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
  106. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  107. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  108. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
  109. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  110. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  111. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  112. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  113. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  114. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  115. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  116. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  117. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  118. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  119. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  120. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  121. inspect_ai/_view/www/src/types/log.d.ts +312 -137
  122. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  123. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  124. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  125. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  126. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  127. inspect_ai/_view/www/src/utils/format.ts +8 -5
  128. inspect_ai/_view/www/src/utils/json.ts +24 -0
  129. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  130. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
  131. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  132. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  133. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  134. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  135. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  136. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  137. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  138. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  139. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  140. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  141. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  142. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  143. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  144. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  145. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  146. inspect_ai/_view/www/yarn.lock +241 -5
  147. inspect_ai/log/__init__.py +2 -0
  148. inspect_ai/log/_condense.py +4 -0
  149. inspect_ai/log/_log.py +72 -12
  150. inspect_ai/log/_recorders/eval.py +6 -1
  151. inspect_ai/log/_samples.py +5 -1
  152. inspect_ai/log/_transcript.py +89 -2
  153. inspect_ai/model/__init__.py +2 -0
  154. inspect_ai/model/_call_tools.py +8 -1
  155. inspect_ai/model/_chat_message.py +22 -7
  156. inspect_ai/model/_conversation.py +11 -9
  157. inspect_ai/model/_generate_config.py +25 -4
  158. inspect_ai/model/_model.py +164 -72
  159. inspect_ai/model/_model_call.py +10 -3
  160. inspect_ai/model/_model_output.py +3 -0
  161. inspect_ai/model/_openai.py +106 -40
  162. inspect_ai/model/_providers/anthropic.py +145 -26
  163. inspect_ai/model/_providers/bedrock.py +7 -0
  164. inspect_ai/model/_providers/cloudflare.py +20 -7
  165. inspect_ai/model/_providers/google.py +29 -8
  166. inspect_ai/model/_providers/groq.py +66 -27
  167. inspect_ai/model/_providers/hf.py +6 -0
  168. inspect_ai/model/_providers/mistral.py +78 -51
  169. inspect_ai/model/_providers/openai.py +66 -4
  170. inspect_ai/model/_providers/openai_o1.py +10 -0
  171. inspect_ai/model/_providers/providers.py +2 -2
  172. inspect_ai/model/_providers/util/tracker.py +92 -0
  173. inspect_ai/model/_providers/vllm.py +13 -5
  174. inspect_ai/model/_reasoning.py +15 -2
  175. inspect_ai/scorer/_model.py +23 -19
  176. inspect_ai/solver/_basic_agent.py +1 -3
  177. inspect_ai/solver/_bridge/patch.py +0 -2
  178. inspect_ai/solver/_human_agent/agent.py +14 -10
  179. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  180. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  181. inspect_ai/solver/_limit.py +4 -4
  182. inspect_ai/solver/_plan.py +0 -3
  183. inspect_ai/solver/_task_state.py +7 -0
  184. inspect_ai/tool/__init__.py +2 -0
  185. inspect_ai/tool/_tool.py +3 -1
  186. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  187. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  191. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  192. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  196. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  197. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  198. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  199. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  200. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  201. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  202. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  203. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  204. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  205. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  206. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  207. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  208. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  209. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  210. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  211. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  212. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  214. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  215. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  216. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  217. inspect_ai/tool/_tools/_web_search.py +3 -3
  218. inspect_ai/util/__init__.py +2 -1
  219. inspect_ai/util/_concurrency.py +14 -8
  220. inspect_ai/util/_display.py +12 -0
  221. inspect_ai/util/_sandbox/context.py +15 -0
  222. inspect_ai/util/_sandbox/docker/docker.py +7 -5
  223. inspect_ai/util/_sandbox/environment.py +32 -1
  224. inspect_ai/util/_sandbox/events.py +183 -0
  225. inspect_ai/util/_sandbox/local.py +3 -3
  226. inspect_ai/util/_sandbox/self_check.py +131 -43
  227. inspect_ai/util/_subtask.py +11 -0
  228. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
  229. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
  230. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
  231. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  232. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  233. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  234. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  235. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  236. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  237. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  238. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  239. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  240. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
  241. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
  242. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,8 @@
1
1
  import asyncio
2
+ from typing import cast
2
3
 
3
4
  from inspect_ai.util import display_type, input_panel, sandbox
5
+ from inspect_ai.util._sandbox.events import SandboxEnvironmentProxy
4
6
 
5
7
  from .._solver import Generate, Solver, solver
6
8
  from .._task_state import TaskState
@@ -56,19 +58,21 @@ def human_agent(
56
58
 
57
59
  # helper function to run the agent (called for fullscreen vs. fallback below)
58
60
  async def run_human_agent(view: HumanAgentView) -> TaskState:
59
- # create agent commands
60
- commands = human_agent_commands(
61
- state, answer, intermediate_scoring, record_session
62
- )
61
+ sandbox_proxy = cast(SandboxEnvironmentProxy, sandbox())
62
+ with sandbox_proxy.no_events():
63
+ # create agent commands
64
+ commands = human_agent_commands(
65
+ state, answer, intermediate_scoring, record_session
66
+ )
63
67
 
64
- # install agent tools
65
- await install_human_agent(state, commands, record_session)
68
+ # install agent tools
69
+ await install_human_agent(state, commands, record_session)
66
70
 
67
- # hookup the view ui
68
- view.connect(connection)
71
+ # hookup the view ui
72
+ view.connect(connection)
69
73
 
70
- # run sandbox service
71
- return await run_human_agent_service(state, commands, view)
74
+ # run sandbox service
75
+ return await run_human_agent_service(state, commands, view)
72
76
 
73
77
  # support both fullscreen ui and fallback
74
78
  if display_type() == "full":
@@ -6,7 +6,7 @@ from .instructions import InstructionsCommand
6
6
  from .note import NoteCommand
7
7
  from .score import ScoreCommand
8
8
  from .status import StatusCommand
9
- from .submit import SubmitCommand, ValidateCommand
9
+ from .submit import QuitCommand, SubmitCommand, ValidateCommand
10
10
 
11
11
 
12
12
  def human_agent_commands(
@@ -15,8 +15,12 @@ def human_agent_commands(
15
15
  intermediate_scoring: bool,
16
16
  record_session: bool,
17
17
  ) -> list[HumanAgentCommand]:
18
- # base submit and validate
19
- commands = [SubmitCommand(record_session), ValidateCommand(answer)]
18
+ # base submit, validate, and quit
19
+ commands = [
20
+ SubmitCommand(record_session),
21
+ ValidateCommand(answer),
22
+ QuitCommand(record_session),
23
+ ]
20
24
 
21
25
  # optional intermediate scoring
22
26
  if intermediate_scoring:
@@ -16,22 +16,89 @@ from .command import HumanAgentCommand, call_human_agent
16
16
  logger = getLogger(__name__)
17
17
 
18
18
 
19
- class SubmitCommand(HumanAgentCommand):
19
+ class SessionEndCommand(HumanAgentCommand):
20
20
  def __init__(self, record_session: bool):
21
21
  super().__init__()
22
22
  self._record_session = record_session
23
23
 
24
+ @property
25
+ def group(self) -> Literal[1, 2, 3]:
26
+ return 1
27
+
28
+ async def _read_session_logs(self) -> dict[str, str]:
29
+ # retrieve session logs (don't fail)
30
+ sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
31
+ result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
32
+ if not result.success:
33
+ logger.warning(f"Error listing human agent session logs: {result.stderr}")
34
+ return {}
35
+
36
+ # read logs
37
+ session_logs: dict[str, str] = {}
38
+ for session_log in result.stdout.strip().splitlines():
39
+ try:
40
+ session_logs[session_log] = await sandbox().read_file(
41
+ (sessions_dir / session_log).as_posix()
42
+ )
43
+ except Exception as ex:
44
+ logger.warning(f"Error reading human agent session log: {ex}")
45
+
46
+ return session_logs
47
+
48
+
49
+ class QuitCommand(SessionEndCommand):
24
50
  @property
25
51
  def name(self) -> str:
26
- return "submit"
52
+ return "quit"
27
53
 
28
54
  @property
29
55
  def description(self) -> str:
30
- return "Submit your final answer for the task."
56
+ return "Quit the task without submitting an answer."
57
+
58
+ def cli(self, args: Namespace) -> None:
59
+ # verify that the user wants to proceed
60
+ action = "quit the task without submitting an answer (ending the exercise)"
61
+ while True:
62
+ response = (
63
+ input(
64
+ f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
65
+ )
66
+ .lower()
67
+ .strip()
68
+ )
69
+ if response in ["yes", "y"]:
70
+ break
71
+ elif response in ["no", "n"]:
72
+ return
73
+ else:
74
+ print("Please enter yes or no.")
31
75
 
76
+ # thank the user!
77
+ print(
78
+ "\nThank you for working on this task!\n\n"
79
+ + "Your task will now be scored and you will be disconnected from this container.\n"
80
+ )
81
+
82
+ call_human_agent("quit")
83
+
84
+ def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
85
+ async def submit() -> None:
86
+ if self._record_session:
87
+ state.logs = await self._read_session_logs()
88
+ state.running = False
89
+ state.answer = ""
90
+
91
+ return submit
92
+
93
+
94
+ class SubmitCommand(SessionEndCommand):
32
95
  @property
33
- def group(self) -> Literal[1, 2, 3]:
34
- return 1
96
+ def name(self) -> str:
97
+ return "submit"
98
+
99
+ @property
100
+ def description(self) -> str:
101
+ return "Submit your final answer for the task."
35
102
 
36
103
  @property
37
104
  def cli_args(self) -> list[HumanAgentCommand.CLIArg]:
@@ -55,10 +122,12 @@ class SubmitCommand(HumanAgentCommand):
55
122
  # verify that the user wants to proceed
56
123
  answer = call_args.get("answer", None)
57
124
  answer_text = f" '{answer}'" if answer else ""
125
+ action = f"end the task and submit{answer_text}"
126
+
58
127
  while True:
59
128
  response = (
60
129
  input(
61
- f"\nDo you definitely want to end the task and submit{answer_text}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
130
+ f"\nDo you definitely want to {action}?\n\nThis will disconnect you from the task environment and you won't be able to reconnect.\n\nYes (y) or No (n): "
62
131
  )
63
132
  .lower()
64
133
  .strip()
@@ -76,13 +145,10 @@ class SubmitCommand(HumanAgentCommand):
76
145
  + "Your task will now be scored and you will be disconnected from this container.\n"
77
146
  )
78
147
 
79
- # submit the task
80
148
  call_human_agent("submit", **call_args)
81
149
 
82
150
  def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
83
- async def submit(
84
- answer: str | None, session_logs: dict[str, str] | None = None
85
- ) -> None:
151
+ async def submit(answer: str) -> None:
86
152
  if self._record_session:
87
153
  state.logs = await self._read_session_logs()
88
154
  state.running = False
@@ -90,26 +156,6 @@ class SubmitCommand(HumanAgentCommand):
90
156
 
91
157
  return submit
92
158
 
93
- async def _read_session_logs(self) -> dict[str, str]:
94
- # retreive session logs (don't fail)
95
- sessions_dir = PurePosixPath(RECORD_SESSION_DIR)
96
- result = await sandbox().exec(["ls", "-1", sessions_dir.as_posix()])
97
- if not result.success:
98
- logger.warning(f"Error listing human agent session logs: {result.stderr}")
99
- return {}
100
-
101
- # read logs
102
- session_logs: dict[str, str] = {}
103
- for session_log in result.stdout.strip().splitlines():
104
- try:
105
- session_logs[session_log] = await sandbox().read_file(
106
- (sessions_dir / session_log).as_posix()
107
- )
108
- except Exception as ex:
109
- logger.warning(f"Error reading human agent session log: {ex}")
110
-
111
- return session_logs
112
-
113
159
 
114
160
  class ValidateCommand(HumanAgentCommand):
115
161
  def __init__(self, answer: bool | str) -> None:
@@ -7,15 +7,15 @@ class SampleLimitExceededError(Exception):
7
7
  """Exception raised when a sample limit is exceeded.
8
8
 
9
9
  Args:
10
- type (Literal["message", "time", "token", "operator"]): Type of limit exceeded.
11
- value (int): Value compared to
12
- limit (int): Limit applied.
10
+ type: Type of limit exceeded.
11
+ value: Value compared to the limit.
12
+ limit: Limit applied.
13
13
  message (str | None): Optional. Human readable message.
14
14
  """
15
15
 
16
16
  def __init__(
17
17
  self,
18
- type: Literal["message", "time", "token", "operator", "custom"],
18
+ type: Literal["message", "time", "working", "token", "operator", "custom"],
19
19
  *,
20
20
  value: int,
21
21
  limit: int,
@@ -118,9 +118,6 @@ class Plan(Solver):
118
118
  st.complete(state)
119
119
  check_sample_interrupt()
120
120
 
121
- # mark completed
122
- state.completed = True
123
-
124
121
  finally:
125
122
  # always do cleanup if we have one
126
123
  if self.cleanup:
@@ -7,6 +7,7 @@ from random import Random
7
7
  from typing import Any, Iterable, SupportsIndex, Type, Union, cast, overload
8
8
 
9
9
  from pydantic_core import to_jsonable_python
10
+ from shortuuid import uuid
10
11
 
11
12
  from inspect_ai._util.interrupt import check_sample_interrupt
12
13
  from inspect_ai.dataset._dataset import MT, Sample, metadata_as
@@ -165,6 +166,7 @@ class TaskState:
165
166
  self._token_limit = token_limit
166
167
  self._completed = completed
167
168
  self._store = Store()
169
+ self._uuid = uuid()
168
170
 
169
171
  if choices:
170
172
  self.choices = Choices(choices)
@@ -373,6 +375,11 @@ class TaskState:
373
375
  scores: dict[str, Score] | None = None
374
376
  """Scores yielded by running task."""
375
377
 
378
+ @property
379
+ def uuid(self) -> str:
380
+ """Globally unique identifier for sample run."""
381
+ return self._uuid
382
+
376
383
  def metadata_as(self, metadata_cls: Type[MT]) -> MT:
377
384
  """Pydantic model interface to metadata.
378
385
 
@@ -2,6 +2,7 @@ from inspect_ai._util.content import (
2
2
  Content,
3
3
  ContentAudio,
4
4
  ContentImage,
5
+ ContentReasoning,
5
6
  ContentText,
6
7
  ContentVideo,
7
8
  )
@@ -41,6 +42,7 @@ __all__ = [
41
42
  "Content",
42
43
  "ContentAudio",
43
44
  "ContentImage",
45
+ "ContentReasoning",
44
46
  "ContentText",
45
47
  "ContentVideo",
46
48
  "ToolCall",
inspect_ai/tool/_tool.py CHANGED
@@ -14,6 +14,7 @@ from typing import (
14
14
  from inspect_ai._util.content import (
15
15
  ContentAudio,
16
16
  ContentImage,
17
+ ContentReasoning,
17
18
  ContentText,
18
19
  ContentVideo,
19
20
  )
@@ -35,10 +36,11 @@ ToolResult = (
35
36
  | float
36
37
  | bool
37
38
  | ContentText
39
+ | ContentReasoning
38
40
  | ContentImage
39
41
  | ContentAudio
40
42
  | ContentVideo
41
- | list[ContentText | ContentImage | ContentAudio | ContentVideo]
43
+ | list[ContentText | ContentReasoning | ContentImage | ContentAudio | ContentVideo]
42
44
  )
43
45
  """Valid types for results from tool calls."""
44
46
 
@@ -32,7 +32,7 @@ async def run(
32
32
  maybe_truncate(stdout.decode(), truncate_after=truncate_after),
33
33
  maybe_truncate(stderr.decode(), truncate_after=truncate_after),
34
34
  )
35
- except asyncio.TimeoutError as exc:
35
+ except (TimeoutError, asyncio.TimeoutError) as exc:
36
36
  try:
37
37
  process.kill()
38
38
  except ProcessLookupError:
@@ -0,0 +1,8 @@
1
+ [MASTER]
2
+ ; R - Refactorings
3
+ ; C - Convention
4
+ ; W - Warning
5
+ ; E - Error
6
+ enable=C,R,W,E
7
+ disable=R0903,C0114,C0115,C0116,C0301,C0411,C1804,C1805,W0120,W0511,E0401,E1101,E0611,E1128
8
+ score=no
@@ -0,0 +1,24 @@
1
+ {
2
+ "version": "0.2.0",
3
+ "configurations": [
4
+ {
5
+ "type": "debugpy",
6
+ "request": "launch",
7
+ "name": "Debug Web Server",
8
+ "program": "${workspaceFolder}/web_server.py"
9
+ },
10
+ {
11
+ "type": "debugpy",
12
+ "request": "launch",
13
+ "name": "Debug Web Client interactive mode",
14
+ "program": "${workspaceFolder}/web_client.py"
15
+ },
16
+ {
17
+ "type": "debugpy",
18
+ "request": "launch",
19
+ "name": "Debug Web Client w/arguments",
20
+ "program": "${workspaceFolder}/web_client.py",
21
+ "args": ["${command:pickArgs}"]
22
+ }
23
+ ]
24
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "cSpell.words": [
3
+ "activedescendant",
4
+ "describedby",
5
+ "domcontentloaded",
6
+ "figcaption",
7
+ "flowto",
8
+ "framenavigated",
9
+ "headful",
10
+ "idref",
11
+ "jsonrpcclient",
12
+ "jsonrpcserver",
13
+ "keepalive",
14
+ "keyshortcuts",
15
+ "labelfor",
16
+ "labelledby",
17
+ "labelwrapped",
18
+ "multiselectable",
19
+ "Rects",
20
+ "roledescription",
21
+ "rubyannotation",
22
+ "tablecaption",
23
+ "valuetext"
24
+ ]
25
+ }
@@ -8,16 +8,15 @@ RUN apt-get update
8
8
 
9
9
  RUN pip install --upgrade pip
10
10
 
11
+ RUN pip install playwright jsonrpcclient jsonrpcserver httpx aiohttp pillow pydantic tenacity
12
+
11
13
  # Install playwright
12
- RUN pip install playwright
13
14
  RUN playwright install
14
15
  RUN playwright install-deps
15
16
 
16
- # Install other dependancies
17
- RUN pip install dm-env-rpc pillow bs4 lxml
18
-
19
17
  # Copy Python files alongside the Dockerfile
20
- COPY *.py ./
18
+ COPY . .
21
19
 
22
20
  # Run the server
23
- CMD ["python3", "/app/web_browser/web_server.py"]
21
+ CMD ["python3", "/app/web_browser/web_server.py"]
22
+ # CMD ["tail", "-f", "/dev/null"]
@@ -1,7 +1,6 @@
1
1
  ## Headless Browser Tool
2
2
 
3
- This directory contains an implementation for the Headless Browser Tool which can be used to test web browsing agents.
4
-
3
+ This directory contains an implementation for the Headless Browser Tool which can be used to test web browsing agents.
5
4
 
6
5
  ### Usage
7
6
 
@@ -37,27 +36,27 @@ The result will be printed out in _stdout_ in the following format:
37
36
 
38
37
  ```
39
38
  # Inside the Docker container
40
- error: <an ERROR message if one occured>
39
+ error: <an ERROR message if one occurred>
41
40
  info: <general info about the container>
42
41
  web_url: <the URL of the page the browser is currently at>
43
42
  web_at: <accessibility tree of the visible elements of the page>
44
- ```
45
-
43
+ ```
46
44
 
47
45
  ### Design
48
46
 
49
47
  The following diagram describes the design and the intended usage of the tool:
50
48
 
51
- ![diagram](images/usage_diagram.png)
49
+ ![diagram](images/usage_diagram.svg)
52
50
 
53
51
  The tool consists of the following components:
54
52
 
55
- * [WebServer](web_server.py) - a server which launches a stateful session with the headless chromium browser and interracts with it through the [Playwright API](https://playwright.dev/python/docs/intro) upon receiving client commands. The server components are:
56
- * _dm_env_servicer.py_ - an implementation for the gRPC Service based on [dm_env_rpc protocol](https://github.com/google-deepmind/dm_env_rpc).
57
- * _web_environment.py_ - an environment which gets instantiated by the servicer and which launches the browser, stores its state and maps client commands to Playwright API.
58
- * _playwright_crawler.py_ - a wrapper over the sync Playwright API.
53
+ - [WebServer](web_server.py) - a server which launches a stateful session with the headless chromium browser and interacts with it through the [Playwright API](https://playwright.dev/python/docs/intro) upon receiving client commands. The server components are:
54
+
55
+ - _dm_env_servicer.py_ - an implementation for the gRPC Service based on [dm_env_rpc protocol](https://github.com/google-deepmind/dm_env_rpc).
56
+ - _web_environment.py_ - an environment which gets instantiated by the servicer and which launches the browser, stores its state and maps client commands to Playwright API.
57
+ - _playwright_crawler.py_ - a wrapper over the sync Playwright API.
59
58
 
60
- * [WebClient](web_client.py) - a simple stateless client to interact with the server. When launched, the client:
59
+ - [WebClient](web_client.py) - a simple stateless client to interact with the server. When launched, the client:
61
60
  1. creates a connection with the server;
62
61
  2. sends user command to the server;
63
62
  3. receives the response in the form of observations and prints them to stdout;
@@ -0,0 +1,71 @@
1
+ from functools import reduce
2
+ from typing import Iterable, TypedDict
3
+
4
+ from accessibility_tree_node import AccessibilityTreeNode
5
+ from cdp.a11y import AXNode, AXNodeId
6
+ from cdp.dom_snapshot import DOMSnapshot, create_snapshot_context
7
+ from rectangle import Rectangle
8
+
9
+ _AccType = tuple[
10
+ AXNode | None,
11
+ dict[AXNodeId, AXNode],
12
+ ]
13
+
14
+
15
+ class AccessibilityTree(TypedDict):
16
+ root: AccessibilityTreeNode
17
+ nodes: dict[AXNodeId, AccessibilityTreeNode]
18
+
19
+
20
+ def create_accessibility_tree(
21
+ *,
22
+ ax_nodes: Iterable[AXNode],
23
+ dom_snapshot: DOMSnapshot,
24
+ device_scale_factor: float,
25
+ window_bounds: Rectangle,
26
+ ) -> AccessibilityTree | None:
27
+ """
28
+ Creates an accessibility tree from the given Chrome DevTools Protocol AX nodes and DOM snapshot.
29
+
30
+ Args:
31
+ ax_nodes (Iterable[AXNode]): An iterable of AXNode objects representing the accessibility nodes.
32
+ dom_snapshot (DOMSnapshot): A snapshot of the DOM at the time of accessibility tree creation.
33
+ device_scale_factor (float): The scale factor of the device.
34
+ window_bounds (Rectangle): The bounds of the window.
35
+
36
+ Returns:
37
+ AccessibilityTree | None: The accessibility tree, or None when ax_nodes contains no root node (a node whose parentId is None).
38
+ """
39
+
40
+ # first make a dict of AXNodeId's to AXNode's and find the root on the way
41
+ def reducer(acc: _AccType, ax_node: AXNode) -> _AccType:
42
+ root_node, nodes = acc
43
+ nodes[ax_node.nodeId] = ax_node
44
+ return (
45
+ # TODO: What do we want for multiple roots?
46
+ root_node or (ax_node if ax_node.parentId is None else None),
47
+ nodes,
48
+ )
49
+
50
+ initial_acc: _AccType = (None, {}) # The inference engine is weak
51
+ root_node, nodes = reduce(reducer, ax_nodes, initial_acc)
52
+
53
+ if not root_node:
54
+ return None
55
+
56
+ # Now create the AccessibilityTreeNode hierarchy
57
+ snapshot_context = create_snapshot_context(dom_snapshot)
58
+ all_accessibility_tree_nodes: dict[AXNodeId, AccessibilityTreeNode] = {}
59
+
60
+ return AccessibilityTree(
61
+ root=AccessibilityTreeNode(
62
+ ax_node=root_node,
63
+ ax_nodes=nodes,
64
+ parent=None,
65
+ all_accessibility_tree_nodes=all_accessibility_tree_nodes,
66
+ snapshot_context=snapshot_context,
67
+ device_scale_factor=device_scale_factor,
68
+ window_bounds=window_bounds,
69
+ ),
70
+ nodes=all_accessibility_tree_nodes,
71
+ )