inspect-ai 0.3.81__py3-none-any.whl → 0.3.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (297)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/eval.py +35 -2
  3. inspect_ai/_cli/util.py +44 -1
  4. inspect_ai/_display/core/config.py +1 -1
  5. inspect_ai/_display/core/display.py +13 -4
  6. inspect_ai/_display/core/results.py +1 -1
  7. inspect_ai/_display/textual/app.py +14 -3
  8. inspect_ai/_display/textual/display.py +4 -0
  9. inspect_ai/_display/textual/widgets/samples.py +9 -3
  10. inspect_ai/_display/textual/widgets/task_detail.py +8 -8
  11. inspect_ai/_display/textual/widgets/tasks.py +17 -1
  12. inspect_ai/_display/textual/widgets/vscode.py +44 -0
  13. inspect_ai/_eval/eval.py +74 -25
  14. inspect_ai/_eval/evalset.py +22 -18
  15. inspect_ai/_eval/loader.py +34 -11
  16. inspect_ai/_eval/run.py +13 -15
  17. inspect_ai/_eval/score.py +13 -3
  18. inspect_ai/_eval/task/generate.py +8 -9
  19. inspect_ai/_eval/task/log.py +55 -6
  20. inspect_ai/_eval/task/run.py +51 -10
  21. inspect_ai/_eval/task/task.py +23 -9
  22. inspect_ai/_util/constants.py +2 -0
  23. inspect_ai/_util/file.py +30 -1
  24. inspect_ai/_util/json.py +37 -1
  25. inspect_ai/_util/registry.py +1 -0
  26. inspect_ai/_util/vscode.py +37 -0
  27. inspect_ai/_view/server.py +113 -1
  28. inspect_ai/_view/www/App.css +7 -1
  29. inspect_ai/_view/www/dist/assets/index.css +813 -415
  30. inspect_ai/_view/www/dist/assets/index.js +54475 -32003
  31. inspect_ai/_view/www/eslint.config.mjs +1 -1
  32. inspect_ai/_view/www/log-schema.json +137 -31
  33. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  34. inspect_ai/_view/www/package.json +11 -2
  35. inspect_ai/_view/www/src/App.tsx +161 -853
  36. inspect_ai/_view/www/src/api/api-browser.ts +176 -5
  37. inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
  38. inspect_ai/_view/www/src/api/client-api.ts +66 -10
  39. inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
  40. inspect_ai/_view/www/src/api/types.ts +107 -2
  41. inspect_ai/_view/www/src/appearance/icons.ts +2 -0
  42. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
  43. inspect_ai/_view/www/src/components/Card.tsx +6 -4
  44. inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
  45. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
  46. inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
  47. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
  48. inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
  49. inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
  50. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
  51. inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
  52. inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
  53. inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
  54. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
  55. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +116 -26
  56. inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
  57. inspect_ai/_view/www/src/components/Modal.module.css +38 -0
  58. inspect_ai/_view/www/src/components/Modal.tsx +77 -0
  59. inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
  60. inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
  61. inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
  62. inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
  63. inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
  64. inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
  65. inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
  66. inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
  67. inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
  68. inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
  69. inspect_ai/_view/www/src/index.tsx +26 -94
  70. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
  71. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
  72. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
  73. inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
  74. inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
  75. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
  76. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
  77. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
  78. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +74 -28
  79. inspect_ai/_view/www/src/samples/SampleDialog.tsx +58 -22
  80. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
  81. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +135 -104
  82. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +10 -0
  83. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +83 -36
  84. inspect_ai/_view/www/src/samples/SamplesTools.tsx +35 -30
  85. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
  86. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
  87. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
  88. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +6 -1
  89. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +5 -0
  90. inspect_ai/_view/www/src/samples/chat/messages.ts +36 -0
  91. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
  92. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +11 -1
  93. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
  94. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +34 -20
  95. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
  96. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
  97. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
  98. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -10
  99. inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
  100. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +22 -3
  101. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +27 -2
  102. inspect_ai/_view/www/src/samples/list/SampleList.tsx +122 -85
  103. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
  104. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +28 -15
  105. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
  106. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
  107. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
  108. inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
  109. inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
  110. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +12 -27
  111. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
  112. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
  113. inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
  114. inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
  115. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
  116. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
  117. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
  118. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
  119. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
  120. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
  121. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
  122. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
  123. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
  124. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
  125. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
  126. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
  127. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +33 -17
  128. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
  129. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
  130. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
  131. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
  132. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +81 -60
  133. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
  134. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
  135. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +29 -1
  136. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
  137. inspect_ai/_view/www/src/scoring/utils.ts +87 -0
  138. inspect_ai/_view/www/src/state/appSlice.ts +244 -0
  139. inspect_ai/_view/www/src/state/hooks.ts +399 -0
  140. inspect_ai/_view/www/src/state/logPolling.ts +200 -0
  141. inspect_ai/_view/www/src/state/logSlice.ts +224 -0
  142. inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
  143. inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
  144. inspect_ai/_view/www/src/state/samplePolling.ts +314 -0
  145. inspect_ai/_view/www/src/state/sampleSlice.ts +140 -0
  146. inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
  147. inspect_ai/_view/www/src/state/scrolling.ts +206 -0
  148. inspect_ai/_view/www/src/state/store.ts +168 -0
  149. inspect_ai/_view/www/src/state/store_filter.ts +84 -0
  150. inspect_ai/_view/www/src/state/utils.ts +23 -0
  151. inspect_ai/_view/www/src/storage/index.ts +26 -0
  152. inspect_ai/_view/www/src/types/log.d.ts +36 -26
  153. inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
  154. inspect_ai/_view/www/src/types.ts +94 -32
  155. inspect_ai/_view/www/src/utils/attachments.ts +58 -23
  156. inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
  157. inspect_ai/_view/www/src/utils/logger.ts +52 -0
  158. inspect_ai/_view/www/src/utils/polling.ts +100 -0
  159. inspect_ai/_view/www/src/utils/react.ts +30 -0
  160. inspect_ai/_view/www/src/utils/vscode.ts +1 -1
  161. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +184 -217
  162. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
  163. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
  164. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
  165. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
  166. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -1
  167. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +159 -103
  168. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
  169. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
  170. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
  171. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
  172. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +12 -14
  173. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
  174. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
  175. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
  176. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
  177. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
  178. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
  179. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
  180. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
  181. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +128 -115
  182. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
  183. inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
  184. inspect_ai/_view/www/src/workspace/types.ts +4 -3
  185. inspect_ai/_view/www/src/workspace/utils.ts +4 -4
  186. inspect_ai/_view/www/vite.config.js +6 -0
  187. inspect_ai/_view/www/yarn.lock +464 -355
  188. inspect_ai/agent/__init__.py +36 -0
  189. inspect_ai/agent/_agent.py +268 -0
  190. inspect_ai/agent/_as_solver.py +72 -0
  191. inspect_ai/agent/_as_tool.py +122 -0
  192. inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
  193. inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
  194. inspect_ai/agent/_filter.py +46 -0
  195. inspect_ai/agent/_handoff.py +93 -0
  196. inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
  197. inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
  198. inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
  199. inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
  200. inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
  201. inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
  202. inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
  203. inspect_ai/agent/_react.py +241 -0
  204. inspect_ai/agent/_run.py +36 -0
  205. inspect_ai/agent/_types.py +81 -0
  206. inspect_ai/log/_condense.py +26 -0
  207. inspect_ai/log/_log.py +17 -5
  208. inspect_ai/log/_recorders/buffer/__init__.py +14 -0
  209. inspect_ai/log/_recorders/buffer/buffer.py +30 -0
  210. inspect_ai/log/_recorders/buffer/database.py +685 -0
  211. inspect_ai/log/_recorders/buffer/filestore.py +259 -0
  212. inspect_ai/log/_recorders/buffer/types.py +84 -0
  213. inspect_ai/log/_recorders/eval.py +2 -11
  214. inspect_ai/log/_recorders/types.py +30 -0
  215. inspect_ai/log/_transcript.py +32 -2
  216. inspect_ai/model/__init__.py +7 -1
  217. inspect_ai/model/_call_tools.py +257 -52
  218. inspect_ai/model/_chat_message.py +7 -4
  219. inspect_ai/model/_conversation.py +13 -62
  220. inspect_ai/model/_display.py +85 -0
  221. inspect_ai/model/_generate_config.py +2 -2
  222. inspect_ai/model/_model.py +114 -14
  223. inspect_ai/model/_model_output.py +14 -9
  224. inspect_ai/model/_openai.py +16 -4
  225. inspect_ai/model/_openai_computer_use.py +162 -0
  226. inspect_ai/model/_openai_responses.py +319 -165
  227. inspect_ai/model/_providers/anthropic.py +20 -21
  228. inspect_ai/model/_providers/azureai.py +24 -13
  229. inspect_ai/model/_providers/bedrock.py +1 -7
  230. inspect_ai/model/_providers/cloudflare.py +3 -3
  231. inspect_ai/model/_providers/goodfire.py +2 -6
  232. inspect_ai/model/_providers/google.py +11 -10
  233. inspect_ai/model/_providers/groq.py +6 -3
  234. inspect_ai/model/_providers/hf.py +7 -3
  235. inspect_ai/model/_providers/mistral.py +7 -10
  236. inspect_ai/model/_providers/openai.py +47 -17
  237. inspect_ai/model/_providers/openai_o1.py +11 -4
  238. inspect_ai/model/_providers/openai_responses.py +12 -14
  239. inspect_ai/model/_providers/providers.py +2 -2
  240. inspect_ai/model/_providers/together.py +12 -2
  241. inspect_ai/model/_providers/util/chatapi.py +7 -2
  242. inspect_ai/model/_providers/util/hf_handler.py +4 -2
  243. inspect_ai/model/_providers/util/llama31.py +4 -2
  244. inspect_ai/model/_providers/vertex.py +11 -9
  245. inspect_ai/model/_providers/vllm.py +4 -4
  246. inspect_ai/scorer/__init__.py +2 -0
  247. inspect_ai/scorer/_metrics/__init__.py +2 -0
  248. inspect_ai/scorer/_metrics/grouped.py +84 -0
  249. inspect_ai/scorer/_score.py +26 -6
  250. inspect_ai/solver/__init__.py +2 -2
  251. inspect_ai/solver/_basic_agent.py +22 -9
  252. inspect_ai/solver/_bridge.py +31 -0
  253. inspect_ai/solver/_chain.py +20 -12
  254. inspect_ai/solver/_fork.py +5 -1
  255. inspect_ai/solver/_human_agent.py +52 -0
  256. inspect_ai/solver/_prompt.py +3 -1
  257. inspect_ai/solver/_run.py +59 -0
  258. inspect_ai/solver/_solver.py +14 -4
  259. inspect_ai/solver/_task_state.py +5 -3
  260. inspect_ai/tool/_tool_call.py +15 -8
  261. inspect_ai/tool/_tool_def.py +17 -12
  262. inspect_ai/tool/_tool_support_helpers.py +4 -4
  263. inspect_ai/tool/_tool_with.py +14 -11
  264. inspect_ai/tool/_tools/_bash_session.py +11 -2
  265. inspect_ai/tool/_tools/_computer/_common.py +18 -2
  266. inspect_ai/tool/_tools/_computer/_computer.py +18 -2
  267. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
  268. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
  269. inspect_ai/tool/_tools/_think.py +1 -1
  270. inspect_ai/tool/_tools/_web_browser/_web_browser.py +103 -62
  271. inspect_ai/util/__init__.py +2 -0
  272. inspect_ai/util/_anyio.py +27 -0
  273. inspect_ai/util/_sandbox/__init__.py +2 -1
  274. inspect_ai/util/_sandbox/context.py +32 -7
  275. inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
  276. inspect_ai/util/_sandbox/docker/compose.py +2 -2
  277. inspect_ai/util/_sandbox/docker/docker.py +12 -1
  278. inspect_ai/util/_store_model.py +30 -7
  279. inspect_ai/util/_subprocess.py +13 -3
  280. inspect_ai/util/_subtask.py +1 -0
  281. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
  282. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +295 -229
  283. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -169
  284. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
  285. /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
  286. /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
  287. /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
  288. /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
  289. /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
  290. /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
  291. /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
  292. /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
  293. /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
  294. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
  295. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
  296. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
  297. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,52 @@
1
+ from logging import getLogger
2
+
3
+ from inspect_ai._util.logger import warn_once
4
+ from inspect_ai.agent._as_solver import as_solver
5
+
6
+ from ._solver import Solver, solver
7
+
8
+ logger = getLogger(__name__)
9
+
10
+
11
+ @solver
12
+ def human_agent(
13
+ answer: bool | str = True,
14
+ intermediate_scoring: bool = False,
15
+ record_session: bool = True,
16
+ ) -> Solver:
17
+ """Human solver for agentic tasks that run in a Linux environment.
18
+
19
+ The Human agent solver installs agent task tools in the default
20
+ sandbox and presents the user with both task instructions and
21
+ documentation for the various tools (e.g. `task submit`,
22
+ `task start`, `task stop`, `task instructions`, etc.). A human agent panel
23
+ is displayed with instructions for logging in to the sandbox.
24
+
25
+ If the user is running in VS Code with the Inspect extension,
26
+ they will also be presented with links to login to the sandbox
27
+ using a VS Code Window or Terminal.
28
+
29
+ Args:
30
+ answer: Is an explicit answer required for this task or is it scored
31
+ based on files in the container? Pass a `str` with a regex to validate
32
+ that the answer matches the expected format.
33
+ intermediate_scoring: Allow the human agent to check their score while working.
34
+ record_session: Record all user commands and outputs in the sandbox bash session.
35
+
36
+ Returns:
37
+ Solver: Human agent solver.
38
+ """
39
+ from inspect_ai.agent._human.agent import human_cli
40
+
41
+ warn_once(
42
+ logger,
43
+ "The human_agent solver is deprecated. Please use the human_cli agent from the agents module instead.",
44
+ )
45
+
46
+ return as_solver(
47
+ human_cli(
48
+ answer=answer,
49
+ intermediate_scoring=intermediate_scoring,
50
+ record_session=record_session,
51
+ )
52
+ )
@@ -123,7 +123,9 @@ def assistant_message(template: str, **params: Any) -> Solver:
123
123
  async def solve(state: TaskState, generate: Generate) -> TaskState:
124
124
  kwargs = state.metadata | state.store._data | params
125
125
  state.messages.append(
126
- ChatMessageAssistant(content=format_template(content, kwargs))
126
+ ChatMessageAssistant(
127
+ content=format_template(content, kwargs), model=state.model.name
128
+ )
127
129
  )
128
130
  return state
129
131
 
@@ -0,0 +1,59 @@
1
+ from copy import copy
2
+
3
+ from inspect_ai.model import ChatMessage, ChatMessageUser, ModelName, ModelOutput
4
+
5
+ from ._fork import task_generate
6
+ from ._solver import Solver
7
+ from ._task_state import TaskState
8
+
9
+
10
+ async def run(
11
+ solver: Solver, input: str | list[ChatMessage]
12
+ ) -> tuple[list[ChatMessage], ModelOutput | None]:
13
+ """Run a solver over chat message input.
14
+
15
+ Args:
16
+ solver: Solver to run.
17
+ input: Chat message input
18
+
19
+ Returns:
20
+ Tuple of `list[ChatMessage], ModelOutput | None` (returns
21
+ [], None if no generates were done by the solver)
22
+ """
23
+ from inspect_ai.log._samples import sample_active
24
+
25
+ # get the generate function for the current task
26
+ generate = task_generate()
27
+ if generate is None:
28
+ raise RuntimeError("Called run() outside of a running task.")
29
+
30
+ # get the active sample
31
+ active = sample_active()
32
+ if active is None:
33
+ raise RuntimeError("Called run() outside of a running task")
34
+ assert active.sample.id
35
+
36
+ # build messages list
37
+ messages: list[ChatMessage] = (
38
+ [ChatMessageUser(content=input)] if isinstance(input, str) else input
39
+ )
40
+
41
+ # build state
42
+ state = TaskState(
43
+ model=ModelName(active.model),
44
+ sample_id=active.sample.id,
45
+ epoch=active.epoch,
46
+ input=input,
47
+ messages=copy(messages),
48
+ )
49
+
50
+ # run solver
51
+ state = await solver(state, generate)
52
+
53
+ # return any messages that don't match our initial prefix
54
+ new_messages: list[ChatMessage] = []
55
+ for index, message in enumerate(state.messages):
56
+ if index >= len(messages) or message.id != messages[index].id:
57
+ new_messages.append(message)
58
+
59
+ return new_messages, state.output if len(state.output.choices) > 0 else None
@@ -7,6 +7,7 @@ from typing import (
7
7
  Literal,
8
8
  ParamSpec,
9
9
  Protocol,
10
+ TypeAlias,
10
11
  cast,
11
12
  overload,
12
13
  runtime_checkable,
@@ -23,6 +24,8 @@ from inspect_ai._util.registry import (
23
24
  registry_name,
24
25
  registry_tag,
25
26
  )
27
+ from inspect_ai.agent._agent import Agent, is_agent
28
+ from inspect_ai.agent._as_solver import as_solver
26
29
  from inspect_ai.model import CachePolicy, GenerateConfigArgs
27
30
 
28
31
  from ._task_state import TaskState, set_sample_state
@@ -136,23 +139,27 @@ def solver_create(name: str, **kwargs: Any) -> Solver:
136
139
  return cast(Solver, registry_create("solver", name, **kwargs))
137
140
 
138
141
 
142
+ SolverType: TypeAlias = Solver | Agent
143
+ """Return type for @solver decorated functions. """
144
+
145
+
139
146
  @overload
140
147
  def solver(name: str) -> Callable[[Callable[P, Solver]], Callable[P, Solver]]: ...
141
148
 
142
149
 
143
150
  @overload
144
- def solver(name: Callable[P, Solver]) -> Callable[P, Solver]: ...
151
+ def solver(name: Callable[P, SolverType]) -> Callable[P, Solver]: ...
145
152
 
146
153
 
147
154
  def solver(
148
- name: str | Callable[P, Solver],
155
+ name: str | Callable[P, SolverType],
149
156
  ) -> Callable[[Callable[P, Solver]], Callable[P, Solver]] | Callable[P, Solver]:
150
157
  r"""Decorator for registering solvers.
151
158
 
152
159
  Args:
153
160
  name:
154
161
  Optional name for solver. If the decorator has no name
155
- argument then the name of the underlying Callable[P, Solver]
162
+ argument then the name of the underlying Callable[P, SolverType]
156
163
  object will be used to automatically assign a name.
157
164
 
158
165
  Returns:
@@ -176,7 +183,7 @@ def solver(
176
183
  # (b) Ensure that instances of Solver created by SolverType also
177
184
  # carry registry info.
178
185
  def create_solver_wrapper(
179
- solver_type: Callable[P, Solver], name: str | None = None
186
+ solver_type: Callable[P, SolverType], name: str | None = None
180
187
  ) -> Callable[P, Solver]:
181
188
  solver_name = registry_name(
182
189
  solver_type, name if name else getattr(solver_type, "__name__")
@@ -185,6 +192,9 @@ def solver(
185
192
  @wraps(solver_type)
186
193
  def solver_wrapper(*args: P.args, **kwargs: P.kwargs) -> Solver:
187
194
  solver = solver_type(*args, **kwargs)
195
+ if is_agent(solver):
196
+ solver = as_solver(solver)
197
+ solver = cast(Solver, solver)
188
198
 
189
199
  if not is_callable_coroutine(solver):
190
200
  raise TypeError(f"'{solver}' is not declared as an async callable.")
@@ -394,16 +394,18 @@ class TaskState:
394
394
 
395
395
  return metadata_as(self.metadata, metadata_cls)
396
396
 
397
- def store_as(self, model_cls: Type[SMT]) -> SMT:
397
+ def store_as(self, model_cls: Type[SMT], instance: str | None = None) -> SMT:
398
398
  """Pydantic model interface to the store.
399
399
 
400
400
  Args:
401
401
  model_cls: Pydantic model type (must derive from StoreModel)
402
+ instance: Optional instance name for store (enables multiple instances
403
+ of a given StoreModel type within a single sample)
402
404
 
403
405
  Returns:
404
- StoreModel: Instance of model_cls bound to current Store.
406
+ StoreModel: model_cls bound to sample store data.
405
407
  """
406
- return model_cls(store=self.store)
408
+ return model_cls(store=self.store, instance=instance)
407
409
 
408
410
 
409
411
  def sample_state() -> TaskState | None:
@@ -1,7 +1,7 @@
1
1
  from dataclasses import dataclass, field
2
- from typing import Any, Callable, Literal
2
+ from typing import Any, Callable, Literal, TypedDict
3
3
 
4
- from pydantic import BaseModel, Field
4
+ from pydantic import BaseModel, Field, JsonValue
5
5
 
6
6
  from inspect_ai._util.content import Content
7
7
 
@@ -44,11 +44,8 @@ class ToolCall:
44
44
  arguments: dict[str, Any]
45
45
  """Arguments to function."""
46
46
 
47
- type: str
48
- """Type of tool call ('function' or a model specific internal tool type)"""
49
-
50
- internal_name: str | None = field(default=None)
51
- """Model's internal name for the tool - if any."""
47
+ internal: JsonValue | None = field(default=None)
48
+ """Model provider specific payload - typically used to aid transformation back to model types."""
52
49
 
53
50
  parse_error: str | None = field(default=None)
54
51
  """Error which occurred parsing tool call."""
@@ -82,7 +79,17 @@ ToolCallViewer = Callable[[ToolCall], ToolCallView]
82
79
  """Custom view renderer for tool calls."""
83
80
 
84
81
 
85
- ToolCallModelInput = Callable[[int, int, str | list[Content]], str | list[Content]]
82
+ class ToolCallModelInputHints(TypedDict):
83
+ # This type is a little sketchy but it allows tools to customize their
84
+ # input hook behavior based on model limitations without creating a tight
85
+ # coupling to the model provider.
86
+ disable_computer_screenshot_truncation: bool
87
+ """The model does not support the truncation/redaction of computer screenshots."""
88
+
89
+
90
+ ToolCallModelInput = Callable[
91
+ [int, int, str | list[Content], ToolCallModelInputHints], str | list[Content]
92
+ ]
86
93
  """Determine how tool call results are played back as model input.
87
94
 
88
95
  The first argument is an index into the total number of tool results
@@ -21,7 +21,7 @@ from ._tool_description import (
21
21
  tool_description,
22
22
  )
23
23
  from ._tool_info import parse_tool_info
24
- from ._tool_params import ToolParams
24
+ from ._tool_params import ToolParam, ToolParams
25
25
 
26
26
 
27
27
  class ToolDef:
@@ -194,17 +194,7 @@ def tool_def_fields(tool: Tool) -> ToolDefFields:
194
194
  raise ValueError(f"Description not provided for tool function '{name}'")
195
195
 
196
196
  # validate that we have types/descriptions for parameters
197
- for param_name, param in tool_info.parameters.properties.items():
198
-
199
- def raise_not_provided_error(context: str) -> None:
200
- raise ValueError(
201
- f"{context} not provided for parameter '{param_name}' of tool function '{name}'."
202
- )
203
-
204
- if param.type is None and not param.anyOf and not param.enum:
205
- raise_not_provided_error("Unsupported type or type annotation")
206
- elif not param.description:
207
- raise_not_provided_error("Description")
197
+ validate_tool_parameters(name, tool_info.parameters.properties)
208
198
 
209
199
  # see if the user has overridden any of the tool's descriptions
210
200
  desc = tool_description(tool)
@@ -238,3 +228,18 @@ def tool_registry_info(
238
228
  viewer = info.metadata.get(TOOL_VIEWER, None)
239
229
  model_input = info.metadata.get(TOOL_MODEL_INPUT, None)
240
230
  return name, prompt, parallel, viewer, model_input
231
+
232
+
233
+ def validate_tool_parameters(tool_name: str, parameters: dict[str, ToolParam]) -> None:
234
+ # validate that we have types/descriptions for parameters
235
+ for param_name, param in parameters.items():
236
+
237
+ def raise_not_provided_error(context: str) -> None:
238
+ raise ValueError(
239
+ f"{context} provided for parameter '{param_name}' of function '{tool_name}'."
240
+ )
241
+
242
+ if param.type is None and not param.anyOf and not param.enum:
243
+ raise_not_provided_error("Unsupported type or type annotation")
244
+ elif not param.description:
245
+ raise_not_provided_error("Description not")
@@ -128,10 +128,10 @@ async def tool_container_sandbox(tool_name: str) -> SandboxEnvironment:
128
128
 
129
129
  Alternatively, you can include the service into your own Dockerfile:
130
130
 
131
- RUN python -m venv /opt/inspect_tool_support
132
- ENV PATH="/opt/inspect_tool_support/bin:$PATH"
133
- RUN pip install inspect-tool-support
134
- RUN inspect-tool-support post-install
131
+ ENV PATH="$PATH:/opt/inspect_tool_support/bin"
132
+ RUN python -m venv /opt/inspect_tool_support && \\
133
+ /opt/inspect_tool_support/bin/pip install inspect-tool-support && \\
134
+ /opt/inspect_tool_support/bin/inspect-tool-support post-install
135
135
  """).strip()
136
136
  raise PrerequisiteError(msg)
137
137
 
@@ -1,5 +1,3 @@
1
- from copy import deepcopy
2
-
3
1
  from inspect_ai._util.registry import (
4
2
  registry_info,
5
3
  registry_params,
@@ -22,10 +20,15 @@ def tool_with(
22
20
  viewer: ToolCallViewer | None = None,
23
21
  model_input: ToolCallModelInput | None = None,
24
22
  ) -> Tool:
25
- """Tool with modifications to name and descriptions.
23
+ """Tool with modifications to various attributes.
24
+
25
+ This function modifies the passed tool in place and
26
+ returns it. If you want to create multiple variations
27
+ of a single tool using `tool_with()` you should create
28
+ the underlying tool multiple times.
26
29
 
27
30
  Args:
28
- tool: Tool instance to copy and add descriptions to.
31
+ tool: Tool instance to modify.
29
32
  name: Tool name (optional).
30
33
  description: Tool description (optional).
31
34
  parameters: Parameter descriptions (optional)
@@ -36,7 +39,7 @@ def tool_with(
36
39
  tool call results are played back as model input.
37
40
 
38
41
  Returns:
39
- A copy of the passed tool with the specified descriptive information.
42
+ The passed tool with the requested modifications.
40
43
  """
41
44
  # get the existing tool info
42
45
  tool_info = parse_tool_info(tool)
@@ -54,8 +57,7 @@ def tool_with(
54
57
  param_name
55
58
  ]
56
59
 
57
- # copy the tool and set the descriptions on the new copy
58
- tool_copy = deepcopy(tool)
60
+ # resolve attributes
59
61
  info = registry_info(tool).model_copy()
60
62
  if parallel is not None:
61
63
  info.metadata[TOOL_PARALLEL] = parallel
@@ -64,12 +66,13 @@ def tool_with(
64
66
  elif model_input is not None:
65
67
  info.metadata[TOOL_MODEL_INPUT] = model_input
66
68
 
67
- set_registry_info(tool_copy, info)
68
- set_registry_params(tool_copy, registry_params(tool))
69
+ # set attributes
70
+ set_registry_info(tool, info)
71
+ set_registry_params(tool, registry_params(tool))
69
72
  set_tool_description(
70
- tool_copy,
73
+ tool,
71
74
  ToolDescription(
72
75
  name=name, description=description, parameters=tool_info.parameters
73
76
  ),
74
77
  )
75
- return tool_copy
78
+ return tool
@@ -1,4 +1,5 @@
1
1
  from pydantic import BaseModel, Field, RootModel
2
+ from shortuuid import uuid
2
3
 
3
4
  from inspect_ai.tool import ToolResult
4
5
  from inspect_ai.tool._tool_support_helpers import (
@@ -52,13 +53,21 @@ def code_viewer(language: str, code_param: str) -> ToolCallViewer:
52
53
 
53
54
 
54
55
  @tool(viewer=code_viewer("bash", "command"))
55
- def bash_session(timeout: int | None = None) -> Tool:
56
+ def bash_session(*, timeout: int | None = None, instance: str | None = uuid()) -> Tool:
56
57
  """Bash shell session command execution tool.
57
58
 
58
59
  Execute bash shell commands in a long running session using a sandbox environment (e.g. "docker").
59
60
 
61
+ By default, a separate bash process is created within the sandbox for each
62
+ call to `bash_session()`. You can modify this behavior by passing `instance=None`
63
+ (which will result in a single bash process for the entire sample) or use other
64
+ `instance` values that implement another scheme.
65
+
66
+ See complete documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-bash-session>.
67
+
60
68
  Args:
61
69
  timeout: Timeout (in seconds) for command.
70
+ instance: Instance id (each unique instance id has its own bash process)
62
71
 
63
72
  Returns:
64
73
  String with command output (stdout) or command error (stderr).
@@ -85,7 +94,7 @@ def bash_session(timeout: int | None = None) -> Tool:
85
94
  params: dict[str, object] = {"command": command, "restart": restart}
86
95
 
87
96
  sandbox = await tool_container_sandbox("bash session")
88
- store = store_as(BashSessionStore)
97
+ store = store_as(BashSessionStore, instance=instance)
89
98
 
90
99
  if not store.session_id:
91
100
  store.session_id = (
@@ -83,6 +83,22 @@ async def middle_click(coordinate: list[int], timeout: int | None = None) -> Too
83
83
  )
84
84
 
85
85
 
86
+ async def back_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
87
+ return await _send_cmd(
88
+ ["back_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
89
+ timeout=timeout,
90
+ )
91
+
92
+
93
+ async def forward_click(
94
+ coordinate: list[int], timeout: int | None = None
95
+ ) -> ToolResult:
96
+ return await _send_cmd(
97
+ ["forward_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
98
+ timeout=timeout,
99
+ )
100
+
101
+
86
102
  async def double_click(coordinate: list[int], timeout: int | None = None) -> ToolResult:
87
103
  return await _send_cmd(
88
104
  ["double_click", "--coordinate", f"{coordinate[0]}", f"{coordinate[1]}"],
@@ -182,11 +198,11 @@ async def computer_sandbox() -> SandboxEnvironment:
182
198
  else:
183
199
  raise PrerequisiteError(
184
200
  dedent("""
185
- The computer tool service was not found in any of the sandboxes for this sample. Please add the computer tool service to your configuration. For example, the following Docker compose file uses the aisiuk/inspect-computer-tool:latest image as its default sandbox:
201
+ The computer tool service was not found in any of the sandboxes for this sample. Please add the computer tool service to your configuration. For example, the following Docker compose file uses the aisiuk/inspect-computer-tool image as its default sandbox:
186
202
 
187
203
  services:
188
204
  default:
189
- image: "aisiuk/inspect-computer-tool:latest"
205
+ image: "aisiuk/inspect-computer-tool"
190
206
  init: true
191
207
  """).strip()
192
208
  )
@@ -3,7 +3,7 @@ from typing import Awaitable, Callable, Literal, TypeVar
3
3
  from inspect_ai._util.content import Content, ContentImage, ContentText
4
4
  from inspect_ai.tool import Tool, ToolResult, tool
5
5
  from inspect_ai.tool._tool import TOOL_INIT_MODEL_INPUT, ToolParsingError
6
- from inspect_ai.tool._tool_call import ToolCallModelInput
6
+ from inspect_ai.tool._tool_call import ToolCallModelInput, ToolCallModelInputHints
7
7
 
8
8
  from . import _common as common
9
9
  from ._resources.tool._constants import Action
@@ -64,6 +64,8 @@ def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool
64
64
  - Example: execute(action="left_click_drag", coordinate=(150, 250))
65
65
  - `right_click`: Click the right mouse button.
66
66
  - `middle_click`: Click the middle mouse button.
67
+ - `back_click`: Click the 'back' mouse button.
68
+ - `forward_click`: Click the 'forward' mouse button.
67
69
  - `double_click`: Double-click the left mouse button.
68
70
  - `triple_click`: Triple-click the left mouse button.
69
71
  - `wait`: Wait for a specified duration (in seconds).
@@ -117,6 +119,14 @@ def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool
117
119
  return await common.middle_click(
118
120
  not_none(coordinate, "coordinate"), timeout=timeout
119
121
  )
122
+ case "back_click":
123
+ return await common.back_click(
124
+ not_none(coordinate, "coordinate"), timeout=timeout
125
+ )
126
+ case "forward_click":
127
+ return await common.forward_click(
128
+ not_none(coordinate, "coordinate"), timeout=timeout
129
+ )
120
130
  case "double_click":
121
131
  return await common.double_click(
122
132
  not_none(coordinate, "coordinate"), timeout=timeout
@@ -150,8 +160,14 @@ def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool
150
160
 
151
161
  def _computer_model_input(max_screenshots: int) -> ToolCallModelInput:
152
162
  def model_input(
153
- message_index: int, message_total: int, content: str | list[Content]
163
+ message_index: int,
164
+ message_total: int,
165
+ content: str | list[Content],
166
+ hints: ToolCallModelInputHints,
154
167
  ) -> str | list[Content]:
168
+ if hints.get("forbids_computer_screenshot_truncation", False):
169
+ return content
170
+
155
171
  # nothing to do for scalars
156
172
  if isinstance(content, str):
157
173
  return content
@@ -12,6 +12,8 @@ Action = Literal[
12
12
  "left_click_drag",
13
13
  "right_click",
14
14
  "middle_click",
15
+ "back_click",
16
+ "forward_click",
15
17
  "double_click",
16
18
  "triple_click",
17
19
  "scroll",
@@ -153,6 +153,19 @@ class X11Client:
153
153
  ) -> ToolResult:
154
154
  return await self._mouse_move_and("middle_click", coordinate, text)
155
155
 
156
+ # https://wiki.archlinux.org/title/Mouse_buttons#Thumb_buttons_-_forward_and_back
157
+ # suggests that, although not in any spec, the de facto standard is 8 for
158
+ # back and 9 for forward.
159
+ async def back_click(
160
+ self, coordinate: tuple[int, int] | None, text: str | None
161
+ ) -> ToolResult:
162
+ return await self._mouse_move_and("back_click", coordinate, text)
163
+
164
+ async def forward_click(
165
+ self, coordinate: tuple[int, int] | None, text: str | None
166
+ ) -> ToolResult:
167
+ return await self._mouse_move_and("forward_click", coordinate, text)
168
+
156
169
  async def double_click(
157
170
  self, coordinate: tuple[int, int] | None, text: str | None
158
171
  ) -> ToolResult:
@@ -215,6 +228,8 @@ class X11Client:
215
228
  "left_click",
216
229
  "right_click",
217
230
  "middle_click",
231
+ "back_click",
232
+ "forward_click",
218
233
  "double_click",
219
234
  "triple_click",
220
235
  ],
@@ -233,6 +248,8 @@ class X11Client:
233
248
  "left_click": "1",
234
249
  "right_click": "3",
235
250
  "middle_click": "2",
251
+ "back_click": "8",
252
+ "forward_click": "9",
236
253
  "double_click": "--repeat 2 --delay 300 1",
237
254
  "triple_click": "--repeat 3 --delay 300 1",
238
255
  }[action]
@@ -22,7 +22,7 @@ def think(
22
22
  async def execute(thought: str) -> str:
23
23
  """Use the tool to think about something.
24
24
 
25
- The will not obtain new information or change the environment, but just append the thought to the log. Use it when complex reasoning or some cache memory is needed."
25
+ The tool will not obtain new information or change the environment, but just append the thought to the log. Use it when complex reasoning or some cache memory is needed.
26
26
 
27
27
  Args:
28
28
  thought: A thought to think about.