inspect-ai 0.3.81__py3-none-any.whl → 0.3.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (297)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/eval.py +35 -2
  3. inspect_ai/_cli/util.py +44 -1
  4. inspect_ai/_display/core/config.py +1 -1
  5. inspect_ai/_display/core/display.py +13 -4
  6. inspect_ai/_display/core/results.py +1 -1
  7. inspect_ai/_display/textual/app.py +14 -3
  8. inspect_ai/_display/textual/display.py +4 -0
  9. inspect_ai/_display/textual/widgets/samples.py +9 -3
  10. inspect_ai/_display/textual/widgets/task_detail.py +8 -8
  11. inspect_ai/_display/textual/widgets/tasks.py +17 -1
  12. inspect_ai/_display/textual/widgets/vscode.py +44 -0
  13. inspect_ai/_eval/eval.py +74 -25
  14. inspect_ai/_eval/evalset.py +22 -18
  15. inspect_ai/_eval/loader.py +34 -11
  16. inspect_ai/_eval/run.py +13 -15
  17. inspect_ai/_eval/score.py +13 -3
  18. inspect_ai/_eval/task/generate.py +8 -9
  19. inspect_ai/_eval/task/log.py +55 -6
  20. inspect_ai/_eval/task/run.py +51 -10
  21. inspect_ai/_eval/task/task.py +23 -9
  22. inspect_ai/_util/constants.py +2 -0
  23. inspect_ai/_util/file.py +30 -1
  24. inspect_ai/_util/json.py +37 -1
  25. inspect_ai/_util/registry.py +1 -0
  26. inspect_ai/_util/vscode.py +37 -0
  27. inspect_ai/_view/server.py +113 -1
  28. inspect_ai/_view/www/App.css +7 -1
  29. inspect_ai/_view/www/dist/assets/index.css +813 -415
  30. inspect_ai/_view/www/dist/assets/index.js +54475 -32003
  31. inspect_ai/_view/www/eslint.config.mjs +1 -1
  32. inspect_ai/_view/www/log-schema.json +137 -31
  33. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  34. inspect_ai/_view/www/package.json +11 -2
  35. inspect_ai/_view/www/src/App.tsx +161 -853
  36. inspect_ai/_view/www/src/api/api-browser.ts +176 -5
  37. inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
  38. inspect_ai/_view/www/src/api/client-api.ts +66 -10
  39. inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
  40. inspect_ai/_view/www/src/api/types.ts +107 -2
  41. inspect_ai/_view/www/src/appearance/icons.ts +2 -0
  42. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
  43. inspect_ai/_view/www/src/components/Card.tsx +6 -4
  44. inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
  45. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
  46. inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
  47. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
  48. inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
  49. inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
  50. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
  51. inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
  52. inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
  53. inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
  54. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
  55. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +116 -26
  56. inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
  57. inspect_ai/_view/www/src/components/Modal.module.css +38 -0
  58. inspect_ai/_view/www/src/components/Modal.tsx +77 -0
  59. inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
  60. inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
  61. inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
  62. inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
  63. inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
  64. inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
  65. inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
  66. inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
  67. inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
  68. inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
  69. inspect_ai/_view/www/src/index.tsx +26 -94
  70. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
  71. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
  72. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
  73. inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
  74. inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
  75. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
  76. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
  77. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
  78. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +74 -28
  79. inspect_ai/_view/www/src/samples/SampleDialog.tsx +58 -22
  80. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
  81. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +135 -104
  82. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +10 -0
  83. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +83 -36
  84. inspect_ai/_view/www/src/samples/SamplesTools.tsx +35 -30
  85. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
  86. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
  87. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
  88. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +6 -1
  89. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +5 -0
  90. inspect_ai/_view/www/src/samples/chat/messages.ts +36 -0
  91. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
  92. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +11 -1
  93. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
  94. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +34 -20
  95. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
  96. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
  97. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
  98. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -10
  99. inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
  100. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +22 -3
  101. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +27 -2
  102. inspect_ai/_view/www/src/samples/list/SampleList.tsx +122 -85
  103. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
  104. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +28 -15
  105. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
  106. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
  107. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
  108. inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
  109. inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
  110. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +12 -27
  111. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
  112. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
  113. inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
  114. inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
  115. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
  116. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
  117. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
  118. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
  119. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
  120. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
  121. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
  122. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
  123. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
  124. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
  125. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
  126. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
  127. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +33 -17
  128. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
  129. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
  130. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
  131. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
  132. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +81 -60
  133. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
  134. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
  135. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +29 -1
  136. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
  137. inspect_ai/_view/www/src/scoring/utils.ts +87 -0
  138. inspect_ai/_view/www/src/state/appSlice.ts +244 -0
  139. inspect_ai/_view/www/src/state/hooks.ts +399 -0
  140. inspect_ai/_view/www/src/state/logPolling.ts +200 -0
  141. inspect_ai/_view/www/src/state/logSlice.ts +224 -0
  142. inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
  143. inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
  144. inspect_ai/_view/www/src/state/samplePolling.ts +314 -0
  145. inspect_ai/_view/www/src/state/sampleSlice.ts +140 -0
  146. inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
  147. inspect_ai/_view/www/src/state/scrolling.ts +206 -0
  148. inspect_ai/_view/www/src/state/store.ts +168 -0
  149. inspect_ai/_view/www/src/state/store_filter.ts +84 -0
  150. inspect_ai/_view/www/src/state/utils.ts +23 -0
  151. inspect_ai/_view/www/src/storage/index.ts +26 -0
  152. inspect_ai/_view/www/src/types/log.d.ts +36 -26
  153. inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
  154. inspect_ai/_view/www/src/types.ts +94 -32
  155. inspect_ai/_view/www/src/utils/attachments.ts +58 -23
  156. inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
  157. inspect_ai/_view/www/src/utils/logger.ts +52 -0
  158. inspect_ai/_view/www/src/utils/polling.ts +100 -0
  159. inspect_ai/_view/www/src/utils/react.ts +30 -0
  160. inspect_ai/_view/www/src/utils/vscode.ts +1 -1
  161. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +184 -217
  162. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
  163. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
  164. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
  165. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
  166. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -1
  167. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +159 -103
  168. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
  169. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
  170. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
  171. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
  172. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +12 -14
  173. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
  174. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
  175. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
  176. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
  177. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
  178. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
  179. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
  180. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
  181. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +128 -115
  182. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
  183. inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
  184. inspect_ai/_view/www/src/workspace/types.ts +4 -3
  185. inspect_ai/_view/www/src/workspace/utils.ts +4 -4
  186. inspect_ai/_view/www/vite.config.js +6 -0
  187. inspect_ai/_view/www/yarn.lock +464 -355
  188. inspect_ai/agent/__init__.py +36 -0
  189. inspect_ai/agent/_agent.py +268 -0
  190. inspect_ai/agent/_as_solver.py +72 -0
  191. inspect_ai/agent/_as_tool.py +122 -0
  192. inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
  193. inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
  194. inspect_ai/agent/_filter.py +46 -0
  195. inspect_ai/agent/_handoff.py +93 -0
  196. inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
  197. inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
  198. inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
  199. inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
  200. inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
  201. inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
  202. inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
  203. inspect_ai/agent/_react.py +241 -0
  204. inspect_ai/agent/_run.py +36 -0
  205. inspect_ai/agent/_types.py +81 -0
  206. inspect_ai/log/_condense.py +26 -0
  207. inspect_ai/log/_log.py +17 -5
  208. inspect_ai/log/_recorders/buffer/__init__.py +14 -0
  209. inspect_ai/log/_recorders/buffer/buffer.py +30 -0
  210. inspect_ai/log/_recorders/buffer/database.py +685 -0
  211. inspect_ai/log/_recorders/buffer/filestore.py +259 -0
  212. inspect_ai/log/_recorders/buffer/types.py +84 -0
  213. inspect_ai/log/_recorders/eval.py +2 -11
  214. inspect_ai/log/_recorders/types.py +30 -0
  215. inspect_ai/log/_transcript.py +32 -2
  216. inspect_ai/model/__init__.py +7 -1
  217. inspect_ai/model/_call_tools.py +257 -52
  218. inspect_ai/model/_chat_message.py +7 -4
  219. inspect_ai/model/_conversation.py +13 -62
  220. inspect_ai/model/_display.py +85 -0
  221. inspect_ai/model/_generate_config.py +2 -2
  222. inspect_ai/model/_model.py +114 -14
  223. inspect_ai/model/_model_output.py +14 -9
  224. inspect_ai/model/_openai.py +16 -4
  225. inspect_ai/model/_openai_computer_use.py +162 -0
  226. inspect_ai/model/_openai_responses.py +319 -165
  227. inspect_ai/model/_providers/anthropic.py +20 -21
  228. inspect_ai/model/_providers/azureai.py +24 -13
  229. inspect_ai/model/_providers/bedrock.py +1 -7
  230. inspect_ai/model/_providers/cloudflare.py +3 -3
  231. inspect_ai/model/_providers/goodfire.py +2 -6
  232. inspect_ai/model/_providers/google.py +11 -10
  233. inspect_ai/model/_providers/groq.py +6 -3
  234. inspect_ai/model/_providers/hf.py +7 -3
  235. inspect_ai/model/_providers/mistral.py +7 -10
  236. inspect_ai/model/_providers/openai.py +47 -17
  237. inspect_ai/model/_providers/openai_o1.py +11 -4
  238. inspect_ai/model/_providers/openai_responses.py +12 -14
  239. inspect_ai/model/_providers/providers.py +2 -2
  240. inspect_ai/model/_providers/together.py +12 -2
  241. inspect_ai/model/_providers/util/chatapi.py +7 -2
  242. inspect_ai/model/_providers/util/hf_handler.py +4 -2
  243. inspect_ai/model/_providers/util/llama31.py +4 -2
  244. inspect_ai/model/_providers/vertex.py +11 -9
  245. inspect_ai/model/_providers/vllm.py +4 -4
  246. inspect_ai/scorer/__init__.py +2 -0
  247. inspect_ai/scorer/_metrics/__init__.py +2 -0
  248. inspect_ai/scorer/_metrics/grouped.py +84 -0
  249. inspect_ai/scorer/_score.py +26 -6
  250. inspect_ai/solver/__init__.py +2 -2
  251. inspect_ai/solver/_basic_agent.py +22 -9
  252. inspect_ai/solver/_bridge.py +31 -0
  253. inspect_ai/solver/_chain.py +20 -12
  254. inspect_ai/solver/_fork.py +5 -1
  255. inspect_ai/solver/_human_agent.py +52 -0
  256. inspect_ai/solver/_prompt.py +3 -1
  257. inspect_ai/solver/_run.py +59 -0
  258. inspect_ai/solver/_solver.py +14 -4
  259. inspect_ai/solver/_task_state.py +5 -3
  260. inspect_ai/tool/_tool_call.py +15 -8
  261. inspect_ai/tool/_tool_def.py +17 -12
  262. inspect_ai/tool/_tool_support_helpers.py +4 -4
  263. inspect_ai/tool/_tool_with.py +14 -11
  264. inspect_ai/tool/_tools/_bash_session.py +11 -2
  265. inspect_ai/tool/_tools/_computer/_common.py +18 -2
  266. inspect_ai/tool/_tools/_computer/_computer.py +18 -2
  267. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
  268. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
  269. inspect_ai/tool/_tools/_think.py +1 -1
  270. inspect_ai/tool/_tools/_web_browser/_web_browser.py +103 -62
  271. inspect_ai/util/__init__.py +2 -0
  272. inspect_ai/util/_anyio.py +27 -0
  273. inspect_ai/util/_sandbox/__init__.py +2 -1
  274. inspect_ai/util/_sandbox/context.py +32 -7
  275. inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
  276. inspect_ai/util/_sandbox/docker/compose.py +2 -2
  277. inspect_ai/util/_sandbox/docker/docker.py +12 -1
  278. inspect_ai/util/_store_model.py +30 -7
  279. inspect_ai/util/_subprocess.py +13 -3
  280. inspect_ai/util/_subtask.py +1 -0
  281. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
  282. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +295 -229
  283. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -169
  284. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
  285. /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
  286. /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
  287. /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
  288. /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
  289. /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
  290. /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
  291. /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
  292. /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
  293. /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
  294. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
  295. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
  296. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
  297. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
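The headline change in this release is the new inspect_ai/agent package (items 188-205 above): agents are first-class objects that can run standalone, hand off to one another, be exposed as tools, or be used anywhere a solver is accepted. Below is a minimal sketch of the implied API; the exact AgentState shape and @agent decorator semantics are assumptions based on the file list and the eval.py/loader.py diffs that follow.

    from inspect_ai.agent import Agent, AgentState, agent
    from inspect_ai.model import ChatMessageAssistant

    @agent
    def toy_agent() -> Agent:
        async def execute(state: AgentState) -> AgentState:
            # append a canned assistant turn (illustrative only)
            state.messages.append(ChatMessageAssistant(content="Hello!"))
            return state

        return execute

As the eval.py diff below shows, toy_agent() can then be passed directly as the solver for eval() or eval_set().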
inspect_ai/_eval/eval.py CHANGED
@@ -2,9 +2,11 @@ import logging
 import os
 import sys
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, cast
 
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+from inspect_ai.agent._agent import Agent, is_agent
+from inspect_ai.agent._as_solver import as_solver
 
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup
@@ -15,7 +17,11 @@ from typing_extensions import Unpack
 from inspect_ai._cli.util import parse_cli_args
 from inspect_ai._display.core.active import display as task_display
 from inspect_ai._util.config import resolve_args
-from inspect_ai._util.constants import DEFAULT_LOG_FORMAT
+from inspect_ai._util.constants import (
+    DEFAULT_LOG_FORMAT,
+    DEFAULT_LOG_SHARED,
+    JSON_LOG_FORMAT,
+)
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import absolute_file_path
 from inspect_ai._util.logger import warn_once
@@ -31,6 +37,7 @@ from inspect_ai.approval._policy import (
 from inspect_ai.log import EvalConfig, EvalLog, EvalLogInfo
 from inspect_ai.log._file import read_eval_log_async
 from inspect_ai.log._recorders import create_recorder_for_format
+from inspect_ai.log._recorders.buffer import cleanup_sample_buffers
 from inspect_ai.model import (
     GenerateConfig,
     GenerateConfigArgs,
@@ -66,7 +73,7 @@ def eval(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] | SolverSpec | None = None,
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
@@ -92,6 +99,7 @@ def eval(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -161,6 +169,9 @@ def eval(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+           can see log updates in realtime (defaults to no syncing). Specify `True`
+           to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
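A usage sketch for the new log_shared option, per the docstring above (the task and model names are hypothetical):

    from inspect_ai import eval

    # sync realtime sample events to the shared log directory every 30 seconds;
    # log_shared=True would use the 10 second default (DEFAULT_LOG_SHARED)
    eval("theory_of_mind", model="openai/gpt-4o", log_shared=30)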
@@ -210,6 +221,7 @@ def eval(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         **kwargs,
@@ -236,7 +248,7 @@ async def eval_async(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] | SolverSpec | None = None,
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
@@ -260,6 +272,7 @@ async def eval_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -312,6 +325,7 @@ async def eval_async(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in additional syncing of realtime log data for Inspect View.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         **kwargs: Model generation options.
@@ -341,13 +355,10 @@ async def eval_async(
 
     try:
         # intialise eval
-        model, approval, resolved_tasks = eval_init(
-            tasks=tasks,
+        model, approval = eval_init(
             model=model,
             model_base_url=model_base_url,
             model_args=model_args,
-            task_args=task_args,
-            sandbox=sandbox,
             approval=approval,
             max_subprocesses=max_subprocesses,
             log_level=log_level,
@@ -355,6 +366,11 @@ async def eval_async(
             **kwargs,
         )
 
+        # resolve tasks
+        resolved_tasks = eval_resolve_tasks(
+            tasks, task_args, model, GenerateConfig(**kwargs), sandbox
+        )
+
         # warn and return empty string if we resolved no tasks
         if len(resolved_tasks) == 0:
             log.warning("No inspect tasks were found at the specified paths.")
@@ -390,8 +406,22 @@ async def eval_async(
                 f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
             )
 
+        # resolve log_shared
+        log_shared = DEFAULT_LOG_SHARED if log_shared is True else log_shared
+
+        # validate that --log-shared can't use used with 'json' format
+        if log_shared and log_format == JSON_LOG_FORMAT:
+            raise PrerequisiteError(
+                "ERROR: --log-shared is not compatible with the json log format."
+            )
+
         # resolve solver
-        solver = chain(solver) if isinstance(solver, list) else solver
+        if isinstance(solver, list):
+            solver = chain(solver)
+        elif is_agent(solver):
+            solver = as_solver(solver)
+        else:
+            solver = cast(Solver | SolverSpec | None, solver)
 
         # ensure consistency of limit and sample_id
         if sample_id is not None and limit is not None:
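The solver resolution above is where the new Agent union member takes effect: lists are combined with chain() and agents are wrapped with as_solver(). A sketch of what this accepts, assuming the usual Task/Sample/solver exports and reusing the toy_agent sketched after the file list above (the model name is illustrative):

    from inspect_ai import Task, eval
    from inspect_ai.dataset import Sample
    from inspect_ai.solver import generate, system_message

    task = Task(dataset=[Sample(input="Say hello.", target="hello")])

    # list[Solver]: combined via chain()
    eval(task, model="openai/gpt-4o", solver=[system_message("Be terse."), generate()])

    # Agent: wrapped via as_solver() per the branch above
    eval(task, model="openai/gpt-4o", solver=toy_agent())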
@@ -426,6 +456,7 @@ async def eval_async(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score_display=score_display,
     )
 
@@ -485,6 +516,9 @@ async def eval_async(
         )
         logs = EvalLogs(results)
 
+        # cleanup sample buffers if required
+        cleanup_sample_buffers(log_dir)
+
     finally:
         _eval_async_running = False
 
@@ -510,6 +544,7 @@ def eval_retry(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -551,6 +586,9 @@ def eval_retry(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+           can see log updates in realtime (defaults to no syncing). Specify `True`
+           to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         score: Score output (defaults to True)
         score_display: Show scoring metrics in realtime (defaults to True)
         max_retries:
@@ -586,6 +624,7 @@ def eval_retry(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         max_retries=max_retries,
@@ -612,6 +651,7 @@ async def eval_retry_async(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     score: bool = True,
     score_display: bool | None = None,
     max_retries: int | None = None,
@@ -651,6 +691,8 @@ async def eval_retry_async(
         log_buffer: (int | None): Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Indicate that the log directory is shared, which results in
+           additional syncing of realtime log data for Inspect View.
         score (bool): Score output (defaults to True)
         score_display (bool | None): Show scoring metrics in realtime (defaults to True)
         max_retries (int | None):
@@ -691,7 +733,7 @@ async def eval_retry_async(
     # context to reconstruct ephemeral Task instances)
     task: str | None
     task_id = eval_log.eval.task_id
-    task_name = eval_log.eval.task
+    task_name = eval_log.eval.task_registry_name or eval_log.eval.task
     task_file = eval_log.eval.task_file
     if task_file:
         if not Path(task_file).exists():
@@ -750,6 +792,9 @@ async def eval_retry_async(
     log_buffer = (
         log_buffer if log_buffer is not None else eval_log.eval.config.log_buffer
     )
+    log_shared = (
+        log_shared if log_shared is not None else eval_log.eval.config.log_shared
+    )
     score_display = (
         score_display
         if score_display is not None
@@ -796,6 +841,7 @@ async def eval_retry_async(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         score_display=score_display,
         **dict(config),
@@ -809,24 +855,20 @@ async def eval_retry_async(
 
 
 def eval_init(
-    tasks: Tasks,
     model: str | Model | list[str] | list[Model] | None | NotGiven = NOT_GIVEN,
     model_base_url: str | None = None,
     model_args: dict[str, Any] | str = dict(),
-    task_args: dict[str, Any] | str = dict(),
-    sandbox: SandboxEnvironmentType | None = None,
     approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None,
     max_subprocesses: int | None = None,
     log_level: str | None = None,
     log_level_transcript: str | None = None,
     **kwargs: Unpack[GenerateConfigArgs],
-) -> tuple[list[Model], list[ApprovalPolicy] | None, list[ResolvedTask]]:
+) -> tuple[list[Model], list[ApprovalPolicy] | None]:
    # init eval context
    init_eval_context(log_level, log_level_transcript, max_subprocesses)

    # resolve model and task args
    model_args = resolve_args(model_args)
-    task_args = resolve_args(task_args)
 
     # resolve model args from environment if not specified
     if len(model_args) == 0:
@@ -839,21 +881,28 @@ def eval_init(
     generate_config = GenerateConfig(**kwargs)
     models = resolve_models(model, model_base_url, model_args, generate_config)
 
-    # resolve tasks (set active model to resolve uses of the
-    # 'default' model in tools, solvers, and scorers)
-
-    with task_display().suspend_task_app():
-        resolved_tasks: list[ResolvedTask] = []
-        for m in models:
-            init_active_model(m, generate_config)
-            resolved_tasks.extend(resolve_tasks(tasks, task_args, m, sandbox))
-
     # resolve approval
     if isinstance(approval, str | ApprovalPolicyConfig):
         approval = approval_policies_from_config(approval)
     init_tool_approval(approval)
 
-    return models, approval, resolved_tasks
+    return models, approval
+
+
+def eval_resolve_tasks(
+    tasks: Tasks,
+    task_args: dict[str, Any] | str,
+    models: list[Model],
+    config: GenerateConfig,
+    sandbox: SandboxEnvironmentType | None,
+) -> list[ResolvedTask]:
+    task_args = resolve_args(task_args)
+    with task_display().suspend_task_app():
+        resolved_tasks: list[ResolvedTask] = []
+        for m in models:
+            init_active_model(m, config)
+            resolved_tasks.extend(resolve_tasks(tasks, task_args, m, sandbox))
+    return resolved_tasks
 
 
 def init_eval_display(
inspect_ai/_eval/evalset.py CHANGED
@@ -1,6 +1,5 @@
 import hashlib
 import logging
-from copy import deepcopy
 from typing import Any, Literal, NamedTuple, Set, cast
 
 import rich
@@ -18,6 +17,7 @@ from typing_extensions import Unpack
 from inspect_ai._util.error import PrerequisiteError
 from inspect_ai._util.file import basename, filesystem
 from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
+from inspect_ai.agent._agent import Agent
 from inspect_ai.approval._policy import ApprovalPolicy
 from inspect_ai.log import EvalLog
 from inspect_ai.log._bundle import bundle_log_dir
@@ -37,7 +37,7 @@ from inspect_ai.solver._solver import Solver, SolverSpec
 from inspect_ai.util import DisplayType, SandboxEnvironmentType
 from inspect_ai.util._display import display_type_initialized, init_display_type
 
-from .eval import eval, eval_init
+from .eval import eval, eval_init, eval_resolve_tasks
 from .loader import resolve_task_args
 from .task import Epochs
 from .task.resolved import ResolvedTask
@@ -66,7 +66,7 @@ def eval_set(
     task_args: dict[str, Any] | str = dict(),
     sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
-    solver: Solver | list[Solver] | SolverSpec | None = None,
+    solver: Solver | SolverSpec | Agent | list[Solver] | None = None,
     tags: list[str] | None = None,
     metadata: dict[str, Any] | None = None,
     trace: bool | None = None,
@@ -92,6 +92,7 @@ def eval_set(
     log_samples: bool | None = None,
     log_images: bool | None = None,
     log_buffer: int | None = None,
+    log_shared: bool | int | None = None,
     bundle_dir: str | None = None,
     bundle_overwrite: bool = False,
     **kwargs: Unpack[GenerateConfigArgs],
@@ -171,6 +172,9 @@ def eval_set(
         log_buffer: Number of samples to buffer before writing log file.
            If not specified, an appropriate default for the format and filesystem is
            chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
+        log_shared: Sync sample events to log directory so that users on other systems
+           can see log updates in realtime (defaults to no syncing). Specify `True`
+           to sync every 10 seconds, otherwise an integer to sync every `n` seconds.
         bundle_dir: If specified, the log viewer and logs generated
            by this eval set will be bundled into this directory.
         bundle_overwrite: Whether to overwrite files in the bundle_dir.
@@ -219,6 +223,7 @@ def eval_set(
         log_samples=log_samples,
         log_images=log_images,
         log_buffer=log_buffer,
+        log_shared=log_shared,
         score=score,
         **kwargs,
     )
@@ -242,29 +247,21 @@ def eval_set(
     if display == "conversation":
         raise RuntimeError("eval_set cannot be used with conversation display.")
 
-    # resolve tasks
-    models, _, resolved_tasks = eval_init(
-        tasks=tasks,
+    # initialize eval
+    models, _ = eval_init(
         model=model,
         model_base_url=model_base_url,
         model_args=model_args,
-        task_args=task_args,
-        sandbox=sandbox,
         max_subprocesses=max_subprocesses,
         log_level=log_level,
         log_level_transcript=log_level_transcript,
         **kwargs,
     )
 
-    # ensure log_dir and list all logs
+    # ensure log_dir
     fs = filesystem(log_dir)
     fs.mkdir(log_dir, exist_ok=True)
 
-    # validate that:
-    # (1) All tasks have a unique identifier
-    # (2) All logs have identifiers that map to tasks
-    validate_eval_set_prerequisites(resolved_tasks, list_all_eval_logs(log_dir))
-
     # resolve some parameters
     retry_connections = retry_connections or 0.5
     retry_cleanup = retry_cleanup is not False
@@ -305,11 +302,21 @@ def eval_set(
     # - tasks with a successful log (they'll just be returned)
     # - tasks with failed logs (they'll be retried)
     def try_eval() -> list[EvalLog]:
+        # resolve tasks
+        resolved_tasks = eval_resolve_tasks(
+            tasks, task_args, models, GenerateConfig(**kwargs), sandbox
+        )
+
         # list all logs currently in the log directory (update manifest if there are some)
         all_logs = list_all_eval_logs(log_dir)
         if len(all_logs) > 0:
             write_log_dir_manifest(log_dir)
 
+        # validate that:
+        # (1) All tasks have a unique identifier
+        # (2) All logs have identifiers that map to tasks
+        validate_eval_set_prerequisites(resolved_tasks, all_logs)
+
         # see which tasks are yet to run (to complete successfully we need
         # a successful eval for every [task_file/]task_name/model combination)
         # for those that haven't run, schedule them into models => tasks groups
@@ -414,13 +421,10 @@ def as_previous_tasks(
     # want to bring this back but we'd need to resolve the
     # directory issues.
 
-    # deepcopy so the same instance is not run twice
-    prev_task = deepcopy(task.task)
-
     previous_tasks.append(
         PreviousTask(
             id=log.header.eval.task_id,
-            task=prev_task,
+            task=task.task,
             task_args=resolve_task_args(task.task),
             model=task.model,
             log=read_eval_log(log.info),
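Two related changes are visible above: try_eval() now re-resolves tasks via eval_resolve_tasks() on every retry attempt, so each attempt gets fresh Task instances and as_previous_tasks() no longer needs the deepcopy guard. A usage sketch (paths, model, and log_dir are hypothetical):

    from inspect_ai import eval_set

    success, logs = eval_set(
        tasks=["security_guide.py", "theory_of_mind.py"],
        model="openai/gpt-4o",
        log_dir="logs/run-1",
        log_shared=True,  # new in this release
    )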
inspect_ai/_eval/loader.py CHANGED
@@ -26,6 +26,8 @@ from inspect_ai._util.registry import (
     registry_lookup,
     registry_params,
 )
+from inspect_ai.agent._agent import Agent
+from inspect_ai.agent._as_solver import as_solver
 from inspect_ai.model import Model
 from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
 from inspect_ai.solver._bridge import bridge
@@ -421,20 +423,32 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
     if solver_file is None:
         if solver_name is None:
             raise ValueError(f"Unable to resolve solver name from {spec.solver}")
-        return cast(Solver, registry_create("solver", solver_name, **spec.args))
+        elif registry_lookup("solver", solver_name) is not None:
+            return cast(Solver, registry_create("solver", solver_name, **spec.args))
+        elif registry_lookup("agent", solver_name) is not None:
+            agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+            return as_solver(agent)
+        else:
+            raise ValueError(
+                f"Unkonwn solver {solver_name} (not registered as a @solver or @agent)"
+            )
 
     # we do have a solver file
     else:
         # load the module and parse decorators
         solver_module = load_module(solver_file)
-        decorators = parse_decorators(solver_file, "solver")
+        solver_decorators = parse_decorators(solver_file, "solver")
+        agent_decorators = parse_decorators(solver_file, "agent")
 
         # if there is no solver_name see if we can discover it
         if solver_name is None:
-            if len(decorators) == 1:
+            if len(solver_decorators) == 1:
                 # decorator based solver
-                solver_name = decorators[0][0]
-            elif len(decorators) == 0:
+                solver_name = solver_decorators[0][0]
+            elif len(agent_decorators) == 1:
+                # decorator based agent
+                solver_name = agent_decorators[0][0]
+            elif len(solver_decorators) == 0 and len(agent_decorators) == 0:
                 # see if we can find an agent based solver
                 functions = [
                     function
@@ -454,26 +468,35 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
 
                 elif len(agent_functions) == 0:
                     raise PrerequisiteError(
-                        f"The source file {pretty_solver_file} does not contain any @solver functions or agent functions."
+                        f"The source file {pretty_solver_file} does not contain any @solver, @agent or bridged agent functions."
                     )
                 else:
                     raise PrerequisiteError(
-                        f"The source file {pretty_solver_file} has more than one agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
+                        f"The source file {pretty_solver_file} has more than one bridged agent function (qualify which agent using e.g. '{solver_file.name}@agent_fn')"
                    )
-            else:
+            elif len(solver_decorators) > 1:
                 raise PrerequisiteError(
                     f"The source file {pretty_solver_file} has more than one @solver function (qualify which solver using e.g. '{solver_file.name}y@solver_fn')"
                 )
+            else:
+                raise PrerequisiteError(
+                    f"The source file {pretty_solver_file} has more than one @agent function (qualify which agent using e.g. '{solver_file.name}y@agent_fn')"
+                )
 
         # create decorator based solvers using the registry
-        if any(solver[0] == solver_name for solver in decorators):
+        if any(solver[0] == solver_name for solver in solver_decorators):
             return cast(Solver, registry_create("solver", solver_name, **spec.args))
 
-        # create agent based solvers by calling the function and wrapping it in bridge()
+        # create decorator based agents using the registry
+        elif any(agent[0] == solver_name for agent in agent_decorators):
+            agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
+            return as_solver(agent)
+
+        # create bridge based solvers by calling the function and wrapping it in bridge()
        else:
            agent_fn = getattr(solver_module, solver_name, None)
            if inspect.isfunction(agent_fn):
-                return bridge.bridge(agent_fn(**spec.args))
+                return bridge(agent_fn(**spec.args))
            elif agent_fn is not None:
                raise PrerequisiteError(
                    f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
inspect_ai/_eval/run.py CHANGED
@@ -1,4 +1,3 @@
-import functools
 import logging
 import os
 import sys
@@ -20,7 +19,6 @@ from inspect_ai._display.core.active import (
     init_task_screen,
 )
 from inspect_ai._display.core.display import TaskSpec
-from inspect_ai._util._async import tg_collect
 from inspect_ai._util.error import PrerequisiteError, exception_message
 from inspect_ai._util.path import chdir
 from inspect_ai._util.registry import registry_unqualified_name
@@ -195,6 +193,7 @@ async def eval_run(
         task_name=task.name,
         task_version=task.version,
         task_file=resolved_task.task_file,
+        task_registry_name=resolved_task.task.registry_name,
         task_id=resolved_task.id if resolved_task.id else uuid(),
         run_id=run_id,
         solver=eval_solver_spec,
@@ -359,17 +358,13 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
                     "Run Task",
                     f"task: {task_options.task.name} ({task_options.model})",
                 ):
-                    tg_results = await tg_collect(
-                        [functools.partial(task_run, task_options)]
-                    )
-                    # check for empty results list (indicates cancellation)
-                    if len(tg_results) == 0:
-                        # task was cancelled, break out of the worker loop
-                        result = None
-
-                    else:
-                        result = tg_results[0]
-                        results.append(result)
+                    async with anyio.create_task_group() as tg:
+
+                        async def run_task() -> None:
+                            result = await task_run(task_options)
+                            results.append(result)
+
+                        tg.start_soon(run_task)
 
             except Exception as ex:
                 # errors generally don't escape from tasks (the exception being if an error
@@ -407,12 +402,15 @@ async def run_multiple(tasks: list[TaskRunOptions], parallel: int) -> list[EvalL
             # Use anyio task group instead of manual task management
             try:
                 async with anyio.create_task_group() as tg:
+                    # computer number of workers (never more than total_tasks)
+                    num_workers = min(parallel, total_tasks)
+
                     # start worker tasks
-                    for _ in range(parallel):
+                    for _ in range(num_workers):
                         tg.start_soon(worker)
 
                     # enqueue initial set of tasks
-                    for _ in range(min(parallel, total_tasks)):
+                    for _ in range(num_workers):
                         await enque_next_task()
             except anyio.get_cancelled_exc_class():
                 pass
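The run_multiple() change above replaces tg_collect() (which signalled cancellation via an empty results list) with a plain anyio task group: the result is collected through a closure and cancellation simply propagates out of the group. A self-contained sketch of the pattern:

    import anyio

    async def main() -> None:
        results: list[int] = []

        async def work() -> None:
            results.append(42)

        async with anyio.create_task_group() as tg:
            tg.start_soon(work)

        # the group only exits once work() has completed (or was cancelled)
        print(results)  # [42]

    anyio.run(main)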
inspect_ai/_eval/score.py CHANGED
@@ -7,8 +7,8 @@ import anyio
 
 from inspect_ai._display import display
 from inspect_ai._eval.loader import scorer_from_spec
-from inspect_ai._util._async import tg_collect
-from inspect_ai._util.platform import platform_init
+from inspect_ai._util._async import configured_async_backend, run_coroutine, tg_collect
+from inspect_ai._util.platform import platform_init, running_in_notebook
 from inspect_ai._util.registry import registry_create, registry_unqualified_name
 from inspect_ai.log import (
     EvalLog,
@@ -56,7 +56,17 @@ def score(
     # resolve scorers into a list
     scorers = [scorers] if isinstance(scorers, Scorer) else scorers
 
-    return anyio.run(score_async, log, scorers, epochs_reducer, action)
+    if running_in_notebook():
+        return run_coroutine(score_async(log, scorers, epochs_reducer, action))
+    else:
+        return anyio.run(
+            score_async,
+            log,
+            scorers,
+            epochs_reducer,
+            action,
+            backend=configured_async_backend(),
+        )
 
 
 async def score_async(
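With this change score() also works inside notebooks, where an event loop is already running and anyio.run() would fail; run_coroutine() drives the coroutine on the existing loop instead. A usage sketch (the log path is hypothetical):

    from inspect_ai import score
    from inspect_ai.log import read_eval_log
    from inspect_ai.scorer import match

    log = read_eval_log("logs/2025-03-30T12-00-00_my-task_abc123.eval")
    scored = score(log, scorers=match())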
inspect_ai/_eval/task/generate.py CHANGED
@@ -1,12 +1,8 @@
 from typing import Literal
 
-from inspect_ai.model import (
-    CachePolicy,
-    GenerateConfig,
-    Model,
-    call_tools,
-)
+from inspect_ai.model import CachePolicy, GenerateConfig, Model
 from inspect_ai.model._cache import epoch
+from inspect_ai.model._call_tools import execute_tools
 from inspect_ai.solver import TaskState
 from inspect_ai.solver._limit import SampleLimitExceededError
 from inspect_ai.tool import ToolFunction
@@ -48,10 +44,13 @@ async def task_generate(
 
     # resolve tool calls if necessary
     if tool_calls != "none" and message.tool_calls:
-        # call tools and append messages to state
-        state.messages.extend(
-            await call_tools(message, state.tools, config.max_tool_output)
+        # call tools and update messages and output
+        messages, output = await execute_tools(
+            state.messages, state.tools, config.max_tool_output
         )
+        state.messages.extend(messages)
+        if output is not None:
+            state.output = output
 
     # check for completed or only executing a single tool call
     if state.completed or tool_calls == "single":
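Note the shape of the new call above: execute_tools() receives the full message history (call_tools() took just the final assistant message) and returns the new messages plus an optional model output, which is applied only when present so that ordinary tool calls leave state.output untouched. The optional output presumably exists to let mechanisms such as agent handoffs (added in this release) substitute their own output; that reading is an inference from the new agent package, not something this hunk states.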