inspect-ai 0.3.90__py3-none-any.whl → 0.3.92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (370)
  1. inspect_ai/_cli/common.py +13 -0
  2. inspect_ai/_cli/eval.py +44 -0
  3. inspect_ai/_display/textual/widgets/samples.py +49 -4
  4. inspect_ai/_display/textual/widgets/vscode.py +4 -2
  5. inspect_ai/_eval/eval.py +41 -28
  6. inspect_ai/_eval/evalset.py +4 -0
  7. inspect_ai/_eval/loader.py +4 -5
  8. inspect_ai/_eval/registry.py +1 -1
  9. inspect_ai/_eval/run.py +6 -3
  10. inspect_ai/_eval/task/log.py +6 -0
  11. inspect_ai/_eval/task/run.py +108 -41
  12. inspect_ai/_eval/task/sandbox.py +19 -5
  13. inspect_ai/_util/_async.py +1 -1
  14. inspect_ai/_util/constants.py +1 -0
  15. inspect_ai/_util/environ.py +32 -0
  16. inspect_ai/_util/file.py +8 -1
  17. inspect_ai/_util/httpx.py +105 -22
  18. inspect_ai/_util/registry.py +83 -9
  19. inspect_ai/_util/text.py +81 -17
  20. inspect_ai/_util/transcript.py +9 -6
  21. inspect_ai/_util/vscode.py +7 -2
  22. inspect_ai/_view/schema.py +1 -1
  23. inspect_ai/_view/www/babel.config.js +11 -0
  24. inspect_ai/_view/www/dist/assets/index.css +3640 -3563
  25. inspect_ai/_view/www/dist/assets/index.js +59204 -52519
  26. inspect_ai/_view/www/eslint.config.mjs +10 -1
  27. inspect_ai/_view/www/jest.config.mjs +21 -0
  28. inspect_ai/_view/www/log-schema.json +111 -2
  29. inspect_ai/_view/www/package.json +19 -5
  30. inspect_ai/_view/www/src/{types → @types}/log.d.ts +95 -32
  31. inspect_ai/_view/www/{App.css → src/app/App.css} +22 -14
  32. inspect_ai/_view/www/src/app/App.tsx +168 -0
  33. inspect_ai/_view/www/src/{AppErrorBoundary.tsx → app/AppErrorBoundary.tsx} +1 -1
  34. inspect_ai/_view/www/src/{appearance → app/appearance}/icons.ts +1 -0
  35. inspect_ai/_view/www/src/{metadata → app/content}/RenderedContent.tsx +5 -5
  36. inspect_ai/_view/www/src/{workspace/WorkSpaceView.tsx → app/log-view/LogView.tsx} +59 -40
  37. inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +159 -0
  38. inspect_ai/_view/www/src/app/log-view/LogViewLayout.tsx +109 -0
  39. inspect_ai/_view/www/src/{workspace → app/log-view}/error/TaskErrorPanel.tsx +3 -3
  40. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ModelRolesView.tsx +1 -1
  41. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/Navbar.tsx +4 -4
  42. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/PrimaryBar.tsx +8 -8
  43. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ResultsPanel.tsx +6 -6
  44. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/RunningStatusPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ScoreGrid.tsx +1 -1
  46. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/SecondaryBar.tsx +8 -8
  47. inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/InfoTab.tsx +35 -6
  48. inspect_ai/_view/www/src/app/log-view/tabs/JsonTab.tsx +136 -0
  49. inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/SamplesTab.tsx +82 -73
  50. inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/grouping.ts +3 -3
  51. inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/types.ts +1 -1
  52. inspect_ai/_view/www/src/{plan → app/plan}/DatasetDetailView.tsx +2 -2
  53. inspect_ai/_view/www/src/{plan → app/plan}/DetailStep.tsx +1 -1
  54. inspect_ai/_view/www/src/{plan → app/plan}/ModelCard.tsx +4 -4
  55. inspect_ai/_view/www/src/{plan → app/plan}/PlanCard.tsx +2 -2
  56. inspect_ai/_view/www/src/{plan → app/plan}/PlanDetailView.tsx +5 -5
  57. inspect_ai/_view/www/src/{plan → app/plan}/SolverDetailView.tsx +1 -1
  58. inspect_ai/_view/www/src/app/routing/AppRouter.tsx +58 -0
  59. inspect_ai/_view/www/src/app/routing/navigationHooks.ts +182 -0
  60. inspect_ai/_view/www/src/app/routing/url.ts +43 -0
  61. inspect_ai/_view/www/src/{samples → app/samples}/InlineSampleDisplay.tsx +11 -27
  62. inspect_ai/_view/www/src/{samples → app/samples}/SampleDialog.tsx +36 -40
  63. inspect_ai/_view/www/src/{samples → app/samples}/SampleDisplay.module.css +4 -0
  64. inspect_ai/_view/www/src/{samples → app/samples}/SampleDisplay.tsx +116 -49
  65. inspect_ai/_view/www/src/{samples → app/samples}/SampleSummaryView.module.css +1 -1
  66. inspect_ai/_view/www/src/{samples → app/samples}/SampleSummaryView.tsx +29 -26
  67. inspect_ai/_view/www/src/{samples → app/samples}/SamplesTools.tsx +3 -3
  68. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessage.module.css +5 -2
  69. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessage.tsx +12 -4
  70. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRenderer.tsx +3 -3
  71. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRow.tsx +6 -1
  72. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatView.tsx +4 -2
  73. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatViewVirtualList.tsx +5 -3
  74. inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +12 -0
  75. inspect_ai/_view/www/src/{samples → app/samples}/chat/MessageContent.tsx +11 -10
  76. inspect_ai/_view/www/src/app/samples/chat/MessageContents.module.css +7 -0
  77. inspect_ai/_view/www/src/{samples → app/samples}/chat/MessageContents.tsx +14 -8
  78. inspect_ai/_view/www/src/{samples → app/samples}/chat/messages.ts +2 -2
  79. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.module.css +7 -0
  80. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolCallView.tsx +26 -27
  81. inspect_ai/_view/www/src/app/samples/chat/tools/ToolInput.module.css +19 -0
  82. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolInput.tsx +3 -3
  83. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolOutput.module.css +1 -0
  84. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolOutput.tsx +1 -1
  85. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolTitle.module.css +4 -0
  86. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolTitle.tsx +2 -2
  87. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/tool.ts +1 -1
  88. inspect_ai/_view/www/src/app/samples/chat/types.ts +1 -0
  89. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/samplesDescriptor.tsx +38 -15
  90. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
  91. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/CategoricalScoreDescriptor.tsx +2 -2
  92. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/NumericScoreDescriptor.tsx +3 -3
  93. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ObjectScoreDescriptor.tsx +4 -4
  94. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/OtherScoreDescriptor.tsx +2 -2
  95. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/PassFailScoreDescriptor.tsx +2 -2
  96. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ScoreDescriptor.tsx +1 -1
  97. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/types.ts +4 -3
  98. inspect_ai/_view/www/src/{samples → app/samples}/error/SampleErrorView.module.css +2 -1
  99. inspect_ai/_view/www/src/{samples → app/samples}/list/SampleHeader.tsx +3 -0
  100. inspect_ai/_view/www/src/{samples → app/samples}/list/SampleList.tsx +47 -33
  101. inspect_ai/_view/www/src/{samples → app/samples}/list/SampleRow.module.css +16 -0
  102. inspect_ai/_view/www/src/{samples → app/samples}/list/SampleRow.tsx +47 -20
  103. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SelectScorer.tsx +1 -1
  104. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SortFilter.tsx +4 -4
  105. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/filters.ts +8 -6
  106. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/SampleFilter.tsx +4 -3
  107. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/completions.ts +1 -1
  108. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/language.ts +1 -0
  109. inspect_ai/_view/www/src/{samples → app/samples}/sampleDataAdapter.ts +3 -3
  110. inspect_ai/_view/www/src/{samples → app/samples}/sampleLimit.ts +1 -1
  111. inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScores.tsx +1 -1
  112. inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresGrid.tsx +12 -11
  113. inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresView.tsx +6 -6
  114. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ApprovalEventView.tsx +1 -1
  115. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ErrorEventView.tsx +3 -3
  116. inspect_ai/_view/www/src/{samples → app/samples}/transcript/InfoEventView.tsx +4 -4
  117. inspect_ai/_view/www/src/{samples → app/samples}/transcript/InputEventView.tsx +3 -3
  118. inspect_ai/_view/www/src/{samples → app/samples}/transcript/LoggerEventView.tsx +3 -3
  119. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ModelEventView.module.css +13 -7
  120. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ModelEventView.tsx +49 -21
  121. inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleInitEventView.tsx +11 -9
  122. inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleLimitEventView.tsx +1 -1
  123. inspect_ai/_view/www/src/{samples → app/samples}/transcript/SandboxEventView.tsx +8 -6
  124. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ScoreEventView.tsx +4 -4
  125. inspect_ai/_view/www/src/{samples → app/samples}/transcript/StepEventView.tsx +11 -3
  126. inspect_ai/_view/www/src/{samples → app/samples}/transcript/SubtaskEventView.tsx +2 -2
  127. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ToolEventView.tsx +2 -2
  128. inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptView.module.css +8 -7
  129. inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptView.tsx +32 -114
  130. inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptVirtualListComponent.module.css +6 -5
  131. inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptVirtualListComponent.tsx +14 -2
  132. inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventPanel.tsx +2 -2
  133. inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventTimingPanel.tsx +1 -1
  134. inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/utils.ts +1 -1
  135. inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventRenderers.tsx +23 -21
  136. inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventRenders.module.css +7 -0
  137. inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventView.tsx +2 -2
  138. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +142 -0
  139. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +39 -0
  140. inspect_ai/_view/www/src/{samples → app/samples}/transcript/types.ts +1 -1
  141. inspect_ai/_view/www/src/{workspace → app}/sidebar/EvalStatus.tsx +1 -1
  142. inspect_ai/_view/www/src/app/sidebar/LogDirectoryTitleView.module.css +16 -0
  143. inspect_ai/_view/www/src/app/sidebar/LogDirectoryTitleView.tsx +70 -0
  144. inspect_ai/_view/www/src/{workspace → app}/sidebar/Sidebar.module.css +8 -0
  145. inspect_ai/_view/www/src/{workspace → app}/sidebar/Sidebar.tsx +35 -17
  146. inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarLogEntry.tsx +1 -1
  147. inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoreView.tsx +2 -2
  148. inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoresView.tsx +2 -2
  149. inspect_ai/_view/www/src/{types.ts → app/types.ts} +18 -11
  150. inspect_ai/_view/www/src/{usage → app/usage}/ModelTokenTable.tsx +1 -1
  151. inspect_ai/_view/www/src/{usage → app/usage}/ModelUsagePanel.tsx +2 -2
  152. inspect_ai/_view/www/src/{usage → app/usage}/TokenTable.tsx +1 -1
  153. inspect_ai/_view/www/src/{usage → app/usage}/UsageCard.tsx +6 -6
  154. inspect_ai/_view/www/src/{api → client/api}/api-browser.ts +2 -2
  155. inspect_ai/_view/www/src/{api → client/api}/api-http.ts +3 -3
  156. inspect_ai/_view/www/src/{api → client/api}/api-vscode.ts +2 -2
  157. inspect_ai/_view/www/src/{api → client/api}/client-api.ts +6 -5
  158. inspect_ai/_view/www/src/{api → client/api}/index.ts +2 -2
  159. inspect_ai/_view/www/src/{api → client/api}/types.ts +4 -1
  160. inspect_ai/_view/www/src/{logfile → client/remote}/remoteLogFile.ts +3 -3
  161. inspect_ai/_view/www/src/{storage → client/storage}/index.ts +11 -5
  162. inspect_ai/_view/www/src/components/Card.tsx +1 -1
  163. inspect_ai/_view/www/src/components/CopyButton.tsx +1 -1
  164. inspect_ai/_view/www/src/components/DownloadButton.tsx +1 -1
  165. inspect_ai/_view/www/src/components/ErrorPanel.tsx +1 -1
  166. inspect_ai/_view/www/src/components/{ExpandablePanel.css → ExpandablePanel.module.css} +14 -11
  167. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +16 -10
  168. inspect_ai/_view/www/src/components/FindBand.tsx +1 -1
  169. inspect_ai/_view/www/src/components/JsonPanel.css +2 -2
  170. inspect_ai/_view/www/src/components/LargeModal.tsx +12 -1
  171. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +1 -1
  172. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +3 -1
  173. inspect_ai/_view/www/src/components/MessageBand.tsx +1 -1
  174. inspect_ai/_view/www/src/components/NoContentsPanel.tsx +1 -1
  175. inspect_ai/_view/www/src/constants.ts +10 -9
  176. inspect_ai/_view/www/src/index.tsx +27 -11
  177. inspect_ai/_view/www/src/state/appSlice.ts +44 -5
  178. inspect_ai/_view/www/src/state/hooks.ts +30 -7
  179. inspect_ai/_view/www/src/state/logSlice.ts +7 -5
  180. inspect_ai/_view/www/src/state/logsPolling.ts +1 -1
  181. inspect_ai/_view/www/src/state/logsSlice.ts +18 -13
  182. inspect_ai/_view/www/src/state/samplePolling.ts +12 -12
  183. inspect_ai/_view/www/src/state/sampleSlice.ts +3 -5
  184. inspect_ai/_view/www/src/state/sampleUtils.ts +1 -1
  185. inspect_ai/_view/www/src/{scoring/utils.ts → state/scoring.ts} +2 -2
  186. inspect_ai/_view/www/src/state/store.ts +9 -7
  187. inspect_ai/_view/www/src/state/utils.ts +1 -1
  188. inspect_ai/_view/www/src/tests/README.md +49 -0
  189. inspect_ai/_view/www/src/tests/__mocks__/fileMock.js +1 -0
  190. inspect_ai/_view/www/src/tests/__mocks__/styleMock.js +1 -0
  191. inspect_ai/_view/www/src/tests/setupTests.mjs +1 -0
  192. inspect_ai/_view/www/src/tests/utils/base64.test.ts +23 -0
  193. inspect_ai/_view/www/src/tests/utils/format.test.ts +127 -0
  194. inspect_ai/_view/www/src/tests/utils/path.test.ts +54 -0
  195. inspect_ai/_view/www/src/utils/format.ts +8 -2
  196. inspect_ai/_view/www/src/utils/path.ts +14 -2
  197. inspect_ai/_view/www/src/utils/polling.ts +1 -2
  198. inspect_ai/_view/www/src/utils/uri.ts +32 -0
  199. inspect_ai/_view/www/yarn.lock +3310 -382
  200. inspect_ai/agent/_handoff.py +6 -3
  201. inspect_ai/agent/_human/agent.py +5 -3
  202. inspect_ai/agent/_human/install.py +16 -7
  203. inspect_ai/agent/_human/panel.py +14 -1
  204. inspect_ai/agent/_human/service.py +5 -1
  205. inspect_ai/agent/_react.py +161 -128
  206. inspect_ai/agent/_types.py +15 -4
  207. inspect_ai/approval/_policy.py +2 -2
  208. inspect_ai/log/_file.py +30 -11
  209. inspect_ai/log/_log.py +7 -1
  210. inspect_ai/log/_recorders/eval.py +3 -0
  211. inspect_ai/log/_recorders/types.py +1 -0
  212. inspect_ai/log/_samples.py +4 -0
  213. inspect_ai/model/_call_tools.py +33 -17
  214. inspect_ai/model/_generate_config.py +10 -2
  215. inspect_ai/model/_model.py +41 -21
  216. inspect_ai/model/_model_output.py +2 -1
  217. inspect_ai/model/_openai.py +10 -8
  218. inspect_ai/model/_openai_responses.py +95 -42
  219. inspect_ai/model/_providers/anthropic.py +14 -12
  220. inspect_ai/model/_providers/google.py +191 -95
  221. inspect_ai/model/_providers/hf.py +1 -1
  222. inspect_ai/model/_providers/mistral.py +2 -3
  223. inspect_ai/model/_providers/openai.py +54 -17
  224. inspect_ai/model/_providers/openai_o1.py +1 -1
  225. inspect_ai/model/_providers/openai_responses.py +28 -16
  226. inspect_ai/model/_providers/openrouter.py +14 -0
  227. inspect_ai/model/_providers/providers.py +2 -2
  228. inspect_ai/model/_providers/util/chatapi.py +17 -7
  229. inspect_ai/model/_providers/vllm.py +1 -1
  230. inspect_ai/scorer/_metric.py +17 -1
  231. inspect_ai/scorer/_model.py +51 -6
  232. inspect_ai/scorer/_scorer.py +1 -1
  233. inspect_ai/solver/_human_agent.py +3 -0
  234. inspect_ai/solver/_plan.py +1 -1
  235. inspect_ai/solver/_solver.py +1 -1
  236. inspect_ai/solver/_use_tools.py +14 -8
  237. inspect_ai/tool/__init__.py +16 -1
  238. inspect_ai/tool/_json_rpc_helpers.py +285 -0
  239. inspect_ai/tool/_mcp/__init__.py +13 -0
  240. inspect_ai/tool/_mcp/_context.py +14 -0
  241. inspect_ai/tool/_mcp/_mcp.py +293 -0
  242. inspect_ai/tool/_mcp/_sandbox.py +104 -0
  243. inspect_ai/tool/_mcp/_types.py +31 -0
  244. inspect_ai/tool/_mcp/connection.py +60 -0
  245. inspect_ai/tool/_mcp/sampling.py +118 -0
  246. inspect_ai/tool/_mcp/server.py +112 -0
  247. inspect_ai/tool/_mcp/tools.py +34 -0
  248. inspect_ai/tool/_tool.py +13 -0
  249. inspect_ai/tool/_tool_def.py +24 -7
  250. inspect_ai/tool/_tool_support_helpers.py +129 -153
  251. inspect_ai/tool/_tools/_bash_session.py +11 -11
  252. inspect_ai/tool/_tools/_text_editor.py +6 -6
  253. inspect_ai/tool/_tools/_web_browser/_web_browser.py +8 -8
  254. inspect_ai/util/_anyio.py +31 -20
  255. inspect_ai/util/_json.py +20 -2
  256. inspect_ai/util/_sandbox/context.py +18 -7
  257. inspect_ai/util/_sandbox/docker/compose.py +1 -1
  258. inspect_ai/util/_sandbox/docker/docker.py +92 -21
  259. inspect_ai/util/_sandbox/environment.py +33 -2
  260. inspect_ai/util/_sandbox/events.py +2 -2
  261. inspect_ai/util/_sandbox/service.py +13 -3
  262. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/METADATA +6 -2
  263. inspect_ai-0.3.92.dist-info/RECORD +732 -0
  264. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/WHEEL +1 -1
  265. inspect_ai/_view/www/src/App.tsx +0 -316
  266. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +0 -4
  267. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +0 -3
  268. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +0 -3
  269. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +0 -14
  270. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +0 -292
  271. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +0 -5
  272. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +0 -57
  273. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +0 -43
  274. inspect_ai-0.3.90.dist-info/RECORD +0 -705
  275. /inspect_ai/_view/www/src/{types → @types}/asciicinema-player.d.ts +0 -0
  276. /inspect_ai/_view/www/src/{types → @types}/jsondiffpatch.d.ts +0 -0
  277. /inspect_ai/_view/www/src/{types → @types}/markdown-it-katex.d.ts +0 -0
  278. /inspect_ai/_view/www/src/{types → @types}/prism.d.ts +0 -0
  279. /inspect_ai/_view/www/src/{appearance → app/appearance}/colors.ts +0 -0
  280. /inspect_ai/_view/www/src/{appearance → app/appearance}/fonts.ts +0 -0
  281. /inspect_ai/_view/www/src/{appearance → app/appearance}/styles.ts +0 -0
  282. /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataGrid.tsx +0 -0
  283. /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataView.module.css +0 -0
  284. /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataView.tsx +0 -0
  285. /inspect_ai/_view/www/src/{metadata → app/content}/MetadataGrid.module.css +0 -0
  286. /inspect_ai/_view/www/src/{metadata → app/content}/RenderedContent.module.css +0 -0
  287. /inspect_ai/_view/www/src/{metadata → app/content}/types.ts +0 -0
  288. /inspect_ai/_view/www/src/{workspace/WorkSpaceView.module.css → app/log-view/LogView.module.css} +0 -0
  289. /inspect_ai/_view/www/src/{workspace → app/log-view}/error/TaskErrorPanel.module.css +0 -0
  290. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ModelRolesView.module.css +0 -0
  291. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/Navbar.module.css +0 -0
  292. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/PrimaryBar.module.css +0 -0
  293. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ResultsPanel.module.css +0 -0
  294. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/RunningStatusPanel.module.css +0 -0
  295. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ScoreGrid.module.css +0 -0
  296. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/SecondaryBar.module.css +0 -0
  297. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/StatusPanel.module.css +0 -0
  298. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/StatusPanel.tsx +0 -0
  299. /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/InfoTab.module.css +0 -0
  300. /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/JsonTab.module.css +0 -0
  301. /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/RunningNoSamples.module.css +0 -0
  302. /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/RunningNoSamples.tsx +0 -0
  303. /inspect_ai/_view/www/src/{workspace → app/log-view}/types.ts +0 -0
  304. /inspect_ai/_view/www/src/{workspace → app/log-view}/utils.ts +0 -0
  305. /inspect_ai/_view/www/src/{plan → app/plan}/DatasetDetailView.module.css +0 -0
  306. /inspect_ai/_view/www/src/{plan → app/plan}/DetailStep.module.css +0 -0
  307. /inspect_ai/_view/www/src/{plan → app/plan}/ModelCard.module.css +0 -0
  308. /inspect_ai/_view/www/src/{plan → app/plan}/PlanDetailView.module.css +0 -0
  309. /inspect_ai/_view/www/src/{plan → app/plan}/ScorerDetailView.module.css +0 -0
  310. /inspect_ai/_view/www/src/{plan → app/plan}/ScorerDetailView.tsx +0 -0
  311. /inspect_ai/_view/www/src/{plan → app/plan}/SolverDetailView.module.css +0 -0
  312. /inspect_ai/_view/www/src/{samples → app/samples}/InlineSampleDisplay.module.css +0 -0
  313. /inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRow.module.css +0 -0
  314. /inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatViewVirtualList.module.css +0 -0
  315. /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/BooleanScoreDescriptor.module.css +0 -0
  316. /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ObjectScoreDescriptor.module.css +0 -0
  317. /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/PassFailScoreDescriptor.module.css +0 -0
  318. /inspect_ai/_view/www/src/{samples → app/samples}/error/FlatSampleErrorView.module.css +0 -0
  319. /inspect_ai/_view/www/src/{samples → app/samples}/error/FlatSampleErrorView.tsx +0 -0
  320. /inspect_ai/_view/www/src/{samples → app/samples}/error/SampleErrorView.tsx +0 -0
  321. /inspect_ai/_view/www/src/{samples → app/samples}/error/error.ts +0 -0
  322. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleFooter.module.css +0 -0
  323. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleFooter.tsx +0 -0
  324. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleHeader.module.css +0 -0
  325. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleList.module.css +0 -0
  326. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleSeparator.module.css +0 -0
  327. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleSeparator.tsx +0 -0
  328. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/EpochFilter.module.css +0 -0
  329. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/EpochFilter.tsx +0 -0
  330. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SelectScorer.module.css +0 -0
  331. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SortFilter.module.css +0 -0
  332. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/SampleFilter.module.css +0 -0
  333. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/tokenize.ts +0 -0
  334. /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScores.module.css +0 -0
  335. /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresGrid.module.css +0 -0
  336. /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresView.module.css +0 -0
  337. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/InfoEventView.module.css +0 -0
  338. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/LoggerEventView.module.css +0 -0
  339. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleInitEventView.module.css +0 -0
  340. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SandboxEventView.module.css +0 -0
  341. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/ScoreEventView.module.css +0 -0
  342. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SubtaskEventView.module.css +0 -0
  343. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/ToolEventView.module.css +0 -0
  344. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNav.module.css +0 -0
  345. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNav.tsx +0 -0
  346. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNavs.module.css +0 -0
  347. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNavs.tsx +0 -0
  348. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventPanel.module.css +0 -0
  349. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventProgressPanel.module.css +0 -0
  350. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventProgressPanel.tsx +0 -0
  351. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventRow.module.css +0 -0
  352. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventRow.tsx +0 -0
  353. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventSection.module.css +0 -0
  354. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventSection.tsx +0 -0
  355. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventTimingPanel.module.css +0 -0
  356. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateDiffView.tsx +0 -0
  357. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventView.module.css +0 -0
  358. /inspect_ai/_view/www/src/{workspace → app}/sidebar/EvalStatus.module.css +0 -0
  359. /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarLogEntry.module.css +0 -0
  360. /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoreView.module.css +0 -0
  361. /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoresView.module.css +0 -0
  362. /inspect_ai/_view/www/src/{usage → app/usage}/ModelUsagePanel.module.css +0 -0
  363. /inspect_ai/_view/www/src/{usage → app/usage}/TokenTable.module.css +0 -0
  364. /inspect_ai/_view/www/src/{usage → app/usage}/UsageCard.module.css +0 -0
  365. /inspect_ai/_view/www/src/{api → client/api}/api-shared.ts +0 -0
  366. /inspect_ai/_view/www/src/{api → client/api}/jsonrpc.ts +0 -0
  367. /inspect_ai/_view/www/src/{logfile → client/remote}/remoteZipFile.ts +0 -0
  368. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/entry_points.txt +0 -0
  369. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/licenses/LICENSE +0 -0
  370. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/top_level.txt +0 -0
inspect_ai/_cli/common.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import functools
2
+ import os
2
3
  from typing import Any, Callable, Literal, cast
3
4
 
4
5
  import click
@@ -21,6 +22,7 @@ class CommonOptions(TypedDict):
21
22
  log_dir: str
22
23
  display: Literal["full", "conversation", "rich", "plain", "none"]
23
24
  no_ansi: bool | None
25
+ traceback_locals: bool
24
26
  env: tuple[str] | None
25
27
  debug: bool
26
28
  debug_port: int
@@ -72,6 +74,13 @@ def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
72
74
  help="Do not print ANSI control characters.",
73
75
  envvar="INSPECT_NO_ANSI",
74
76
  )
77
+ @click.option(
78
+ "--traceback-locals",
79
+ type=bool,
80
+ is_flag=True,
81
+ envvar="INSPECT_TRACEBACK_LOCALS",
82
+ help="Include values of local variables in tracebacks (note that this can leak private data e.g. API keys so should typically only be enabled for targeted debugging).",
83
+ )
75
84
  @click.option(
76
85
  "--env",
77
86
  multiple=True,
@@ -107,6 +116,10 @@ def process_common_options(options: CommonOptions) -> None:
107
116
  env_args = parse_cli_args(options["env"])
108
117
  init_cli_env(env_args)
109
118
 
119
+ # set traceback locals env var
120
+ if options.get("traceback_locals", False):
121
+ os.environ["INSPECT_TRACEBACK_LOCALS"] = "1"
122
+
110
123
  # propagate display
111
124
  if options["no_ansi"]:
112
125
  display = "rich"
inspect_ai/_cli/eval.py CHANGED
@@ -12,6 +12,7 @@ from inspect_ai._util.constants import (
12
12
  DEFAULT_LOG_LEVEL_TRANSCRIPT,
13
13
  DEFAULT_LOG_SHARED,
14
14
  DEFAULT_MAX_CONNECTIONS,
15
+ DEFAULT_RETRY_ON_ERROR,
15
16
  )
16
17
  from inspect_ai._util.file import filesystem
17
18
  from inspect_ai._util.samples import parse_sample_id, parse_samples_limit
@@ -43,6 +44,7 @@ NO_SANDBOX_CLEANUP_HELP = "Do not cleanup sandbox environments after task comple
43
44
  FAIL_ON_ERROR_HELP = "Threshold of sample errors to tolerage (by default, evals fail when any error occurs). Value between 0 to 1 to set a proportion; value greater than 1 to set a count."
44
45
  NO_LOG_SAMPLES_HELP = "Do not include samples in the log file."
45
46
  NO_FAIL_ON_ERROR_HELP = "Do not fail the eval if errors occur within samples (instead, continue running other samples)"
47
+ RETRY_ON_ERROR_HELP = "Retry samples if they encounter errors (by default, no retries occur). Specify --retry-on-error to retry a single time, or specify e.g. `--retry-on-error=3` to retry multiple times."
46
48
  LOG_IMAGES_HELP = (
47
49
  "Include base64 encoded versions of filename or URL based images in the log file."
48
50
  )
@@ -263,6 +265,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
263
265
  help=NO_FAIL_ON_ERROR_HELP,
264
266
  envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
265
267
  )
268
+ @click.option(
269
+ "--retry-on-error",
270
+ is_flag=False,
271
+ flag_value="true",
272
+ default=None,
273
+ callback=int_or_bool_flag_callback(DEFAULT_RETRY_ON_ERROR),
274
+ help=RETRY_ON_ERROR_HELP,
275
+ envvar="INSPECT_EVAL_RETRY_ON_ERROR",
276
+ )
266
277
  @click.option(
267
278
  "--no-log-samples",
268
279
  type=bool,
@@ -428,6 +439,12 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
428
439
  help="Maximum number of tokens to use for reasoning. Anthropic Claude models only.",
429
440
  envvar="INSPECT_EVAL_REASONING_TOKENS",
430
441
  )
442
+ @click.option(
443
+ "--reasoning-summary",
444
+ type=click.Choice(["concise", "detailed", "auto"]),
445
+ help="Provide summary of reasoning steps (defaults to no summary). Use 'auto' to access the most detailed summarizer available for the current model. OpenAI reasoning models only.",
446
+ envvar="INSPECT_EVAL_REASONING_SUMMARY",
447
+ )
431
448
  @click.option(
432
449
  "--reasoning-history",
433
450
  type=click.Choice(["none", "all", "last", "auto"]),
@@ -512,6 +529,7 @@ def eval_command(
512
529
  cache_prompt: str | None,
513
530
  reasoning_effort: str | None,
514
531
  reasoning_tokens: int | None,
532
+ reasoning_summary: Literal["concise", "detailed", "auto"] | None,
515
533
  reasoning_history: Literal["none", "all", "last", "auto"] | None,
516
534
  response_schema: ResponseSchema | None,
517
535
  message_limit: int | None,
@@ -524,6 +542,7 @@ def eval_command(
524
542
  max_sandboxes: int | None,
525
543
  fail_on_error: bool | float | None,
526
544
  no_fail_on_error: bool | None,
545
+ retry_on_error: int | None,
527
546
  no_log_samples: bool | None,
528
547
  log_images: bool | None,
529
548
  log_buffer: int | None,
@@ -578,6 +597,7 @@ def eval_command(
578
597
  max_sandboxes=max_sandboxes,
579
598
  fail_on_error=fail_on_error,
580
599
  no_fail_on_error=no_fail_on_error,
600
+ retry_on_error=retry_on_error,
581
601
  debug_errors=common["debug_errors"],
582
602
  no_log_samples=no_log_samples,
583
603
  log_images=log_images,
@@ -683,6 +703,7 @@ def eval_set_command(
683
703
  cache_prompt: str | None,
684
704
  reasoning_effort: str | None,
685
705
  reasoning_tokens: int | None,
706
+ reasoning_summary: Literal["concise", "detailed", "auto"] | None,
686
707
  reasoning_history: Literal["none", "all", "last", "auto"] | None,
687
708
  response_schema: ResponseSchema | None,
688
709
  message_limit: int | None,
@@ -695,6 +716,7 @@ def eval_set_command(
695
716
  max_sandboxes: int | None,
696
717
  fail_on_error: bool | float | None,
697
718
  no_fail_on_error: bool | None,
719
+ retry_on_error: int | None,
698
720
  no_log_samples: bool | None,
699
721
  log_images: bool | None,
700
722
  log_buffer: int | None,
@@ -754,6 +776,7 @@ def eval_set_command(
754
776
  max_sandboxes=max_sandboxes,
755
777
  fail_on_error=fail_on_error,
756
778
  no_fail_on_error=no_fail_on_error,
779
+ retry_on_error=retry_on_error,
757
780
  debug_errors=common["debug_errors"],
758
781
  no_log_samples=no_log_samples,
759
782
  log_images=log_images,
@@ -811,6 +834,7 @@ def eval_exec(
811
834
  max_sandboxes: int | None,
812
835
  fail_on_error: bool | float | None,
813
836
  no_fail_on_error: bool | None,
837
+ retry_on_error: int | None,
814
838
  debug_errors: bool | None,
815
839
  no_log_samples: bool | None,
816
840
  log_images: bool | None,
@@ -858,6 +882,10 @@ def eval_exec(
858
882
  elif fail_on_error == 0.0:
859
883
  fail_on_error = True
860
884
 
885
+ # resolve retry_on_error
886
+ if retry_on_error == 0:
887
+ retry_on_error = None
888
+
861
889
  # resolve negating options
862
890
  sandbox_cleanup = False if no_sandbox_cleanup else None
863
891
  log_samples = False if no_log_samples else None
@@ -890,6 +918,7 @@ def eval_exec(
890
918
  sample_id=eval_sample_id,
891
919
  epochs=eval_epochs,
892
920
  fail_on_error=fail_on_error,
921
+ retry_on_error=retry_on_error,
893
922
  debug_errors=debug_errors,
894
923
  message_limit=message_limit,
895
924
  token_limit=token_limit,
@@ -1024,6 +1053,15 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
1024
1053
  help=NO_FAIL_ON_ERROR_HELP,
1025
1054
  envvar="INSPECT_EVAL_NO_FAIL_ON_ERROR",
1026
1055
  )
1056
+ @click.option(
1057
+ "--retry-on-error",
1058
+ is_flag=False,
1059
+ flag_value="true",
1060
+ default=None,
1061
+ callback=int_or_bool_flag_callback(DEFAULT_RETRY_ON_ERROR),
1062
+ help=RETRY_ON_ERROR_HELP,
1063
+ envvar="INSPECT_EVAL_RETRY_ON_ERROR",
1064
+ )
1027
1065
  @click.option(
1028
1066
  "--no-log-samples",
1029
1067
  type=bool,
@@ -1096,6 +1134,7 @@ def eval_retry_command(
1096
1134
  trace: bool | None,
1097
1135
  fail_on_error: bool | float | None,
1098
1136
  no_fail_on_error: bool | None,
1137
+ retry_on_error: int | None,
1099
1138
  no_log_samples: bool | None,
1100
1139
  log_images: bool | None,
1101
1140
  log_buffer: int | None,
@@ -1125,6 +1164,10 @@ def eval_retry_command(
1125
1164
  elif fail_on_error == 0.0:
1126
1165
  fail_on_error = True
1127
1166
 
1167
+ # resolve retry on error
1168
+ if retry_on_error == 0:
1169
+ retry_on_error = None
1170
+
1128
1171
  # resolve log file
1129
1172
  retry_log_files = [
1130
1173
  log_file_info(filesystem(log_file).info(log_file)) for log_file in log_files
@@ -1143,6 +1186,7 @@ def eval_retry_command(
1143
1186
  sandbox_cleanup=sandbox_cleanup,
1144
1187
  trace=trace,
1145
1188
  fail_on_error=fail_on_error,
1189
+ retry_on_error=retry_on_error,
1146
1190
  debug_errors=common["debug_errors"],
1147
1191
  log_samples=log_samples,
1148
1192
  log_images=log_images,
@@ -1,11 +1,18 @@
1
1
  import time
2
2
  from typing import cast
3
+ from urllib.parse import urlencode, urlparse, urlunparse
3
4
 
4
5
  from rich.console import RenderableType
5
6
  from rich.table import Table
6
7
  from rich.text import Text
7
8
  from textual.app import ComposeResult
8
- from textual.containers import Horizontal, HorizontalGroup, Vertical, VerticalGroup
9
+ from textual.containers import (
10
+ Horizontal,
11
+ HorizontalGroup,
12
+ Right,
13
+ Vertical,
14
+ VerticalGroup,
15
+ )
9
16
  from textual.css.query import NoMatches
10
17
  from textual.reactive import reactive
11
18
  from textual.widget import Widget
@@ -20,9 +27,12 @@ from textual.widgets import (
20
27
  from textual.widgets.option_list import Option, OptionDoesNotExist
21
28
 
22
29
  from inspect_ai._display.textual.widgets.port_mappings import get_url
30
+ from inspect_ai._display.textual.widgets.vscode import conditional_vscode_link
31
+ from inspect_ai._util.file import to_uri
23
32
  from inspect_ai._util.format import format_progress_time
24
33
  from inspect_ai._util.port_names import get_service_by_port
25
34
  from inspect_ai._util.registry import registry_unqualified_name
35
+ from inspect_ai._util.vscode import EXTENSION_COMMAND_OPEN_SAMPLE, VSCodeCommand
26
36
  from inspect_ai.log._samples import ActiveSample
27
37
  from inspect_ai.log._transcript import ToolEvent
28
38
 
@@ -272,6 +282,16 @@ class SampleInfo(Vertical):
272
282
  background: $surface;
273
283
  color: $secondary;
274
284
  }
285
+ SampleInfo #sample-link {
286
+ height: auto;
287
+ width: 11;
288
+ margin-left: 1;
289
+ background: $background;
290
+ }
291
+ SampleInfo #sample-link Link {
292
+ color: $accent;
293
+ background: $background;
294
+ }
275
295
  """
276
296
 
277
297
  def __init__(self) -> None:
@@ -280,9 +300,12 @@ class SampleInfo(Vertical):
280
300
  self._sandbox_count: int | None = None
281
301
 
282
302
  def compose(self) -> ComposeResult:
283
- with Collapsible(title=""):
284
- yield SampleLimits()
285
- yield SandboxesView()
303
+ with Horizontal():
304
+ with Collapsible(title=""):
305
+ yield SampleLimits()
306
+ yield SandboxesView()
307
+ yield Right(id="sample-link")
308
+
286
309
  yield SampleVNC()
287
310
 
288
311
  async def sync_sample(self, sample: ActiveSample | None) -> None:
@@ -311,6 +334,28 @@ class SampleInfo(Vertical):
311
334
  await sandboxes.sync_sample(sample)
312
335
  await self.query_one(SampleVNC).sync_sample(sample)
313
336
 
337
+ # View Log Link
338
+ base_uri = sample.log_location
339
+ query_params = {
340
+ "sample_id": sample.sample.id,
341
+ "epoch": sample.epoch,
342
+ }
343
+
344
+ parsed = urlparse(to_uri(base_uri))
345
+ view_link = urlunparse(parsed._replace(query=urlencode(query_params)))
346
+
347
+ link_container = self.query_one("#sample-link")
348
+ link_container.remove_children()
349
+ link = conditional_vscode_link(
350
+ "[View Log]",
351
+ VSCodeCommand(
352
+ command="inspect.openLogViewer",
353
+ args=[view_link] if sample.log_location else [],
354
+ ),
355
+ EXTENSION_COMMAND_OPEN_SAMPLE,
356
+ )
357
+ link_container.mount(link)
358
+
314
359
 
315
360
  class SampleLimits(Widget):
316
361
  DEFAULT_CSS = """
@@ -8,8 +8,10 @@ from inspect_ai._util.vscode import (
8
8
  )
9
9
 
10
10
 
11
- def conditional_vscode_link(text: str, command: VSCodeCommand) -> Widget:
12
- if can_execute_vscode_command(command.command):
11
+ def conditional_vscode_link(
12
+ text: str, command: VSCodeCommand, context: str | None = None
13
+ ) -> Widget:
14
+ if can_execute_vscode_command(command.command, context=context):
13
15
  vscode_link = VSCodeLink(text)
14
16
  vscode_link.commands = [command]
15
17
  return vscode_link
inspect_ai/_eval/eval.py CHANGED
@@ -90,6 +90,7 @@ def eval(
90
90
  sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
91
91
  epochs: int | Epochs | None = None,
92
92
  fail_on_error: bool | float | None = None,
93
+ retry_on_error: int | None = None,
93
94
  debug_errors: bool | None = None,
94
95
  message_limit: int | None = None,
95
96
  token_limit: int | None = None,
@@ -151,6 +152,8 @@ def eval(
151
152
  (default); `False` to never fail on sample errors; Value between 0 and 1
152
153
  to fail if a proportion of total samples fails. Value greater than 1 to fail
153
154
  eval if a count of samples fails.
155
+ retry_on_error: Number of times to retry samples if they encounter errors
156
+ (by default, no retries occur).
154
157
  debug_errors: Raise task errors (rather than logging them)
155
158
  so they can be debugged (defaults to False).
156
159
  message_limit: Limit on total messages used for each sample.
@@ -214,6 +217,7 @@ def eval(
214
217
  sample_id=sample_id,
215
218
  epochs=epochs,
216
219
  fail_on_error=fail_on_error,
220
+ retry_on_error=retry_on_error,
217
221
  debug_errors=debug_errors,
218
222
  message_limit=message_limit,
219
223
  token_limit=token_limit,
@@ -266,6 +270,7 @@ async def eval_async(
266
270
  sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
267
271
  epochs: int | Epochs | None = None,
268
272
  fail_on_error: bool | float | None = None,
273
+ retry_on_error: int | None = None,
269
274
  debug_errors: bool | None = None,
270
275
  message_limit: int | None = None,
271
276
  token_limit: int | None = None,
@@ -315,6 +320,8 @@ async def eval_async(
315
320
  fail_on_error: `True` to fail on first sample error
316
321
  (default); `False` to never fail on sample errors; Value between 0 and 1
317
322
  to fail if a proportion of total samples fails. Value greater than 1 to fail eval if a count of samples fails.
323
+ retry_on_error: Number of times to retry samples if they encounter errors
324
+ (by default, no retries occur).
318
325
  debug_errors: Raise task errors (rather than logging them) so they can be debugged (defaults to False).
319
326
  message_limit: Limit on total messages used for each sample.
320
327
  token_limit: Limit on total tokens used for each sample.
@@ -455,6 +462,7 @@ async def eval_async(
455
462
  else None,
456
463
  approval=config_from_approval_policies(approval) if approval else None,
457
464
  fail_on_error=fail_on_error,
465
+ retry_on_error=retry_on_error,
458
466
  message_limit=message_limit,
459
467
  token_limit=token_limit,
460
468
  time_limit=time_limit,
@@ -551,6 +559,7 @@ def eval_retry(
551
559
  trace: bool | None = None,
552
560
  display: DisplayType | None = None,
553
561
  fail_on_error: bool | float | None = None,
562
+ retry_on_error: int | None = None,
554
563
  debug_errors: bool | None = None,
555
564
  log_samples: bool | None = None,
556
565
  log_images: bool | None = None,
@@ -589,6 +598,8 @@ def eval_retry(
589
598
  (default); `False` to never fail on sample errors; Value between 0 and 1
590
599
  to fail if a proportion of total samples fails. Value greater than 1 to fail
591
600
  eval if a count of samples fails.
601
+ retry_on_error: Number of times to retry samples if they encounter errors
602
+ (by default, no retries occur).
592
603
  debug_errors: Raise task errors (rather than logging them)
593
604
  so they can be debugged (defaults to False).
594
605
  log_samples: Log detailed samples and scores (defaults to True)
@@ -631,6 +642,7 @@ def eval_retry(
631
642
  max_sandboxes=max_sandboxes,
632
643
  sandbox_cleanup=sandbox_cleanup,
633
644
  fail_on_error=fail_on_error,
645
+ retry_on_error=retry_on_error,
634
646
  debug_errors=debug_errors,
635
647
  log_samples=log_samples,
636
648
  log_images=log_images,
@@ -658,6 +670,7 @@ async def eval_retry_async(
658
670
  max_sandboxes: int | None = None,
659
671
  sandbox_cleanup: bool | None = None,
660
672
  fail_on_error: bool | float | None = None,
673
+ retry_on_error: int | None = None,
661
674
  debug_errors: bool | None = None,
662
675
  log_samples: bool | None = None,
663
676
  log_images: bool | None = None,
@@ -672,46 +685,40 @@ async def eval_retry_async(
672
685
  """Retry a previously failed evaluation task.
673
686
 
674
687
  Args:
675
- tasks: (str | EvalLogInfo | EvalLog | list[str] | list[EvalLogInfo] | list[EvalLog]):
676
- Log files for task(s) to retry.
677
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
688
+ tasks: Log files for task(s) to retry.
689
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
678
690
  "info", "warning", "error", or "critical" (defaults to "warning")
679
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
680
- log_dir (str | None): Output path for logging results
681
- (defaults to file log in ./logs directory).
682
- log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
683
- to "eval", the native high-performance format).
684
- max_samples (int | None): Maximum number of samples to run in parallel
691
+ log_level_transcript: Level for logging to the log file (defaults to "info")
692
+ log_dir: Output path for logging results (defaults to file log in ./logs directory).
693
+ log_format: Format for writing log files (defaults to "eval", the native high-performance format).
694
+ max_samples: Maximum number of samples to run in parallel
685
695
  (default is max_connections)
686
- max_tasks (int | None): Maximum number of tasks to run in parallel
687
- (default is 1)
688
- max_subprocesses (int): Maximum number of subprocesses to
689
- run in parallel (default is os.cpu_count())
690
- max_sandboxes (int): Maximum number of sandboxes (per-provider) to run in parallel.
691
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
696
+ max_tasks: Maximum number of tasks to run in parallel (default is 1)
697
+ max_subprocesses: Maximum number of subprocesses to run in parallel (default is os.cpu_count())
698
+ max_sandboxes: Maximum number of sandboxes (per-provider) to run in parallel.
699
+ sandbox_cleanup: Cleanup sandbox environments after task completes
692
700
  (defaults to True)
693
- fail_on_error (bool | float | None): `True` to fail on first sample error
701
+ fail_on_error: `True` to fail on first sample error
694
702
  (default); `False` to never fail on sample errors; Value between 0 and 1
695
703
  to fail if a proportion of total samples fails. Value greater than 1 to fail
696
704
  eval if a count of samples fails.
697
- debug_errors (bool | None): Raise task errors (rather than logging them)
705
+ retry_on_error: Number of times to retry samples if they encounter errors
706
+ (by default, no retries occur).
707
+ debug_errors: Raise task errors (rather than logging them)
698
708
  so they can be debugged (defaults to False).
699
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
700
- log_images: (bool | None): Log base64 encoded version of images,
709
+ log_samples: Log detailed samples and scores (defaults to True)
710
+ log_images: Log base64 encoded version of images,
701
711
  even if specified as a filename or URL (defaults to False)
702
- log_buffer: (int | None): Number of samples to buffer before writing log file.
712
+ log_buffer: Number of samples to buffer before writing log file.
703
713
  If not specified, an appropriate default for the format and filesystem is
704
714
  chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
705
715
  log_shared: Indicate that the log directory is shared, which results in
706
716
  additional syncing of realtime log data for Inspect View.
707
- score (bool): Score output (defaults to True)
708
- score_display (bool | None): Show scoring metrics in realtime (defaults to True)
709
- max_retries (int | None):
710
- Maximum number of times to retry request.
711
- timeout: (int | None):
712
- Request timeout (in seconds)
713
- max_connections (int | None):
714
- Maximum number of concurrent connections to Model API (default is per Model API)
717
+ score: Score output (defaults to True)
718
+ score_display: Show scoring metrics in realtime (defaults to True)
719
+ max_retries: Maximum number of times to retry request.
720
+ timeout: Request timeout (in seconds)
721
+ max_connections: Maximum number of concurrent connections to Model API (default is per Model API)
715
722
 
716
723
  Returns:
717
724
  List of EvalLog (one for each task)
@@ -802,6 +809,11 @@ async def eval_retry_async(
802
809
  if fail_on_error is not None
803
810
  else eval_log.eval.config.fail_on_error
804
811
  )
812
+ retry_on_error = (
813
+ retry_on_error
814
+ if retry_on_error is not None
815
+ else eval_log.eval.config.retry_on_error
816
+ )
805
817
  log_samples = (
806
818
  log_samples if log_samples is not None else eval_log.eval.config.log_samples
807
819
  )
@@ -852,6 +864,7 @@ async def eval_retry_async(
852
864
  sample_id=sample_id,
853
865
  epochs=epochs,
854
866
  fail_on_error=fail_on_error,
867
+ retry_on_error=retry_on_error,
855
868
  debug_errors=debug_errors,
856
869
  message_limit=message_limit,
857
870
  token_limit=token_limit,
@@ -82,6 +82,7 @@ def eval_set(
82
82
  sample_id: str | int | list[str] | list[int] | list[str | int] | None = None,
83
83
  epochs: int | Epochs | None = None,
84
84
  fail_on_error: bool | float | None = None,
85
+ retry_on_error: int | None = None,
85
86
  debug_errors: bool | None = None,
86
87
  message_limit: int | None = None,
87
88
  token_limit: int | None = None,
@@ -153,6 +154,8 @@ def eval_set(
153
154
  (default); `False` to never fail on sample errors; Value between 0 and 1
154
155
  to fail if a proportion of total samples fails. Value greater than 1 to fail
155
156
  eval if a count of samples fails.
157
+ retry_on_error: Number of times to retry samples if they encounter errors
158
+ (by default, no retries occur).
156
159
  debug_errors: Raise task errors (rather than logging them)
157
160
  so they can be debugged (defaults to False).
158
161
  message_limit: Limit on total messages used for each sample.
@@ -215,6 +218,7 @@ def eval_set(
215
218
  sample_id=sample_id,
216
219
  epochs=epochs,
217
220
  fail_on_error=fail_on_error,
221
+ retry_on_error=retry_on_error,
218
222
  debug_errors=debug_errors,
219
223
  message_limit=message_limit,
220
224
  token_limit=token_limit,
@@ -25,7 +25,6 @@ from inspect_ai._util.registry import (
25
25
  registry_lookup,
26
26
  registry_params,
27
27
  )
28
- from inspect_ai.agent._agent import Agent
29
28
  from inspect_ai.agent._as_solver import as_solver
30
29
  from inspect_ai.model import Model
31
30
  from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
@@ -423,9 +422,9 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
423
422
  if solver_name is None:
424
423
  raise ValueError(f"Unable to resolve solver name from {spec.solver}")
425
424
  elif registry_lookup("solver", solver_name) is not None:
426
- return cast(Solver, registry_create("solver", solver_name, **spec.args))
425
+ return registry_create("solver", solver_name, **spec.args)
427
426
  elif registry_lookup("agent", solver_name) is not None:
428
- agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
427
+ agent = registry_create("agent", solver_name, **spec.args)
429
428
  return as_solver(agent)
430
429
  else:
431
430
  raise ValueError(
@@ -484,11 +483,11 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
484
483
 
485
484
  # create decorator based solvers using the registry
486
485
  if any(solver[0] == solver_name for solver in solver_decorators):
487
- return cast(Solver, registry_create("solver", solver_name, **spec.args))
486
+ return registry_create("solver", solver_name, **spec.args)
488
487
 
489
488
  # create decorator based agents using the registry
490
489
  elif any(agent[0] == solver_name for agent in agent_decorators):
491
- agent = cast(Agent, registry_create("agent", solver_name, **spec.args))
490
+ agent = registry_create("agent", solver_name, **spec.args)
492
491
  return as_solver(agent)
493
492
 
494
493
  # create bridge based solvers by calling the function and wrapping it in bridge()
@@ -80,7 +80,7 @@ def task_create(name: str, **kwargs: Any) -> Task:
80
80
  else:
81
81
  logger.warning(f"param '{param}' not used by task '{name}'")
82
82
 
83
- return cast(Task, registry_create("task", name, **task_args))
83
+ return registry_create("task", name, **task_args)
84
84
 
85
85
 
86
86
  @overload
inspect_ai/_eval/run.py CHANGED
@@ -4,6 +4,7 @@ import sys
4
4
  from typing import Any, Awaitable, Callable, Set, cast
5
5
 
6
6
  from inspect_ai._eval.task.task import Task
7
+ from inspect_ai._util.environ import environ_vars
7
8
  from inspect_ai._util.trace import trace_action
8
9
 
9
10
  if sys.version_info < (3, 11):
@@ -49,7 +50,7 @@ from .loader import (
49
50
  from .task.log import TaskLogger
50
51
  from .task.resolved import ResolvedTask
51
52
  from .task.run import TaskRunOptions, task_run
52
- from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task
53
+ from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task_and_sample
53
54
  from .task.util import slice_dataset, task_run_dir
54
55
 
55
56
  log = logging.getLogger(__name__)
@@ -435,7 +436,9 @@ async def startup_sandbox_environments(
435
436
  # resolve each sample and add to sandboxenvs
436
437
  dataset = slice_dataset(task.task.dataset, config.limit, config.sample_id)
437
438
  for sample in dataset:
438
- sandbox = resolve_sandbox_for_task(eval_sandbox, task.task, sample)
439
+ sandbox = await resolve_sandbox_for_task_and_sample(
440
+ eval_sandbox, task.task, sample
441
+ )
439
442
  if sandbox is not None and sandbox not in sandboxenvs:
440
443
  sandboxenvs.add(sandbox)
441
444
 
@@ -448,7 +451,7 @@ async def startup_sandbox_environments(
448
451
 
449
452
  # run startup
450
453
  task_init = cast(TaskInit, getattr(sandboxenv_type, "task_init"))
451
- with chdir(sandboxenv.run_dir):
454
+ with chdir(sandboxenv.run_dir), environ_vars(dict(sandboxenv.env)):
452
455
  await task_init("startup", sandboxenv.sandbox.config)
453
456
 
454
457
  # append cleanup method
@@ -187,6 +187,9 @@ class TaskLogger:
187
187
  # log the sample event
188
188
  self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
189
189
 
190
+ def remove_sample(self, id: str | int, epoch: int) -> None:
191
+ self._buffer_db.remove_samples([(id, epoch)])
192
+
190
193
  async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
191
194
  # log the sample
192
195
  await self.recorder.log_sample(self.eval, sample)
@@ -202,6 +205,9 @@ class TaskLogger:
202
205
  scores=sample.scores,
203
206
  error=sample.error.message if sample.error is not None else None,
204
207
  limit=f"{sample.limit.type}" if sample.limit is not None else None,
208
+ retries=len(sample.error_retries)
209
+ if sample.error_retries is not None
210
+ else None,
205
211
  )
206
212
  )
207
213