inspect-ai 0.3.90__py3-none-any.whl → 0.3.92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (370)
  1. inspect_ai/_cli/common.py +13 -0
  2. inspect_ai/_cli/eval.py +44 -0
  3. inspect_ai/_display/textual/widgets/samples.py +49 -4
  4. inspect_ai/_display/textual/widgets/vscode.py +4 -2
  5. inspect_ai/_eval/eval.py +41 -28
  6. inspect_ai/_eval/evalset.py +4 -0
  7. inspect_ai/_eval/loader.py +4 -5
  8. inspect_ai/_eval/registry.py +1 -1
  9. inspect_ai/_eval/run.py +6 -3
  10. inspect_ai/_eval/task/log.py +6 -0
  11. inspect_ai/_eval/task/run.py +108 -41
  12. inspect_ai/_eval/task/sandbox.py +19 -5
  13. inspect_ai/_util/_async.py +1 -1
  14. inspect_ai/_util/constants.py +1 -0
  15. inspect_ai/_util/environ.py +32 -0
  16. inspect_ai/_util/file.py +8 -1
  17. inspect_ai/_util/httpx.py +105 -22
  18. inspect_ai/_util/registry.py +83 -9
  19. inspect_ai/_util/text.py +81 -17
  20. inspect_ai/_util/transcript.py +9 -6
  21. inspect_ai/_util/vscode.py +7 -2
  22. inspect_ai/_view/schema.py +1 -1
  23. inspect_ai/_view/www/babel.config.js +11 -0
  24. inspect_ai/_view/www/dist/assets/index.css +3640 -3563
  25. inspect_ai/_view/www/dist/assets/index.js +59204 -52519
  26. inspect_ai/_view/www/eslint.config.mjs +10 -1
  27. inspect_ai/_view/www/jest.config.mjs +21 -0
  28. inspect_ai/_view/www/log-schema.json +111 -2
  29. inspect_ai/_view/www/package.json +19 -5
  30. inspect_ai/_view/www/src/{types → @types}/log.d.ts +95 -32
  31. inspect_ai/_view/www/{App.css → src/app/App.css} +22 -14
  32. inspect_ai/_view/www/src/app/App.tsx +168 -0
  33. inspect_ai/_view/www/src/{AppErrorBoundary.tsx → app/AppErrorBoundary.tsx} +1 -1
  34. inspect_ai/_view/www/src/{appearance → app/appearance}/icons.ts +1 -0
  35. inspect_ai/_view/www/src/{metadata → app/content}/RenderedContent.tsx +5 -5
  36. inspect_ai/_view/www/src/{workspace/WorkSpaceView.tsx → app/log-view/LogView.tsx} +59 -40
  37. inspect_ai/_view/www/src/app/log-view/LogViewContainer.tsx +159 -0
  38. inspect_ai/_view/www/src/app/log-view/LogViewLayout.tsx +109 -0
  39. inspect_ai/_view/www/src/{workspace → app/log-view}/error/TaskErrorPanel.tsx +3 -3
  40. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ModelRolesView.tsx +1 -1
  41. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/Navbar.tsx +4 -4
  42. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/PrimaryBar.tsx +8 -8
  43. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ResultsPanel.tsx +6 -6
  44. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/RunningStatusPanel.tsx +1 -1
  45. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ScoreGrid.tsx +1 -1
  46. inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/SecondaryBar.tsx +8 -8
  47. inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/InfoTab.tsx +35 -6
  48. inspect_ai/_view/www/src/app/log-view/tabs/JsonTab.tsx +136 -0
  49. inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/SamplesTab.tsx +82 -73
  50. inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/grouping.ts +3 -3
  51. inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/types.ts +1 -1
  52. inspect_ai/_view/www/src/{plan → app/plan}/DatasetDetailView.tsx +2 -2
  53. inspect_ai/_view/www/src/{plan → app/plan}/DetailStep.tsx +1 -1
  54. inspect_ai/_view/www/src/{plan → app/plan}/ModelCard.tsx +4 -4
  55. inspect_ai/_view/www/src/{plan → app/plan}/PlanCard.tsx +2 -2
  56. inspect_ai/_view/www/src/{plan → app/plan}/PlanDetailView.tsx +5 -5
  57. inspect_ai/_view/www/src/{plan → app/plan}/SolverDetailView.tsx +1 -1
  58. inspect_ai/_view/www/src/app/routing/AppRouter.tsx +58 -0
  59. inspect_ai/_view/www/src/app/routing/navigationHooks.ts +182 -0
  60. inspect_ai/_view/www/src/app/routing/url.ts +43 -0
  61. inspect_ai/_view/www/src/{samples → app/samples}/InlineSampleDisplay.tsx +11 -27
  62. inspect_ai/_view/www/src/{samples → app/samples}/SampleDialog.tsx +36 -40
  63. inspect_ai/_view/www/src/{samples → app/samples}/SampleDisplay.module.css +4 -0
  64. inspect_ai/_view/www/src/{samples → app/samples}/SampleDisplay.tsx +116 -49
  65. inspect_ai/_view/www/src/{samples → app/samples}/SampleSummaryView.module.css +1 -1
  66. inspect_ai/_view/www/src/{samples → app/samples}/SampleSummaryView.tsx +29 -26
  67. inspect_ai/_view/www/src/{samples → app/samples}/SamplesTools.tsx +3 -3
  68. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessage.module.css +5 -2
  69. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessage.tsx +12 -4
  70. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRenderer.tsx +3 -3
  71. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRow.tsx +6 -1
  72. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatView.tsx +4 -2
  73. inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatViewVirtualList.tsx +5 -3
  74. inspect_ai/_view/www/src/app/samples/chat/MessageContent.module.css +12 -0
  75. inspect_ai/_view/www/src/{samples → app/samples}/chat/MessageContent.tsx +11 -10
  76. inspect_ai/_view/www/src/app/samples/chat/MessageContents.module.css +7 -0
  77. inspect_ai/_view/www/src/{samples → app/samples}/chat/MessageContents.tsx +14 -8
  78. inspect_ai/_view/www/src/{samples → app/samples}/chat/messages.ts +2 -2
  79. inspect_ai/_view/www/src/app/samples/chat/tools/ToolCallView.module.css +7 -0
  80. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolCallView.tsx +26 -27
  81. inspect_ai/_view/www/src/app/samples/chat/tools/ToolInput.module.css +19 -0
  82. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolInput.tsx +3 -3
  83. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolOutput.module.css +1 -0
  84. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolOutput.tsx +1 -1
  85. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolTitle.module.css +4 -0
  86. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/ToolTitle.tsx +2 -2
  87. inspect_ai/_view/www/src/{samples → app/samples}/chat/tools/tool.ts +1 -1
  88. inspect_ai/_view/www/src/app/samples/chat/types.ts +1 -0
  89. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/samplesDescriptor.tsx +38 -15
  90. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
  91. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/CategoricalScoreDescriptor.tsx +2 -2
  92. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/NumericScoreDescriptor.tsx +3 -3
  93. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ObjectScoreDescriptor.tsx +4 -4
  94. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/OtherScoreDescriptor.tsx +2 -2
  95. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/PassFailScoreDescriptor.tsx +2 -2
  96. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ScoreDescriptor.tsx +1 -1
  97. inspect_ai/_view/www/src/{samples → app/samples}/descriptor/types.ts +4 -3
  98. inspect_ai/_view/www/src/{samples → app/samples}/error/SampleErrorView.module.css +2 -1
  99. inspect_ai/_view/www/src/{samples → app/samples}/list/SampleHeader.tsx +3 -0
  100. inspect_ai/_view/www/src/{samples → app/samples}/list/SampleList.tsx +47 -33
  101. inspect_ai/_view/www/src/{samples → app/samples}/list/SampleRow.module.css +16 -0
  102. inspect_ai/_view/www/src/{samples → app/samples}/list/SampleRow.tsx +47 -20
  103. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SelectScorer.tsx +1 -1
  104. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SortFilter.tsx +4 -4
  105. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/filters.ts +8 -6
  106. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/SampleFilter.tsx +4 -3
  107. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/completions.ts +1 -1
  108. inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/language.ts +1 -0
  109. inspect_ai/_view/www/src/{samples → app/samples}/sampleDataAdapter.ts +3 -3
  110. inspect_ai/_view/www/src/{samples → app/samples}/sampleLimit.ts +1 -1
  111. inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScores.tsx +1 -1
  112. inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresGrid.tsx +12 -11
  113. inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresView.tsx +6 -6
  114. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ApprovalEventView.tsx +1 -1
  115. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ErrorEventView.tsx +3 -3
  116. inspect_ai/_view/www/src/{samples → app/samples}/transcript/InfoEventView.tsx +4 -4
  117. inspect_ai/_view/www/src/{samples → app/samples}/transcript/InputEventView.tsx +3 -3
  118. inspect_ai/_view/www/src/{samples → app/samples}/transcript/LoggerEventView.tsx +3 -3
  119. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ModelEventView.module.css +13 -7
  120. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ModelEventView.tsx +49 -21
  121. inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleInitEventView.tsx +11 -9
  122. inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleLimitEventView.tsx +1 -1
  123. inspect_ai/_view/www/src/{samples → app/samples}/transcript/SandboxEventView.tsx +8 -6
  124. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ScoreEventView.tsx +4 -4
  125. inspect_ai/_view/www/src/{samples → app/samples}/transcript/StepEventView.tsx +11 -3
  126. inspect_ai/_view/www/src/{samples → app/samples}/transcript/SubtaskEventView.tsx +2 -2
  127. inspect_ai/_view/www/src/{samples → app/samples}/transcript/ToolEventView.tsx +2 -2
  128. inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptView.module.css +8 -7
  129. inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptView.tsx +32 -114
  130. inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptVirtualListComponent.module.css +6 -5
  131. inspect_ai/_view/www/src/{samples → app/samples}/transcript/TranscriptVirtualListComponent.tsx +14 -2
  132. inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventPanel.tsx +2 -2
  133. inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventTimingPanel.tsx +1 -1
  134. inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/utils.ts +1 -1
  135. inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventRenderers.tsx +23 -21
  136. inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventRenders.module.css +7 -0
  137. inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventView.tsx +2 -2
  138. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +142 -0
  139. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +39 -0
  140. inspect_ai/_view/www/src/{samples → app/samples}/transcript/types.ts +1 -1
  141. inspect_ai/_view/www/src/{workspace → app}/sidebar/EvalStatus.tsx +1 -1
  142. inspect_ai/_view/www/src/app/sidebar/LogDirectoryTitleView.module.css +16 -0
  143. inspect_ai/_view/www/src/app/sidebar/LogDirectoryTitleView.tsx +70 -0
  144. inspect_ai/_view/www/src/{workspace → app}/sidebar/Sidebar.module.css +8 -0
  145. inspect_ai/_view/www/src/{workspace → app}/sidebar/Sidebar.tsx +35 -17
  146. inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarLogEntry.tsx +1 -1
  147. inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoreView.tsx +2 -2
  148. inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoresView.tsx +2 -2
  149. inspect_ai/_view/www/src/{types.ts → app/types.ts} +18 -11
  150. inspect_ai/_view/www/src/{usage → app/usage}/ModelTokenTable.tsx +1 -1
  151. inspect_ai/_view/www/src/{usage → app/usage}/ModelUsagePanel.tsx +2 -2
  152. inspect_ai/_view/www/src/{usage → app/usage}/TokenTable.tsx +1 -1
  153. inspect_ai/_view/www/src/{usage → app/usage}/UsageCard.tsx +6 -6
  154. inspect_ai/_view/www/src/{api → client/api}/api-browser.ts +2 -2
  155. inspect_ai/_view/www/src/{api → client/api}/api-http.ts +3 -3
  156. inspect_ai/_view/www/src/{api → client/api}/api-vscode.ts +2 -2
  157. inspect_ai/_view/www/src/{api → client/api}/client-api.ts +6 -5
  158. inspect_ai/_view/www/src/{api → client/api}/index.ts +2 -2
  159. inspect_ai/_view/www/src/{api → client/api}/types.ts +4 -1
  160. inspect_ai/_view/www/src/{logfile → client/remote}/remoteLogFile.ts +3 -3
  161. inspect_ai/_view/www/src/{storage → client/storage}/index.ts +11 -5
  162. inspect_ai/_view/www/src/components/Card.tsx +1 -1
  163. inspect_ai/_view/www/src/components/CopyButton.tsx +1 -1
  164. inspect_ai/_view/www/src/components/DownloadButton.tsx +1 -1
  165. inspect_ai/_view/www/src/components/ErrorPanel.tsx +1 -1
  166. inspect_ai/_view/www/src/components/{ExpandablePanel.css → ExpandablePanel.module.css} +14 -11
  167. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +16 -10
  168. inspect_ai/_view/www/src/components/FindBand.tsx +1 -1
  169. inspect_ai/_view/www/src/components/JsonPanel.css +2 -2
  170. inspect_ai/_view/www/src/components/LargeModal.tsx +12 -1
  171. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +1 -1
  172. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +3 -1
  173. inspect_ai/_view/www/src/components/MessageBand.tsx +1 -1
  174. inspect_ai/_view/www/src/components/NoContentsPanel.tsx +1 -1
  175. inspect_ai/_view/www/src/constants.ts +10 -9
  176. inspect_ai/_view/www/src/index.tsx +27 -11
  177. inspect_ai/_view/www/src/state/appSlice.ts +44 -5
  178. inspect_ai/_view/www/src/state/hooks.ts +30 -7
  179. inspect_ai/_view/www/src/state/logSlice.ts +7 -5
  180. inspect_ai/_view/www/src/state/logsPolling.ts +1 -1
  181. inspect_ai/_view/www/src/state/logsSlice.ts +18 -13
  182. inspect_ai/_view/www/src/state/samplePolling.ts +12 -12
  183. inspect_ai/_view/www/src/state/sampleSlice.ts +3 -5
  184. inspect_ai/_view/www/src/state/sampleUtils.ts +1 -1
  185. inspect_ai/_view/www/src/{scoring/utils.ts → state/scoring.ts} +2 -2
  186. inspect_ai/_view/www/src/state/store.ts +9 -7
  187. inspect_ai/_view/www/src/state/utils.ts +1 -1
  188. inspect_ai/_view/www/src/tests/README.md +49 -0
  189. inspect_ai/_view/www/src/tests/__mocks__/fileMock.js +1 -0
  190. inspect_ai/_view/www/src/tests/__mocks__/styleMock.js +1 -0
  191. inspect_ai/_view/www/src/tests/setupTests.mjs +1 -0
  192. inspect_ai/_view/www/src/tests/utils/base64.test.ts +23 -0
  193. inspect_ai/_view/www/src/tests/utils/format.test.ts +127 -0
  194. inspect_ai/_view/www/src/tests/utils/path.test.ts +54 -0
  195. inspect_ai/_view/www/src/utils/format.ts +8 -2
  196. inspect_ai/_view/www/src/utils/path.ts +14 -2
  197. inspect_ai/_view/www/src/utils/polling.ts +1 -2
  198. inspect_ai/_view/www/src/utils/uri.ts +32 -0
  199. inspect_ai/_view/www/yarn.lock +3310 -382
  200. inspect_ai/agent/_handoff.py +6 -3
  201. inspect_ai/agent/_human/agent.py +5 -3
  202. inspect_ai/agent/_human/install.py +16 -7
  203. inspect_ai/agent/_human/panel.py +14 -1
  204. inspect_ai/agent/_human/service.py +5 -1
  205. inspect_ai/agent/_react.py +161 -128
  206. inspect_ai/agent/_types.py +15 -4
  207. inspect_ai/approval/_policy.py +2 -2
  208. inspect_ai/log/_file.py +30 -11
  209. inspect_ai/log/_log.py +7 -1
  210. inspect_ai/log/_recorders/eval.py +3 -0
  211. inspect_ai/log/_recorders/types.py +1 -0
  212. inspect_ai/log/_samples.py +4 -0
  213. inspect_ai/model/_call_tools.py +33 -17
  214. inspect_ai/model/_generate_config.py +10 -2
  215. inspect_ai/model/_model.py +41 -21
  216. inspect_ai/model/_model_output.py +2 -1
  217. inspect_ai/model/_openai.py +10 -8
  218. inspect_ai/model/_openai_responses.py +95 -42
  219. inspect_ai/model/_providers/anthropic.py +14 -12
  220. inspect_ai/model/_providers/google.py +191 -95
  221. inspect_ai/model/_providers/hf.py +1 -1
  222. inspect_ai/model/_providers/mistral.py +2 -3
  223. inspect_ai/model/_providers/openai.py +54 -17
  224. inspect_ai/model/_providers/openai_o1.py +1 -1
  225. inspect_ai/model/_providers/openai_responses.py +28 -16
  226. inspect_ai/model/_providers/openrouter.py +14 -0
  227. inspect_ai/model/_providers/providers.py +2 -2
  228. inspect_ai/model/_providers/util/chatapi.py +17 -7
  229. inspect_ai/model/_providers/vllm.py +1 -1
  230. inspect_ai/scorer/_metric.py +17 -1
  231. inspect_ai/scorer/_model.py +51 -6
  232. inspect_ai/scorer/_scorer.py +1 -1
  233. inspect_ai/solver/_human_agent.py +3 -0
  234. inspect_ai/solver/_plan.py +1 -1
  235. inspect_ai/solver/_solver.py +1 -1
  236. inspect_ai/solver/_use_tools.py +14 -8
  237. inspect_ai/tool/__init__.py +16 -1
  238. inspect_ai/tool/_json_rpc_helpers.py +285 -0
  239. inspect_ai/tool/_mcp/__init__.py +13 -0
  240. inspect_ai/tool/_mcp/_context.py +14 -0
  241. inspect_ai/tool/_mcp/_mcp.py +293 -0
  242. inspect_ai/tool/_mcp/_sandbox.py +104 -0
  243. inspect_ai/tool/_mcp/_types.py +31 -0
  244. inspect_ai/tool/_mcp/connection.py +60 -0
  245. inspect_ai/tool/_mcp/sampling.py +118 -0
  246. inspect_ai/tool/_mcp/server.py +112 -0
  247. inspect_ai/tool/_mcp/tools.py +34 -0
  248. inspect_ai/tool/_tool.py +13 -0
  249. inspect_ai/tool/_tool_def.py +24 -7
  250. inspect_ai/tool/_tool_support_helpers.py +129 -153
  251. inspect_ai/tool/_tools/_bash_session.py +11 -11
  252. inspect_ai/tool/_tools/_text_editor.py +6 -6
  253. inspect_ai/tool/_tools/_web_browser/_web_browser.py +8 -8
  254. inspect_ai/util/_anyio.py +31 -20
  255. inspect_ai/util/_json.py +20 -2
  256. inspect_ai/util/_sandbox/context.py +18 -7
  257. inspect_ai/util/_sandbox/docker/compose.py +1 -1
  258. inspect_ai/util/_sandbox/docker/docker.py +92 -21
  259. inspect_ai/util/_sandbox/environment.py +33 -2
  260. inspect_ai/util/_sandbox/events.py +2 -2
  261. inspect_ai/util/_sandbox/service.py +13 -3
  262. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/METADATA +6 -2
  263. inspect_ai-0.3.92.dist-info/RECORD +732 -0
  264. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/WHEEL +1 -1
  265. inspect_ai/_view/www/src/App.tsx +0 -316
  266. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +0 -4
  267. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +0 -3
  268. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +0 -3
  269. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +0 -14
  270. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +0 -292
  271. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +0 -5
  272. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +0 -57
  273. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +0 -43
  274. inspect_ai-0.3.90.dist-info/RECORD +0 -705
  275. /inspect_ai/_view/www/src/{types → @types}/asciicinema-player.d.ts +0 -0
  276. /inspect_ai/_view/www/src/{types → @types}/jsondiffpatch.d.ts +0 -0
  277. /inspect_ai/_view/www/src/{types → @types}/markdown-it-katex.d.ts +0 -0
  278. /inspect_ai/_view/www/src/{types → @types}/prism.d.ts +0 -0
  279. /inspect_ai/_view/www/src/{appearance → app/appearance}/colors.ts +0 -0
  280. /inspect_ai/_view/www/src/{appearance → app/appearance}/fonts.ts +0 -0
  281. /inspect_ai/_view/www/src/{appearance → app/appearance}/styles.ts +0 -0
  282. /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataGrid.tsx +0 -0
  283. /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataView.module.css +0 -0
  284. /inspect_ai/_view/www/src/{metadata → app/content}/MetaDataView.tsx +0 -0
  285. /inspect_ai/_view/www/src/{metadata → app/content}/MetadataGrid.module.css +0 -0
  286. /inspect_ai/_view/www/src/{metadata → app/content}/RenderedContent.module.css +0 -0
  287. /inspect_ai/_view/www/src/{metadata → app/content}/types.ts +0 -0
  288. /inspect_ai/_view/www/src/{workspace/WorkSpaceView.module.css → app/log-view/LogView.module.css} +0 -0
  289. /inspect_ai/_view/www/src/{workspace → app/log-view}/error/TaskErrorPanel.module.css +0 -0
  290. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ModelRolesView.module.css +0 -0
  291. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/Navbar.module.css +0 -0
  292. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/PrimaryBar.module.css +0 -0
  293. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ResultsPanel.module.css +0 -0
  294. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/RunningStatusPanel.module.css +0 -0
  295. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/ScoreGrid.module.css +0 -0
  296. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/SecondaryBar.module.css +0 -0
  297. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/StatusPanel.module.css +0 -0
  298. /inspect_ai/_view/www/src/{workspace → app/log-view}/navbar/StatusPanel.tsx +0 -0
  299. /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/InfoTab.module.css +0 -0
  300. /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/JsonTab.module.css +0 -0
  301. /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/RunningNoSamples.module.css +0 -0
  302. /inspect_ai/_view/www/src/{workspace → app/log-view}/tabs/RunningNoSamples.tsx +0 -0
  303. /inspect_ai/_view/www/src/{workspace → app/log-view}/types.ts +0 -0
  304. /inspect_ai/_view/www/src/{workspace → app/log-view}/utils.ts +0 -0
  305. /inspect_ai/_view/www/src/{plan → app/plan}/DatasetDetailView.module.css +0 -0
  306. /inspect_ai/_view/www/src/{plan → app/plan}/DetailStep.module.css +0 -0
  307. /inspect_ai/_view/www/src/{plan → app/plan}/ModelCard.module.css +0 -0
  308. /inspect_ai/_view/www/src/{plan → app/plan}/PlanDetailView.module.css +0 -0
  309. /inspect_ai/_view/www/src/{plan → app/plan}/ScorerDetailView.module.css +0 -0
  310. /inspect_ai/_view/www/src/{plan → app/plan}/ScorerDetailView.tsx +0 -0
  311. /inspect_ai/_view/www/src/{plan → app/plan}/SolverDetailView.module.css +0 -0
  312. /inspect_ai/_view/www/src/{samples → app/samples}/InlineSampleDisplay.module.css +0 -0
  313. /inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatMessageRow.module.css +0 -0
  314. /inspect_ai/_view/www/src/{samples → app/samples}/chat/ChatViewVirtualList.module.css +0 -0
  315. /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/BooleanScoreDescriptor.module.css +0 -0
  316. /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/ObjectScoreDescriptor.module.css +0 -0
  317. /inspect_ai/_view/www/src/{samples → app/samples}/descriptor/score/PassFailScoreDescriptor.module.css +0 -0
  318. /inspect_ai/_view/www/src/{samples → app/samples}/error/FlatSampleErrorView.module.css +0 -0
  319. /inspect_ai/_view/www/src/{samples → app/samples}/error/FlatSampleErrorView.tsx +0 -0
  320. /inspect_ai/_view/www/src/{samples → app/samples}/error/SampleErrorView.tsx +0 -0
  321. /inspect_ai/_view/www/src/{samples → app/samples}/error/error.ts +0 -0
  322. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleFooter.module.css +0 -0
  323. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleFooter.tsx +0 -0
  324. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleHeader.module.css +0 -0
  325. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleList.module.css +0 -0
  326. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleSeparator.module.css +0 -0
  327. /inspect_ai/_view/www/src/{samples → app/samples}/list/SampleSeparator.tsx +0 -0
  328. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/EpochFilter.module.css +0 -0
  329. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/EpochFilter.tsx +0 -0
  330. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SelectScorer.module.css +0 -0
  331. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/SortFilter.module.css +0 -0
  332. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/SampleFilter.module.css +0 -0
  333. /inspect_ai/_view/www/src/{samples → app/samples}/sample-tools/sample-filter/tokenize.ts +0 -0
  334. /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScores.module.css +0 -0
  335. /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresGrid.module.css +0 -0
  336. /inspect_ai/_view/www/src/{samples → app/samples}/scores/SampleScoresView.module.css +0 -0
  337. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/InfoEventView.module.css +0 -0
  338. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/LoggerEventView.module.css +0 -0
  339. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SampleInitEventView.module.css +0 -0
  340. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SandboxEventView.module.css +0 -0
  341. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/ScoreEventView.module.css +0 -0
  342. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/SubtaskEventView.module.css +0 -0
  343. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/ToolEventView.module.css +0 -0
  344. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNav.module.css +0 -0
  345. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNav.tsx +0 -0
  346. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNavs.module.css +0 -0
  347. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventNavs.tsx +0 -0
  348. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventPanel.module.css +0 -0
  349. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventProgressPanel.module.css +0 -0
  350. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventProgressPanel.tsx +0 -0
  351. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventRow.module.css +0 -0
  352. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventRow.tsx +0 -0
  353. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventSection.module.css +0 -0
  354. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventSection.tsx +0 -0
  355. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/event/EventTimingPanel.module.css +0 -0
  356. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateDiffView.tsx +0 -0
  357. /inspect_ai/_view/www/src/{samples → app/samples}/transcript/state/StateEventView.module.css +0 -0
  358. /inspect_ai/_view/www/src/{workspace → app}/sidebar/EvalStatus.module.css +0 -0
  359. /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarLogEntry.module.css +0 -0
  360. /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoreView.module.css +0 -0
  361. /inspect_ai/_view/www/src/{workspace → app}/sidebar/SidebarScoresView.module.css +0 -0
  362. /inspect_ai/_view/www/src/{usage → app/usage}/ModelUsagePanel.module.css +0 -0
  363. /inspect_ai/_view/www/src/{usage → app/usage}/TokenTable.module.css +0 -0
  364. /inspect_ai/_view/www/src/{usage → app/usage}/UsageCard.module.css +0 -0
  365. /inspect_ai/_view/www/src/{api → client/api}/api-shared.ts +0 -0
  366. /inspect_ai/_view/www/src/{api → client/api}/jsonrpc.ts +0 -0
  367. /inspect_ai/_view/www/src/{logfile → client/remote}/remoteZipFile.ts +0 -0
  368. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/entry_points.txt +0 -0
  369. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/licenses/LICENSE +0 -0
  370. {inspect_ai-0.3.90.dist-info → inspect_ai-0.3.92.dist-info}/top_level.txt +0 -0
@@ -29,9 +29,9 @@ from .._openai import (
29
29
  OpenAIAsyncHttpxClient,
30
30
  is_computer_use_preview,
31
31
  is_gpt,
32
- is_o1_mini,
33
- is_o1_preview,
34
- is_o1_pro,
32
+ is_o1,
33
+ is_o1_early,
34
+ is_o3_mini,
35
35
  is_o_series,
36
36
  model_output_from_openai,
37
37
  openai_chat_messages,
@@ -62,6 +62,9 @@ class OpenAIAPI(ModelAPI):
62
62
  api_key: str | None = None,
63
63
  config: GenerateConfig = GenerateConfig(),
64
64
  responses_api: bool | None = None,
65
+ responses_store: Literal["auto"] | bool = "auto",
66
+ service_tier: str | None = None,
67
+ client_timeout: float | None = None,
65
68
  **model_args: Any,
66
69
  ) -> None:
67
70
  # extract azure service prefix from model name (other providers
@@ -82,9 +85,25 @@ class OpenAIAPI(ModelAPI):
82
85
  config=config,
83
86
  )
84
87
 
85
- # note whether we are forcing the responses_api
86
- self.responses_api = (
87
- responses_api or self.is_o1_pro() or self.is_computer_use_preview()
88
+ # is this a model we use responses api by default for?
89
+ responses_model = (
90
+ self.is_o_series() and not self.is_o1_early()
91
+ ) or self.is_computer_use_preview()
92
+
93
+ # resolve whether we are forcing the responses api
94
+ self.responses_api = responses_api or responses_model
95
+
96
+ # resolve whether we are using the responses store
97
+ self.responses_store = (
98
+ responses_store if isinstance(responses_store, bool) else responses_model
99
+ )
100
+
101
+ # set service tier if specified
102
+ self.service_tier = service_tier
103
+
104
+ # bump up default client timeout to 15 minutes for service_tier=="flex"
105
+ self.client_timeout = client_timeout or (
106
+ 900.0 if self.service_tier == "flex" else None
88
107
  )
89
108
 
90
109
  # resolve api_key
@@ -140,6 +159,7 @@ class OpenAIAPI(ModelAPI):
140
159
  api_version=api_version,
141
160
  azure_endpoint=base_url,
142
161
  http_client=http_client,
162
+ timeout=client_timeout if client_timeout is not None else NOT_GIVEN,
143
163
  **model_args,
144
164
  )
145
165
  else:
@@ -147,6 +167,7 @@ class OpenAIAPI(ModelAPI):
147
167
  api_key=self.api_key,
148
168
  base_url=model_base_url(base_url, "OPENAI_BASE_URL"),
149
169
  http_client=http_client,
170
+ timeout=client_timeout if client_timeout is not None else NOT_GIVEN,
150
171
  **model_args,
151
172
  )
152
173
 
@@ -159,14 +180,14 @@ class OpenAIAPI(ModelAPI):
159
180
  def is_o_series(self) -> bool:
160
181
  return is_o_series(self.service_model_name())
161
182
 
162
- def is_o1_pro(self) -> bool:
163
- return is_o1_pro(self.service_model_name())
183
+ def is_o1(self) -> bool:
184
+ return is_o1(self.service_model_name())
164
185
 
165
- def is_o1_mini(self) -> bool:
166
- return is_o1_mini(self.service_model_name())
186
+ def is_o1_early(self) -> bool:
187
+ return is_o1_early(self.service_model_name())
167
188
 
168
- def is_o1_preview(self) -> bool:
169
- return is_o1_preview(self.service_model_name())
189
+ def is_o3_mini(self) -> bool:
190
+ return is_o3_mini(self.service_model_name())
170
191
 
171
192
  def is_computer_use_preview(self) -> bool:
172
193
  return is_computer_use_preview(self.service_model_name())
@@ -184,8 +205,18 @@ class OpenAIAPI(ModelAPI):
184
205
 
185
206
  @override
186
207
  def tool_result_images(self) -> bool:
187
- # o1-pro, o1, and computer_use_preview support image inputs (but we're not strictly supporting o1)
188
- return self.is_o1_pro() or self.is_computer_use_preview()
208
+ # o1-pro, o1, and computer_use_preview support image inputs
209
+ if self.is_computer_use_preview():
210
+ return True
211
+ elif self.is_o_series():
212
+ if self.is_o1_early():
213
+ return False
214
+ elif self.is_o3_mini():
215
+ return False
216
+ else:
217
+ return True
218
+ else:
219
+ return False
189
220
 
190
221
  @override
191
222
  def disable_computer_screenshot_truncation(self) -> bool:
@@ -203,7 +234,7 @@ class OpenAIAPI(ModelAPI):
203
234
  config: GenerateConfig,
204
235
  ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
205
236
  # short-circuit to call o1- models that are text only
206
- if self.is_o1_preview() or self.is_o1_mini():
237
+ if self.is_o1_early():
207
238
  return await generate_o1(
208
239
  client=self.client,
209
240
  input=input,
@@ -219,6 +250,8 @@ class OpenAIAPI(ModelAPI):
219
250
  tools=tools,
220
251
  tool_choice=tool_choice,
221
252
  config=config,
253
+ service_tier=self.service_tier,
254
+ store=self.responses_store,
222
255
  )
223
256
 
224
257
  # allocate request_id (so we can see it from ModelCall)
@@ -248,7 +281,7 @@ class OpenAIAPI(ModelAPI):
248
281
  # determine system role
249
282
  # o1-mini does not support developer or system messages
250
283
  # (see Dec 17, 2024 changelog: https://platform.openai.com/docs/changelog)
251
- if self.is_o1_mini():
284
+ if self.is_o1_early():
252
285
  system_role: Literal["user", "system", "developer"] = "user"
253
286
  # other o-series models use 'developer' rather than 'system' messages
254
287
  # https://platform.openai.com/docs/guides/reasoning#advice-on-prompting
@@ -309,6 +342,10 @@ class OpenAIAPI(ModelAPI):
309
342
  # first call the default processing
310
343
  params = openai_completion_params(self.service_model_name(), config, tools)
311
344
 
345
+ # add service_tier if specified
346
+ if self.service_tier is not None:
347
+ params["service_tier"] = self.service_tier
348
+
312
349
  # now tailor to current model
313
350
  if config.max_tokens is not None:
314
351
  if self.is_o_series():
@@ -329,7 +366,7 @@ class OpenAIAPI(ModelAPI):
329
366
 
330
367
  # remove reasoning_effort if not supported
331
368
  if "reasoning_effort" in params.keys() and (
332
- self.is_gpt() or self.is_o1_mini() or self.is_o1_preview()
369
+ self.is_gpt() or self.is_o1_early()
333
370
  ):
334
371
  del params["reasoning_effort"]
335
372
 
@@ -212,7 +212,7 @@ class O1PreviewChatAPIHandler(ChatAPIHandler):
212
212
  prompt that asks the model to use the <tool_call>...</tool_call> syntax)
213
213
  """
214
214
  # extract tool calls
215
- tool_call_regex = rf"<{TOOL_CALL}>((?:.|\n)*?)</{TOOL_CALL}>"
215
+ tool_call_regex = rf"<{TOOL_CALL}>\s*(\{{[\s\S]*?\}})\s*</{TOOL_CALL}>"
216
216
  tool_calls_content: list[str] = re.findall(tool_call_regex, response)
217
217
 
218
218
  # if there are tool calls proceed with parsing
@@ -15,9 +15,7 @@ from .._model_output import ModelOutput, ModelUsage
15
15
  from .._openai import (
16
16
  OpenAIResponseError,
17
17
  is_computer_use_preview,
18
- is_gpt,
19
- is_o1_mini,
20
- is_o1_preview,
18
+ is_o1_early,
21
19
  is_o_series,
22
20
  openai_handle_bad_request,
23
21
  openai_media_filter,
@@ -41,6 +39,8 @@ async def generate_responses(
41
39
  tools: list[ToolInfo],
42
40
  tool_choice: ToolChoice,
43
41
  config: GenerateConfig,
42
+ service_tier: str | None,
43
+ store: bool,
44
44
  ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]:
45
45
  # allocate request_id (so we can see it from ModelCall)
46
46
  request_id = http_hooks.start_request()
@@ -61,14 +61,20 @@ async def generate_responses(
61
61
  # prepare request (we do this so we can log the ModelCall)
62
62
  tool_params = openai_responses_tools(tools, config) if len(tools) > 0 else NOT_GIVEN
63
63
  request = dict(
64
- input=await openai_responses_inputs(input, model_name),
64
+ input=await openai_responses_inputs(input, model_name, store),
65
65
  tools=tool_params,
66
66
  tool_choice=openai_responses_tool_choice(tool_choice, tool_params)
67
67
  if isinstance(tool_params, list) and tool_choice != "auto"
68
68
  else NOT_GIVEN,
69
69
  truncation="auto" if is_computer_use_preview(model_name) else NOT_GIVEN,
70
70
  extra_headers={HttpxHooks.REQUEST_ID_HEADER: request_id},
71
- **completion_params_responses(model_name, config, len(tools) > 0),
71
+ **completion_params_responses(
72
+ model_name,
73
+ config=config,
74
+ service_tier=service_tier,
75
+ tools=len(tools) > 0,
76
+ store=store,
77
+ ),
72
78
  )
73
79
 
74
80
  try:
@@ -110,7 +116,12 @@ async def generate_responses(
110
116
 
111
117
 
112
118
  def completion_params_responses(
113
- model_name: str, config: GenerateConfig, tools: bool
119
+ model_name: str,
120
+ *,
121
+ config: GenerateConfig,
122
+ service_tier: str | None,
123
+ tools: bool,
124
+ store: bool,
114
125
  ) -> dict[str, Any]:
115
126
  # TODO: we'll need a computer_use_preview bool for the 'include'
116
127
  # and 'reasoning' parameters
@@ -120,9 +131,9 @@ def completion_params_responses(
120
131
  f"OpenAI Responses API does not support the '{param}' parameter.",
121
132
  )
122
133
 
123
- params: dict[str, Any] = dict(
124
- model=model_name, store=is_computer_use_preview(model_name)
125
- )
134
+ params: dict[str, Any] = dict(model=model_name, store=store)
135
+ if service_tier is not None:
136
+ params["service_tier"] = service_tier
126
137
  if config.max_tokens is not None:
127
138
  params["max_output_tokens"] = config.max_tokens
128
139
  if config.frequency_penalty is not None:
@@ -153,13 +164,14 @@ def completion_params_responses(
153
164
  unsupported_warning("top_logprobs")
154
165
  if tools and config.parallel_tool_calls is not None and not is_o_series(model_name):
155
166
  params["parallel_tool_calls"] = config.parallel_tool_calls
156
- if (
157
- config.reasoning_effort is not None
158
- and not is_gpt(model_name)
159
- and not is_o1_mini(model_name)
160
- and not is_o1_preview(model_name)
161
- ):
162
- params["reasoning"] = dict(effort=config.reasoning_effort)
167
+ if is_o_series(model_name) and not is_o1_early(model_name):
168
+ reasoning: dict[str, str] = {}
169
+ if config.reasoning_effort is not None:
170
+ reasoning["effort"] = config.reasoning_effort
171
+ if config.reasoning_summary is not None:
172
+ reasoning["summary"] = config.reasoning_summary
173
+ if len(reasoning) > 0:
174
+ params["reasoning"] = reasoning
163
175
  if config.response_schema is not None:
164
176
  params["text"] = dict(
165
177
  format=ResponseFormatTextJSONSchemaConfigParam(
@@ -111,6 +111,20 @@ class OpenRouterAPI(OpenAICompatibleAPI):
111
111
  # default params
112
112
  params = super().completion_params(config, tools)
113
113
 
114
+ # remove reasoning_effort if it exists
115
+ if "reasoning_effort" in params:
116
+ del params["reasoning_effort"]
117
+
118
+ # provide openrouter standard reasoning options
119
+ # https://openrouter.ai/docs/use-cases/reasoning-tokens
120
+ if config.reasoning_effort is not None or config.reasoning_tokens is not None:
121
+ reasoning: dict[str, str | int] = dict()
122
+ if config.reasoning_effort is not None:
123
+ reasoning["effort"] = config.reasoning_effort
124
+ if config.reasoning_tokens is not None:
125
+ reasoning["max_tokens"] = config.reasoning_tokens
126
+ params["reasoning"] = reasoning
127
+
114
128
  # pass args if specified
115
129
  EXTRA_BODY = "extra_body"
116
130
  if self.models or self.provider or self.transforms:
@@ -105,7 +105,7 @@ def vertex() -> type[ModelAPI]:
105
105
  def google() -> type[ModelAPI]:
106
106
  FEATURE = "Google API"
107
107
  PACKAGE = "google-genai"
108
- MIN_VERSION = "1.8.0"
108
+ MIN_VERSION = "1.12.1"
109
109
 
110
110
  # verify we have the package
111
111
  try:
@@ -267,7 +267,7 @@ def none() -> type[ModelAPI]:
267
267
  def validate_openai_client(feature: str) -> None:
268
268
  FEATURE = feature
269
269
  PACKAGE = "openai"
270
- MIN_VERSION = "1.69.0"
270
+ MIN_VERSION = "1.75.0"
271
271
 
272
272
  # verify we have the package
273
273
  try:
@@ -100,10 +100,20 @@ async def chat_api_request(
100
100
  # look at its `__cause__`. we've observed Cloudflare giving transient 500
101
101
  # status as well as a ReadTimeout, so we count these as rate limit errors
102
102
  def should_retry_chat_api_error(ex: BaseException) -> bool:
103
- return isinstance(ex, RetryError) and (
104
- (
105
- isinstance(ex.__cause__, httpx.HTTPStatusError)
106
- and is_retryable_http_status(ex.__cause__.response.status_code)
107
- )
108
- or isinstance(ex.__cause__, httpx.ReadTimeout)
109
- )
103
+ # not a tenacity RetryError
104
+ if not isinstance(ex, RetryError):
105
+ return False
106
+
107
+ cause = ex.__cause__
108
+
109
+ if cause is None:
110
+ raise RuntimeError(f"Tenacity RetryError with no __cause__: {ex}")
111
+
112
+ if isinstance(cause, httpx.HTTPStatusError):
113
+ if is_retryable_http_status(cause.response.status_code):
114
+ return True
115
+
116
+ if httpx_should_retry(cause):
117
+ return True
118
+
119
+ return False
@@ -104,7 +104,7 @@ class VLLMAPI(ModelAPI):
104
104
 
105
105
  # set which GPUs are available to use
106
106
  if device is not None:
107
- os.environ["CUDA_VISIBLE_DEVICES"] = str(device)
107
+ os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(device)
108
108
 
109
109
  # tell vllm how many GPUs to use
110
110
  if "tensor_parallel_size" not in model_args:
@@ -5,6 +5,7 @@ from typing import (
5
5
  Callable,
6
6
  ParamSpec,
7
7
  Protocol,
8
+ Type,
8
9
  Union,
9
10
  cast,
10
11
  overload,
@@ -24,6 +25,7 @@ from inspect_ai._util.registry import (
24
25
  registry_params,
25
26
  registry_tag,
26
27
  )
28
+ from inspect_ai.dataset._dataset import MT, metadata_as
27
29
 
28
30
  logger = getLogger(__name__)
29
31
 
@@ -121,6 +123,20 @@ class SampleScore(BaseModel):
121
123
  sample_metadata: dict[str, Any] | None = Field(default=None)
122
124
  """Metadata from the sample"""
123
125
 
126
+ def sample_metadata_as(self, metadata_cls: Type[MT]) -> MT | None:
127
+ """Pydantic model interface to sample metadata.
128
+
129
+ Args:
130
+ metadata_cls: Pydantic model type
131
+
132
+ Returns:
133
+ BaseModel: Instance of metadata_cls bound to sample metadata.
134
+ """
135
+ if self.sample_metadata is not None:
136
+ return metadata_as(self.sample_metadata, metadata_cls)
137
+ else:
138
+ return None
139
+
124
140
  scorer: str | None = Field(default=None)
125
141
  """Registry name of scorer that created this score."""
126
142
 
@@ -265,7 +281,7 @@ def metric_create(name: str, **kwargs: Any) -> Metric:
265
281
  Returns:
266
282
  Metric with registry info attribute
267
283
  """
268
- return cast(Metric, registry_create("metric", name, **kwargs))
284
+ return registry_create("metric", name, **kwargs)
269
285
 
270
286
 
271
287
  def to_metric_specs(
@@ -1,7 +1,8 @@
1
1
  import re
2
2
  from functools import partial
3
- from typing import Callable
3
+ from typing import Any, Callable
4
4
 
5
+ from inspect_ai._util.content import Content, ContentText
5
6
  from inspect_ai._util.dict import omit
6
7
  from inspect_ai._util.format import format_function_call
7
8
  from inspect_ai._util.list import remove_last_match_and_after
@@ -13,6 +14,7 @@ from inspect_ai.model._chat_message import (
13
14
  ChatMessageUser,
14
15
  )
15
16
  from inspect_ai.model._model import Model, get_model
17
+ from inspect_ai.model._model_output import ModelOutput
16
18
  from inspect_ai.solver._task_state import TaskState
17
19
  from inspect_ai.util import resource
18
20
 
@@ -166,16 +168,17 @@ def _model_graded_qa_single(
166
168
  question = state.input_text
167
169
 
168
170
  # format the scoring template
169
- score_prompt = grading_template.format(
171
+ scoring_prompt = model_scoring_prompt(
172
+ template=grading_template,
170
173
  question=question,
171
- answer=state.output.completion,
174
+ output=state.output,
172
175
  criterion=target.text,
173
176
  instructions=instructions,
174
- **metadata,
177
+ metadata=metadata,
175
178
  )
176
179
 
177
180
  # query the model for the score
178
- result = await model.generate(score_prompt)
181
+ result = await model.generate([scoring_prompt])
179
182
 
180
183
  # extract the grade
181
184
  match = re.search(grade_pattern or DEFAULT_GRADE_PATTERN, result.completion)
@@ -186,7 +189,7 @@ def _model_graded_qa_single(
186
189
  explanation=result.completion,
187
190
  metadata=dict(
188
191
  grading=[
189
- ChatMessageUser(content=score_prompt),
192
+ scoring_prompt,
190
193
  result.message,
191
194
  ]
192
195
  ),
@@ -300,3 +303,45 @@ def chat_history(state: TaskState) -> str:
300
303
  )
301
304
 
302
305
  return "\n\n".join(history)
306
+
307
+
308
+ def model_scoring_prompt(
309
+ *,
310
+ template: str,
311
+ question: str,
312
+ output: ModelOutput,
313
+ criterion: str,
314
+ instructions: str,
315
+ metadata: dict[str, Any],
316
+ ) -> ChatMessageUser:
317
+ # we need to remove media objects from output and reference them as attachments in the answer
318
+ answer = output.completion
319
+ media: list[Content] = (
320
+ [
321
+ content
322
+ for content in output.message.content
323
+ if content.type in ["image", "audio", "video"]
324
+ ]
325
+ if len(output.choices) > 0 and isinstance(output.message.content, list)
326
+ else []
327
+ )
328
+ if len(media) > 0:
329
+ if len(answer) > 0:
330
+ answer = f"{answer} (see also attached media)"
331
+ else:
332
+ answer = "See attached media"
333
+
334
+ # format the prompt
335
+ prompt = template.format(
336
+ question=question,
337
+ answer=answer,
338
+ criterion=criterion,
339
+ instructions=instructions,
340
+ **metadata,
341
+ )
342
+
343
+ # return with media if necessary
344
+ if len(media) > 0:
345
+ return ChatMessageUser(content=[ContentText(text=prompt)] + media)
346
+ else:
347
+ return ChatMessageUser(content=prompt)
@@ -117,7 +117,7 @@ def scorer_create(name: str, **kwargs: Any) -> Scorer:
117
117
  Returns:
118
118
  Scorer with registry info attribute
119
119
  """
120
- return cast(Scorer, registry_create("scorer", name, **kwargs))
120
+ return registry_create("scorer", name, **kwargs)
121
121
 
122
122
 
123
123
  def scorer(
@@ -13,6 +13,7 @@ def human_agent(
13
13
  answer: bool | str = True,
14
14
  intermediate_scoring: bool = False,
15
15
  record_session: bool = True,
16
+ user: str | None = None,
16
17
  ) -> Solver:
17
18
  """Human solver for agentic tasks that run in a Linux environment.
18
19
 
@@ -32,6 +33,7 @@ def human_agent(
32
33
  that the answer matches the expected format.
33
34
  intermediate_scoring: Allow the human agent to check their score while working.
34
35
  record_session: Record all user commands and outputs in the sandbox bash session.
36
+ user: User to login as. Defaults to the sandbox environment's default user.
35
37
 
36
38
  Returns:
37
39
  Solver: Human agent solver.
@@ -48,5 +50,6 @@ def human_agent(
48
50
  answer=answer,
49
51
  intermediate_scoring=intermediate_scoring,
50
52
  record_session=record_session,
53
+ user=user,
51
54
  )
52
55
  )
@@ -230,4 +230,4 @@ def plan_create(name: str, **kwargs: Any) -> Plan:
230
230
  Returns:
231
231
  Plan with registry info attribute
232
232
  """
233
- return cast(Plan, registry_create("plan", name, **kwargs)) # type: ignore[arg-type]
233
+ return registry_create("plan", name, **kwargs)
@@ -136,7 +136,7 @@ def solver_create(name: str, **kwargs: Any) -> Solver:
136
136
  Returns:
137
137
  Solver with registry info attribute
138
138
  """
139
- return cast(Solver, registry_create("solver", name, **kwargs))
139
+ return registry_create("solver", name, **kwargs)
140
140
 
141
141
 
142
142
  SolverType: TypeAlias = Solver | Agent
@@ -1,4 +1,7 @@
1
+ from typing import Sequence
2
+
1
3
  from inspect_ai.tool import Tool, ToolChoice
4
+ from inspect_ai.tool._tool import ToolSource
2
5
  from inspect_ai.tool._tool_def import ToolDef
3
6
 
4
7
  from ._solver import Generate, Solver, solver
@@ -7,7 +10,7 @@ from ._task_state import TaskState
7
10
 
8
11
  @solver
9
12
  def use_tools(
10
- *tools: Tool | list[Tool],
13
+ *tools: Tool | ToolDef | ToolSource | Sequence[Tool | ToolDef | ToolSource],
11
14
  tool_choice: ToolChoice | None = "auto",
12
15
  append: bool = False,
13
16
  ) -> Solver:
@@ -34,17 +37,20 @@ def use_tools(
34
37
  tools_update: list[Tool] = []
35
38
 
36
39
  # add tool function to take care of tool/tool_def
37
- def add_tool(tool: Tool | ToolDef) -> None:
38
- if isinstance(tool, ToolDef):
39
- tool = tool.as_tool()
40
- tools_update.append(tool)
40
+ async def add_tools(tool: Tool | ToolDef | ToolSource) -> None:
41
+ if isinstance(tool, ToolSource):
42
+ tools_update.extend(await tool.tools())
43
+ else:
44
+ if isinstance(tool, ToolDef):
45
+ tool = tool.as_tool()
46
+ tools_update.append(tool)
41
47
 
42
48
  for tool in tools:
43
- if isinstance(tool, list):
49
+ if isinstance(tool, Sequence):
44
50
  for t in tool:
45
- add_tool(t)
51
+ await add_tools(t)
46
52
  else:
47
- add_tool(tool)
53
+ await add_tools(tool)
48
54
  if len(tools_update) > 0:
49
55
  if append:
50
56
  existing_tools = state.tools
@@ -8,7 +8,15 @@ from inspect_ai._util.content import (
8
8
  )
9
9
  from inspect_ai._util.deprecation import relocated_module_attribute
10
10
 
11
- from ._tool import Tool, ToolError, ToolResult, tool
11
+ from ._mcp import (
12
+ MCPServer,
13
+ mcp_connection,
14
+ mcp_server_sandbox,
15
+ mcp_server_sse,
16
+ mcp_server_stdio,
17
+ mcp_tools,
18
+ )
19
+ from ._tool import Tool, ToolError, ToolResult, ToolSource, tool
12
20
  from ._tool_call import (
13
21
  ToolCall,
14
22
  ToolCallContent,
@@ -45,6 +53,13 @@ __all__ = [
45
53
  "ToolCallError",
46
54
  "ToolError",
47
55
  "ToolResult",
56
+ "ToolSource",
57
+ "mcp_tools",
58
+ "mcp_connection",
59
+ "mcp_server_stdio",
60
+ "mcp_server_sse",
61
+ "mcp_server_sandbox",
62
+ "MCPServer",
48
63
  "Content",
49
64
  "ContentAudio",
50
65
  "ContentImage",