inspect-ai 0.3.61__py3-none-any.whl → 0.3.63__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (430)
  1. inspect_ai/_cli/eval.py +13 -0
  2. inspect_ai/_cli/main.py +1 -1
  3. inspect_ai/_cli/trace.py +8 -0
  4. inspect_ai/_cli/view.py +4 -0
  5. inspect_ai/_display/core/active.py +2 -3
  6. inspect_ai/_display/textual/widgets/transcript.py +15 -9
  7. inspect_ai/_eval/eval.py +4 -4
  8. inspect_ai/_eval/evalset.py +6 -6
  9. inspect_ai/_eval/task/error.py +10 -14
  10. inspect_ai/_eval/task/run.py +13 -8
  11. inspect_ai/_util/hash.py +1 -1
  12. inspect_ai/_util/transcript.py +11 -0
  13. inspect_ai/_view/www/.vscode/extensions.json +3 -0
  14. inspect_ai/_view/www/.vscode/settings.json +8 -0
  15. inspect_ai/_view/www/App.css +92 -29
  16. inspect_ai/_view/www/dist/assets/index.css +16636 -14674
  17. inspect_ai/_view/www/dist/assets/index.js +43585 -36122
  18. inspect_ai/_view/www/dist/index.html +1 -1
  19. inspect_ai/_view/www/index.html +2 -2
  20. inspect_ai/_view/www/log-schema.json +36 -19
  21. inspect_ai/_view/www/package.json +22 -4
  22. inspect_ai/_view/www/postcss.config.cjs +8 -9
  23. inspect_ai/_view/www/src/{App.mjs → App.tsx} +355 -365
  24. inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
  25. inspect_ai/_view/www/src/api/api-browser.ts +2 -2
  26. inspect_ai/_view/www/src/api/api-http.ts +3 -5
  27. inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
  28. inspect_ai/_view/www/src/api/client-api.ts +4 -4
  29. inspect_ai/_view/www/src/api/index.ts +4 -4
  30. inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
  31. inspect_ai/_view/www/src/appearance/colors.ts +9 -0
  32. inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
  33. inspect_ai/_view/www/src/appearance/icons.ts +100 -0
  34. inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
  35. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
  36. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
  37. inspect_ai/_view/www/src/components/Card.css +60 -0
  38. inspect_ai/_view/www/src/components/Card.tsx +109 -0
  39. inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
  40. inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
  41. inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
  42. inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
  43. inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
  44. inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
  45. inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
  46. inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
  47. inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
  48. inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
  49. inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
  50. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
  51. inspect_ai/_view/www/src/components/FindBand.css +49 -0
  52. inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
  53. inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
  54. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
  55. inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
  56. inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
  57. inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
  58. inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
  59. inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
  60. inspect_ai/_view/www/src/components/LargeModal.tsx +199 -0
  61. inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
  62. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
  63. inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
  64. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
  65. inspect_ai/_view/www/src/components/MessageBand.css +43 -0
  66. inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
  67. inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
  68. inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
  69. inspect_ai/_view/www/src/components/NavPills.tsx +99 -0
  70. inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
  71. inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
  72. inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
  73. inspect_ai/_view/www/src/components/TabSet.tsx +200 -0
  74. inspect_ai/_view/www/src/components/ToolButton.css +3 -0
  75. inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
  76. inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
  77. inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
  78. inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
  79. inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -7
  80. inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
  81. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
  82. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
  83. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
  84. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
  85. inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
  86. inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
  87. inspect_ai/_view/www/src/metadata/types.ts +18 -0
  88. inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
  89. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
  90. inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
  91. inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
  92. inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
  93. inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
  94. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +309 -0
  95. inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
  96. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
  97. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
  98. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
  99. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
  100. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
  101. inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
  102. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
  103. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +326 -0
  104. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
  105. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +175 -0
  106. inspect_ai/_view/www/src/samples/SamplesTools.tsx +60 -0
  107. inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
  108. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
  109. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
  110. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
  111. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
  112. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +46 -0
  113. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
  114. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
  115. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
  116. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +143 -0
  117. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
  118. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +131 -0
  119. inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
  120. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +145 -0
  121. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
  122. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +86 -0
  123. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
  124. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +53 -0
  125. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
  126. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
  127. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +107 -0
  128. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +363 -0
  129. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
  130. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
  131. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
  132. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
  133. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
  134. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
  135. inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
  136. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
  137. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
  138. inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
  139. inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
  140. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
  141. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
  142. inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
  143. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
  144. inspect_ai/_view/www/src/samples/error/error.ts +15 -0
  145. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
  146. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
  147. inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
  148. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
  149. inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
  150. inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
  151. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
  152. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
  153. inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
  154. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
  155. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
  156. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
  157. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
  158. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +173 -0
  159. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
  160. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +182 -0
  161. inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
  162. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
  163. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
  164. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
  165. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
  166. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
  167. inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
  168. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
  169. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
  170. inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
  171. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
  172. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
  173. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
  174. inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
  175. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
  176. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
  177. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
  178. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
  179. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
  180. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
  181. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
  182. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +108 -0
  183. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
  184. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
  185. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
  186. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
  187. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
  188. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
  189. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
  190. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
  191. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +91 -0
  192. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
  193. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
  194. inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
  195. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
  196. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
  197. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +38 -0
  198. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
  199. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +190 -0
  200. inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
  201. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
  202. inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
  203. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
  204. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
  205. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +274 -0
  206. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
  207. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
  208. inspect_ai/_view/www/src/samples/transcript/state/{StateEventView.mjs → StateEventView.tsx} +148 -110
  209. inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
  210. inspect_ai/_view/www/src/types/log.d.ts +7 -4
  211. inspect_ai/_view/www/src/types/prism.d.ts +11 -0
  212. inspect_ai/_view/www/src/types.ts +71 -0
  213. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +22 -0
  214. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
  215. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +95 -0
  216. inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
  217. inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
  218. inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
  219. inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
  220. inspect_ai/_view/www/src/utils/attachments.ts +42 -0
  221. inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
  222. inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
  223. inspect_ai/_view/www/src/utils/debugging.ts +28 -0
  224. inspect_ai/_view/www/src/utils/dom.ts +30 -0
  225. inspect_ai/_view/www/src/utils/format.ts +194 -0
  226. inspect_ai/_view/www/src/utils/git.ts +7 -0
  227. inspect_ai/_view/www/src/utils/html.ts +6 -0
  228. inspect_ai/_view/www/src/utils/http.ts +14 -0
  229. inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
  230. inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
  231. inspect_ai/_view/www/src/utils/queue.ts +51 -0
  232. inspect_ai/_view/www/src/utils/sync.ts +114 -0
  233. inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
  234. inspect_ai/_view/www/src/utils/vscode.ts +13 -0
  235. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
  236. inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
  237. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +160 -0
  238. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
  239. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
  240. inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
  241. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
  242. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
  243. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +113 -0
  244. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +67 -0
  245. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +156 -0
  246. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
  247. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +222 -0
  248. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
  249. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
  250. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
  251. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
  252. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
  253. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
  254. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
  255. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
  256. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
  257. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
  258. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
  259. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +41 -0
  260. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
  261. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +61 -0
  262. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +80 -0
  263. inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
  264. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
  265. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
  266. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
  267. inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
  268. inspect_ai/_view/www/src/workspace/types.ts +10 -0
  269. inspect_ai/_view/www/tsconfig.json +23 -9
  270. inspect_ai/_view/www/vite.config.js +8 -17
  271. inspect_ai/_view/www/yarn.lock +627 -556
  272. inspect_ai/dataset/_dataset.py +36 -0
  273. inspect_ai/dataset/_sources/csv.py +8 -0
  274. inspect_ai/dataset/_sources/file.py +4 -0
  275. inspect_ai/dataset/_sources/hf.py +11 -1
  276. inspect_ai/dataset/_sources/json.py +8 -0
  277. inspect_ai/log/_log.py +3 -6
  278. inspect_ai/log/_message.py +1 -1
  279. inspect_ai/log/_recorders/eval.py +1 -1
  280. inspect_ai/log/_recorders/json.py +5 -7
  281. inspect_ai/model/_call_tools.py +2 -1
  282. inspect_ai/model/_chat_message.py +27 -0
  283. inspect_ai/model/_conversation.py +10 -3
  284. inspect_ai/model/_generate_config.py +6 -0
  285. inspect_ai/model/_model.py +74 -0
  286. inspect_ai/model/_openai.py +33 -1
  287. inspect_ai/model/_providers/anthropic.py +12 -0
  288. inspect_ai/model/_providers/groq.py +4 -0
  289. inspect_ai/model/_providers/openai.py +21 -9
  290. inspect_ai/model/_providers/openai_o1.py +3 -5
  291. inspect_ai/model/_providers/openrouter.py +86 -0
  292. inspect_ai/model/_providers/providers.py +12 -1
  293. inspect_ai/model/_reasoning.py +17 -0
  294. inspect_ai/scorer/_answer.py +7 -7
  295. inspect_ai/scorer/_classification.py +34 -18
  296. inspect_ai/scorer/_common.py +2 -8
  297. inspect_ai/solver/_basic_agent.py +19 -9
  298. inspect_ai/solver/_multiple_choice.py +24 -9
  299. inspect_ai/tool/__init__.py +2 -0
  300. inspect_ai/tool/{beta → _tools}/_computer/_computer.py +2 -5
  301. inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +4 -0
  302. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  303. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +3 -0
  304. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +61 -0
  305. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +10 -0
  306. inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
  307. inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
  308. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  309. inspect_ai/tool/_tools/_execute.py +8 -2
  310. inspect_ai/tool/beta.py +3 -0
  311. inspect_ai/util/_sandbox/docker/docker.py +32 -85
  312. inspect_ai/util/_sandbox/self_check.py +124 -16
  313. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/METADATA +2 -1
  314. inspect_ai-0.3.63.dist-info/RECORD +618 -0
  315. inspect_ai/_view/www/src/Register.mjs +0 -3
  316. inspect_ai/_view/www/src/Types.mjs +0 -38
  317. inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
  318. inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
  319. inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
  320. inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
  321. inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
  322. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
  323. inspect_ai/_view/www/src/components/Card.mjs +0 -126
  324. inspect_ai/_view/www/src/components/ChatView.mjs +0 -418
  325. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
  326. inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
  327. inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
  328. inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
  329. inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
  330. inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
  331. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
  332. inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
  333. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
  334. inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
  335. inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
  336. inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
  337. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
  338. inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
  339. inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
  340. inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
  341. inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
  342. inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
  343. inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
  344. inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
  345. inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
  346. inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
  347. inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
  348. inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
  349. inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
  350. inspect_ai/_view/www/src/components/Tools.mjs +0 -376
  351. inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
  352. inspect_ai/_view/www/src/components/ansi-output.js +0 -932
  353. inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
  354. inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
  355. inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
  356. inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
  357. inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
  358. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
  359. inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
  360. inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
  361. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
  362. inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
  363. inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
  364. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
  365. inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
  366. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
  367. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
  368. inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
  369. inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
  370. inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
  371. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
  372. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
  373. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
  374. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
  375. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
  376. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
  377. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
  378. inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
  379. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
  380. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
  381. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
  382. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
  383. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
  384. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
  385. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
  386. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
  387. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
  388. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
  389. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
  390. inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
  391. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
  392. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
  393. inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
  394. inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
  395. inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
  396. inspect_ai/_view/www/src/utils/Format.mjs +0 -260
  397. inspect_ai/_view/www/src/utils/Git.mjs +0 -12
  398. inspect_ai/_view/www/src/utils/Html.mjs +0 -21
  399. inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
  400. inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
  401. inspect_ai/_view/www/src/utils/http.mjs +0 -18
  402. inspect_ai/_view/www/src/utils/queue.mjs +0 -67
  403. inspect_ai/_view/www/src/utils/sync.mjs +0 -101
  404. inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
  405. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
  406. inspect_ai/tool/beta/__init__.py +0 -5
  407. inspect_ai-0.3.61.dist-info/RECORD +0 -476
  408. /inspect_ai/{tool/beta/_computer/_resources/tool/__init__.py → _view/www/src/components/MorePopOver.css} +0 -0
  409. /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
  410. /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _view/www/src/workspace/tabs/InfoTab.module.css} +0 -0
  411. /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
  412. /inspect_ai/tool/{beta → _tools}/_computer/_common.py +0 -0
  413. /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
  414. /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
  415. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
  416. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
  417. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
  418. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
  419. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
  420. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
  421. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
  422. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
  423. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
  424. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
  425. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
  426. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
  427. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/LICENSE +0 -0
  428. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/WHEEL +0 -0
  429. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/entry_points.txt +0 -0
  430. {inspect_ai-0.3.61.dist-info → inspect_ai-0.3.63.dist-info}/top_level.txt +0 -0
@@ -144,6 +144,14 @@ class Dataset(Sequence[Sample], abc.ABC):
144
144
  @abc.abstractmethod
145
145
  def shuffled(self) -> bool: ...
146
146
 
147
+ @abc.abstractmethod
148
+ def shuffle_choices(self, seed: int | None = None) -> None:
149
+ """Shuffle the order of the choices with each sample.
150
+
151
+ Args:
152
+ seed: (int | None): Random seed for shuffling (optional).
153
+ """
154
+
147
155
  @overload
148
156
  def __getitem__(self, index: int) -> Sample: ...
149
157
 
@@ -315,6 +323,34 @@ class MemoryDataset(Dataset):
315
323
  random.shuffle(self.samples)
316
324
  self._shuffled = True
317
325
 
326
+ @override
327
+ def shuffle_choices(self, seed: int | None = None) -> None:
328
+ rand = random.Random(seed)
329
+ for sample in self.samples:
330
+ if not sample.choices:
331
+ continue
332
+ # The original positions
333
+ positions = list(range(len(sample.choices)))
334
+
335
+ # Shuffle the choices
336
+ rand.shuffle(positions)
337
+ shuffled_choices = [sample.choices[i] for i in positions]
338
+
339
+ # Map of original position / target letter
340
+ position_map = {i: chr(65 + new_i) for new_i, i in enumerate(positions)}
341
+
342
+ # Update to the shuffled choices and target
343
+ sample.choices = shuffled_choices
344
+ sample.target = self._remap_target(sample.target, position_map=position_map)
345
+
346
+ def _remap_target(
347
+ self, target: str | list[str], position_map: dict[int, str]
348
+ ) -> str | list[str]:
349
+ if isinstance(target, list):
350
+ return [position_map[ord(t) - 65] for t in target]
351
+ else:
352
+ return position_map[ord(target) - 65]
353
+
318
354
  @override
319
355
  def sort(
320
356
  self,
@@ -23,6 +23,7 @@ def csv_dataset(
23
23
  auto_id: bool = False,
24
24
  shuffle: bool = False,
25
25
  seed: int | None = None,
26
+ shuffle_choices: bool | int | None = None,
26
27
  limit: int | None = None,
27
28
  dialect: str = "unix",
28
29
  encoding: str = "utf-8",
@@ -45,6 +46,7 @@ def csv_dataset(
45
46
  auto_id (bool): Assign an auto-incrementing ID for each sample.
46
47
  shuffle (bool): Randomly shuffle the dataset order.
47
48
  seed: (int | None): Seed used for random shuffle.
49
+ shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
48
50
  limit (int | None): Limit the number of records to read.
49
51
  dialect (str): CSV dialect ("unix", "excel" or "excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details
50
52
  encoding (str): Text encoding for file (defaults to "utf-8").
@@ -86,6 +88,12 @@ def csv_dataset(
86
88
  if shuffle:
87
89
  dataset.shuffle(seed=seed)
88
90
 
91
+ # shuffle choices, if requested
92
+ if isinstance(shuffle_choices, int):
93
+ dataset.shuffle_choices(seed=shuffle_choices)
94
+ elif shuffle_choices is True:
95
+ dataset.shuffle_choices()
96
+
89
97
  # limit if requested
90
98
  if limit:
91
99
  return dataset[0:limit]
@@ -16,6 +16,7 @@ def file_dataset(
16
16
  auto_id: bool = False,
17
17
  shuffle: bool = False,
18
18
  seed: int | None = None,
19
+ shuffle_choices: bool | int | None = None,
19
20
  limit: int | None = None,
20
21
  dialect: str = "unix",
21
22
  encoding: str = "utf-8",
@@ -40,6 +41,7 @@ def file_dataset(
40
41
  auto_id (bool): Assign an auto-incrementing ID for each sample.
41
42
  shuffle (bool): Randomly shuffle the dataset order.
42
43
  seed: (int | None): Seed used for random shuffle.
44
+ shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
43
45
  limit (int | None): Limit the number of records to read.
44
46
  dialect (str): CSV dialect ("unix" or "excel", defaults to "unix"). Only
45
47
  applies to reading CSV files.
@@ -66,6 +68,7 @@ def file_dataset(
66
68
  auto_id=auto_id,
67
69
  shuffle=shuffle,
68
70
  seed=seed,
71
+ shuffle_choices=shuffle_choices,
69
72
  limit=limit,
70
73
  encoding=encoding,
71
74
  name=name,
@@ -78,6 +81,7 @@ def file_dataset(
78
81
  auto_id=auto_id,
79
82
  shuffle=shuffle,
80
83
  seed=seed,
84
+ shuffle_choices=shuffle_choices,
81
85
  limit=limit,
82
86
  dialect=dialect,
83
87
  encoding=encoding,
@@ -29,6 +29,7 @@ def hf_dataset(
29
29
  auto_id: bool = False,
30
30
  shuffle: bool = False,
31
31
  seed: int | None = None,
32
+ shuffle_choices: bool | int | None = None,
32
33
  limit: int | None = None,
33
34
  trust: bool = False,
34
35
  cached: bool = True,
@@ -59,6 +60,7 @@ def hf_dataset(
59
60
  auto_id (bool): Assign an auto-incrementing ID for each sample.
60
61
  shuffle (bool): Randomly shuffle the dataset order.
61
62
  seed: (int | None): Seed used for random shuffle.
63
+ shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
62
64
  limit (int | None): Limit the number of records to read.
63
65
  trust (bool): Whether or not to allow for datasets defined on the Hub
64
66
  using a dataset script. This option should only be set to True for
@@ -117,8 +119,16 @@ def hf_dataset(
117
119
  dataset = dataset.select(range(limit))
118
120
 
119
121
  # return the dataset
120
- return MemoryDataset(
122
+ memory_dataset = MemoryDataset(
121
123
  samples=data_to_samples(dataset.to_list(), data_to_sample, auto_id),
122
124
  name=Path(path).stem if Path(path).exists() else path,
123
125
  location=path,
124
126
  )
127
+
128
+ # maybe shuffle the choices
129
+ if isinstance(shuffle_choices, int):
130
+ memory_dataset.shuffle_choices(seed=shuffle_choices)
131
+ elif shuffle_choices is True:
132
+ memory_dataset.shuffle_choices()
133
+
134
+ return memory_dataset
@@ -25,6 +25,7 @@ def json_dataset(
25
25
  auto_id: bool = False,
26
26
  shuffle: bool = False,
27
27
  seed: int | None = None,
28
+ shuffle_choices: bool | int | None = None,
28
29
  limit: int | None = None,
29
30
  encoding: str = "utf-8",
30
31
  name: str | None = None,
@@ -49,6 +50,7 @@ def json_dataset(
49
50
  auto_id (bool): Assign an auto-incrementing ID for each sample.
50
51
  shuffle (bool): Randomly shuffle the dataset order.
51
52
  seed: (int | None): Seed used for random shuffle.
53
+ shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
52
54
  limit (int | None): Limit the number of records to read.
53
55
  encoding (str): Text encoding for file (defaults to "utf-8").
54
56
  name (str): Optional name for dataset (for logging). If not specified,
@@ -86,6 +88,12 @@ def json_dataset(
86
88
  if shuffle:
87
89
  dataset.shuffle(seed=seed)
88
90
 
91
+ # shuffle choices, if requested
92
+ if isinstance(shuffle_choices, int):
93
+ dataset.shuffle_choices(seed=shuffle_choices)
94
+ elif shuffle_choices is True:
95
+ dataset.shuffle_choices()
96
+
89
97
  # limit if requested
90
98
  if limit:
91
99
  return dataset[0:limit]
inspect_ai/log/_log.py CHANGED
@@ -17,12 +17,7 @@ from inspect_ai._util.error import EvalError, exception_message
17
17
  from inspect_ai._util.logger import warn_once
18
18
  from inspect_ai.approval._policy import ApprovalPolicyConfig
19
19
  from inspect_ai.dataset._dataset import MT, metadata_as
20
- from inspect_ai.model import (
21
- ChatMessage,
22
- GenerateConfig,
23
- ModelOutput,
24
- ModelUsage,
25
- )
20
+ from inspect_ai.model import ChatMessage, GenerateConfig, ModelOutput, ModelUsage
26
21
  from inspect_ai.scorer import Score
27
22
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
28
23
  from inspect_ai.util._store import Store
@@ -404,6 +399,8 @@ class EvalResults(BaseModel):
404
399
  if "metrics" in values:
405
400
  metrics = values["metrics"]
406
401
  del values["metrics"]
402
+ else:
403
+ metrics = None
407
404
  # Convert the scorer to the new schema
408
405
  score = values["scorer"]
409
406
  if metrics:
@@ -5,7 +5,7 @@ from typing import Any, Literal, Type, cast
5
5
  from pydantic import BaseModel, Field, model_validator
6
6
 
7
7
  LoggingLevel = Literal[
8
- "debug", "http", "sandbox", "info", "warning", "error", "critical"
8
+ "debug", "trace", "http", "sandbox", "info", "warning", "error", "critical"
9
9
  ]
10
10
  """Logging level."""
11
11
 
@@ -203,7 +203,7 @@ class EvalRecorder(FileRecorder):
203
203
  # of small fetches from the zip file streams)
204
204
  temp_log: str | None = None
205
205
  fs = filesystem(location)
206
- if not fs.is_local():
206
+ if not fs.is_local() and header_only is False:
207
207
  with tempfile.NamedTemporaryFile(delete=False) as temp:
208
208
  temp_log = temp.name
209
209
  fs.get_file(location, temp_log)
@@ -9,12 +9,7 @@ from typing_extensions import override
9
9
 
10
10
  from inspect_ai._util.constants import LOG_SCHEMA_VERSION
11
11
  from inspect_ai._util.error import EvalError
12
- from inspect_ai._util.file import (
13
- absolute_file_path,
14
- async_fileystem,
15
- file,
16
- filesystem,
17
- )
12
+ from inspect_ai._util.file import absolute_file_path, async_fileystem, file, filesystem
18
13
  from inspect_ai._util.trace import trace_action
19
14
 
20
15
  from .._log import (
@@ -236,12 +231,13 @@ def _read_header_streaming(log_file: str) -> EvalLog:
236
231
  f.seek(0)
237
232
 
238
233
  # Parse the log file, stopping before parsing samples
234
+ status: Literal["started", "success", "cancelled", "error"] | None = None
239
235
  for k, v in ijson.kvitems(f, ""):
240
236
  if k == "status":
241
237
  assert v in get_args(
242
238
  Literal["started", "success", "cancelled", "error"]
243
239
  )
244
- status: Literal["started", "success", "cancelled", "error"] = v
240
+ status = v
245
241
  if k == "eval":
246
242
  eval = EvalSpec(**v)
247
243
  elif k == "plan":
@@ -257,6 +253,8 @@ def _read_header_streaming(log_file: str) -> EvalLog:
257
253
  error = EvalError(**v)
258
254
  break
259
255
 
256
+ assert status, "Must encounter a 'status'"
257
+
260
258
  return EvalLog(
261
259
  eval=eval,
262
260
  plan=plan,
@@ -133,7 +133,8 @@ async def call_tools(
133
133
  ):
134
134
  content: str | list[Content] = [result]
135
135
  elif isinstance(result, list) and (
136
- isinstance(
136
+ len(result) == 0
137
+ or isinstance(
137
138
  result[0], ContentText | ContentImage | ContentAudio | ContentVideo
138
139
  )
139
140
  ):
@@ -7,6 +7,8 @@ from inspect_ai._util.content import Content, ContentText
7
7
  from inspect_ai.tool import ToolCall
8
8
  from inspect_ai.tool._tool_call import ToolCallError
9
9
 
10
+ from ._reasoning import parse_content_with_reasoning
11
+
10
12
  logger = getLogger(__name__)
11
13
 
12
14
 
@@ -83,6 +85,31 @@ class ChatMessageAssistant(ChatMessageBase):
83
85
  tool_calls: list[ToolCall] | None = Field(default=None)
84
86
  """Tool calls made by the model."""
85
87
 
88
+ reasoning: str | None = Field(default=None)
89
+ """Reasoning content."""
90
+
91
+ # Some OpenAI compatible REST endpoints include reasoning as a field alongside
92
+ # content, however since this field doesn't exist in the OpenAI interface,
93
+ # hosting providers (so far we've seen this with Together and Groq) may
94
+ # include the reasoning in a <think></think> tag before the main response.
95
+ # We expect this pattern to be repeated elsewhere, so include this hook to
96
+ # automatically extract the reasoning content when the response is prefaced
97
+ # with a <think> block. If this ends up being an overreach we can fall back
98
+ # to each provider manually parsing out <think> using a helper function.
99
+ # The implementation isn't important here, the critical thing to establish
100
+ # is that Inspect makes reasoning content available separately.
101
+ @model_validator(mode="before")
102
+ @classmethod
103
+ def extract_reasoning(cls, data: Any) -> Any:
104
+ if isinstance(data, dict):
105
+ content = data.get("content", None)
106
+ if isinstance(content, str):
107
+ parsed = parse_content_with_reasoning(content)
108
+ if parsed:
109
+ data["reasoning"] = parsed.reasoning
110
+ data["content"] = parsed.content
111
+ return data
112
+
86
113
 
87
114
  class ChatMessageTool(ChatMessageBase):
88
115
  role: Literal["tool"] = Field(default="tool")
@@ -2,7 +2,7 @@ from rich.console import RenderableType
2
2
  from rich.text import Text
3
3
 
4
4
  from inspect_ai._util.rich import lines_display
5
- from inspect_ai._util.transcript import transcript_markdown
5
+ from inspect_ai._util.transcript import transcript_markdown, transcript_reasoning
6
6
  from inspect_ai.util._conversation import conversation_panel
7
7
  from inspect_ai.util._display import display_type
8
8
 
@@ -38,8 +38,15 @@ def conversation_assistant_message(
38
38
  content=transcript_markdown(m.text, escape=True),
39
39
  )
40
40
 
41
- # start with assistant content
42
- content: list[RenderableType] = (
41
+ # build content
42
+ content: list[RenderableType] = []
43
+
44
+ # reasoning
45
+ if message.reasoning:
46
+ content.extend(transcript_reasoning(message.reasoning))
47
+
48
+ # message text
49
+ content.extend(
43
50
  [transcript_markdown(message.text, escape=True)] if message.text else []
44
51
  )
45
52
 
@@ -75,6 +75,9 @@ class GenerateConfigArgs(TypedDict, total=False):
75
75
  reasoning_effort: Literal["low", "medium", "high"] | None
76
76
  """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
77
77
 
78
+ reasoning_history: bool | None
79
+ """Include reasoning in chat message history sent to generate."""
80
+
78
81
 
79
82
  class GenerateConfig(BaseModel):
80
83
  """Base class for model generation configs."""
@@ -145,6 +148,9 @@ class GenerateConfig(BaseModel):
145
148
  reasoning_effort: Literal["low", "medium", "high"] | None = Field(default=None)
146
149
  """Constrains effort on reasoning for reasoning models. Open AI o1 models only."""
147
150
 
151
+ reasoning_history: bool | None = Field(default=None)
152
+ """Include reasoning in chat message history sent to generate."""
153
+
148
154
  def merge(
149
155
  self, other: Union["GenerateConfig", GenerateConfigArgs]
150
156
  ) -> "GenerateConfig":
@@ -168,6 +168,10 @@ class ModelAPI(abc.ABC):
168
168
  """Tool results can contain images"""
169
169
  return False
170
170
 
171
+ def has_reasoning_history(self) -> bool:
172
+ """Chat message assistant messages can include reasoning."""
173
+ return False
174
+
171
175
 
172
176
  class Model:
173
177
  """Model interface."""
@@ -302,6 +306,11 @@ class Model:
302
306
  tools = []
303
307
  tool_choice = "none"
304
308
 
309
+ # handle reasoning history
310
+ input = resolve_reasoning_history(
311
+ input, config, self.api.has_reasoning_history()
312
+ )
313
+
305
314
  # apply any tool model_input handlers
306
315
  input = resolve_tool_model_input(tdefs, input)
307
316
 
@@ -726,6 +735,71 @@ def simple_input_messages(
726
735
  return messages
727
736
 
728
737
 
738
+ def resolve_reasoning_history(
739
+ messages: list[ChatMessage], config: GenerateConfig, api_has_reasoning_history: bool
740
+ ) -> list[ChatMessage]:
741
+ # determine if we are including reasoning history
742
+ reasoning_history = config.reasoning_history is not False
743
+
744
+ # determine up front if we have any reasoning content
745
+ have_reasoning = any(
746
+ [
747
+ isinstance(m, ChatMessageAssistant) and m.reasoning is not None
748
+ for m in messages
749
+ ]
750
+ )
751
+ if not have_reasoning:
752
+ return messages
753
+
754
+ # API assistant message format directly supports reasoning history so we will:
755
+ # (a) Remove reasoning content entirely if config says not to include it; or
756
+ # (b) Leave the messages alone if config says to include it
757
+ if api_has_reasoning_history:
758
+ # remove reasoning history as per config
759
+ if not reasoning_history:
760
+ resolved_messages: list[ChatMessage] = []
761
+ for message in messages:
762
+ if isinstance(message, ChatMessageAssistant):
763
+ resolved_messages.append(
764
+ message.model_copy(update={"reasoning": None})
765
+ )
766
+ else:
767
+ resolved_messages.append(message)
768
+
769
+ return resolved_messages
770
+
771
+ # include reasoning history as per config
772
+ else:
773
+ return messages
774
+
775
+ # API can't represent reasoning natively so include <think> tags
776
+ elif reasoning_history:
777
+ resolved_messages = []
778
+ for message in messages:
779
+ if (
780
+ isinstance(message, ChatMessageAssistant)
781
+ and message.reasoning is not None
782
+ ):
783
+ message = deepcopy(message)
784
+ if isinstance(message.content, str):
785
+ message.content = (
786
+ f"<think>\n{message.reasoning}\n</think>\n\n{message.content}"
787
+ )
788
+ else:
789
+ message.content.insert(
790
+ 0, ContentText(text=f"<think>\n{message.reasoning}\n</think>\n")
791
+ )
792
+ message.reasoning = None
793
+
794
+ resolved_messages.append(message)
795
+
796
+ return resolved_messages
797
+
798
+ # api doesn't handle reasoning and config says no reasoning_history, nothing to do
799
+ else:
800
+ return messages
801
+
802
+
729
803
  def resolve_tool_model_input(
730
804
  tdefs: list[ToolDef], messages: list[ChatMessage]
731
805
  ) -> list[ChatMessage]:
@@ -43,10 +43,18 @@ from ._chat_message import (
43
43
  from ._model_output import ModelUsage, StopReason, as_stop_reason
44
44
 
45
45
 
46
+ def is_o_series(name: str) -> bool:
47
+ return is_o1(name) or is_o3(name)
48
+
49
+
46
50
  def is_o1(name: str) -> bool:
47
51
  return name.startswith("o1")
48
52
 
49
53
 
54
+ def is_o3(name: str) -> bool:
55
+ return name.startswith("o3")
56
+
57
+
50
58
  def is_o1_full(name: str) -> bool:
51
59
  return is_o1(name) and not is_o1_mini(name) and not is_o1_preview(name)
52
60
 
@@ -55,10 +63,18 @@ def is_o1_mini(name: str) -> bool:
55
63
  return name.startswith("o1-mini")
56
64
 
57
65
 
66
+ def is_o3_mini(name: str) -> bool:
67
+ return name.startswith("o3-mini")
68
+
69
+
58
70
  def is_o1_preview(name: str) -> bool:
59
71
  return name.startswith("o1-preview")
60
72
 
61
73
 
74
+ def is_gpt(name: str) -> bool:
75
+ return name.startswith("gpt")
76
+
77
+
62
78
  def openai_chat_tool_call(tool_call: ToolCall) -> ChatCompletionMessageToolCall:
63
79
  return ChatCompletionMessageToolCall(
64
80
  type="function",
@@ -296,6 +312,14 @@ def chat_messages_from_openai(
296
312
  else:
297
313
  content = [content_from_openai(c) for c in asst_content]
298
314
 
315
+ # resolve reasoning (OpenAI doesn't support this, however OpenAI-compatible
316
+ # interfaces e.g. DeepSeek do include this field so we pluck it out)
317
+ reasoning = message.get("reasoning_content", None) or message.get(
318
+ "reasoning", None
319
+ )
320
+ if reasoning is not None:
321
+ reasoning = str(reasoning)
322
+
299
323
  # return message
300
324
  if "tool_calls" in message:
301
325
  tool_calls: list[ToolCall] = []
@@ -306,7 +330,11 @@ def chat_messages_from_openai(
306
330
  else:
307
331
  tool_calls = []
308
332
  chat_messages.append(
309
- ChatMessageAssistant(content=content, tool_calls=tool_calls or None)
333
+ ChatMessageAssistant(
334
+ content=content,
335
+ tool_calls=tool_calls or None,
336
+ reasoning=reasoning,
337
+ )
310
338
  )
311
339
  elif message["role"] == "tool":
312
340
  tool_content = message.get("content", None) or ""
@@ -357,10 +385,14 @@ def chat_message_assistant_from_openai(
357
385
  message: ChatCompletionMessage, tools: list[ToolInfo]
358
386
  ) -> ChatMessageAssistant:
359
387
  refusal = getattr(message, "refusal", None)
388
+ reasoning = getattr(message, "reasoning_content", None) or getattr(
389
+ message, "reasoning", None
390
+ )
360
391
  return ChatMessageAssistant(
361
392
  content=refusal or message.content or "",
362
393
  source="generate",
363
394
  tool_calls=chat_tool_calls_from_openai(message, tools),
395
+ reasoning=reasoning,
364
396
  )
365
397
 
366
398
 
@@ -12,6 +12,7 @@ else:
12
12
 
13
13
  from anthropic import (
14
14
  APIConnectionError,
15
+ APIStatusError,
15
16
  AsyncAnthropic,
16
17
  AsyncAnthropicBedrock,
17
18
  AsyncAnthropicVertex,
@@ -218,6 +219,17 @@ class AnthropicAPI(ModelAPI):
218
219
  except BadRequestError as ex:
219
220
  return self.handle_bad_request(ex), model_call()
220
221
 
222
+ except APIStatusError as ex:
223
+ if ex.status_code == 413:
224
+ return ModelOutput.from_content(
225
+ model=self.model_name,
226
+ content=ex.message,
227
+ stop_reason="model_length",
228
+ error=ex.message,
229
+ ), model_call()
230
+ else:
231
+ raise ex
232
+
221
233
  def completion_params(self, config: GenerateConfig) -> dict[str, Any]:
222
234
  params = dict(model=self.model_name, max_tokens=cast(int, config.max_tokens))
223
235
  if config.temperature is not None:
@@ -294,8 +294,12 @@ def chat_tool_calls(message: Any, tools: list[ToolInfo]) -> Optional[List[ToolCa
294
294
 
295
295
 
296
296
  def chat_message_assistant(message: Any, tools: list[ToolInfo]) -> ChatMessageAssistant:
297
+ reasoning = getattr(message, "reasoning", None)
298
+ if reasoning is not None:
299
+ reasoning = str(reasoning)
297
300
  return ChatMessageAssistant(
298
301
  content=message.content or "",
299
302
  source="generate",
300
303
  tool_calls=chat_tool_calls(message, tools),
304
+ reasoning=reasoning,
301
305
  )
@@ -35,10 +35,12 @@ from .._model_output import (
35
35
  StopReason,
36
36
  )
37
37
  from .._openai import (
38
- is_o1,
38
+ is_gpt,
39
39
  is_o1_full,
40
40
  is_o1_mini,
41
41
  is_o1_preview,
42
+ is_o3,
43
+ is_o_series,
42
44
  openai_chat_messages,
43
45
  openai_chat_tool_choice,
44
46
  openai_chat_tools,
@@ -140,8 +142,8 @@ class OpenAIAPI(ModelAPI):
140
142
  def is_azure(self) -> bool:
141
143
  return self.service == "azure"
142
144
 
143
- def is_o1(self) -> bool:
144
- return is_o1(self.model_name)
145
+ def is_o_series(self) -> bool:
146
+ return is_o_series(self.model_name)
145
147
 
146
148
  def is_o1_full(self) -> bool:
147
149
  return is_o1_full(self.model_name)
@@ -149,9 +151,15 @@ class OpenAIAPI(ModelAPI):
149
151
  def is_o1_mini(self) -> bool:
150
152
  return is_o1_mini(self.model_name)
151
153
 
154
+ def is_o3(self) -> bool:
155
+ return is_o3(self.model_name)
156
+
152
157
  def is_o1_preview(self) -> bool:
153
158
  return is_o1_preview(self.model_name)
154
159
 
160
+ def is_gpt(self) -> bool:
161
+ return is_gpt(self.model_name)
162
+
155
163
  async def generate(
156
164
  self,
157
165
  input: list[ChatMessage],
@@ -258,7 +266,7 @@ class OpenAIAPI(ModelAPI):
258
266
  model=self.model_name,
259
267
  )
260
268
  if config.max_tokens is not None:
261
- if self.is_o1():
269
+ if self.is_o_series():
262
270
  params["max_completion_tokens"] = config.max_tokens
263
271
  else:
264
272
  params["max_tokens"] = config.max_tokens
@@ -273,10 +281,10 @@ class OpenAIAPI(ModelAPI):
273
281
  if config.seed is not None:
274
282
  params["seed"] = config.seed
275
283
  if config.temperature is not None:
276
- if self.is_o1():
284
+ if self.is_o_series():
277
285
  warn_once(
278
286
  logger,
279
- "o1 models do not support the 'temperature' parameter (temperature is always 1).",
287
+ "o series models do not support the 'temperature' parameter (temperature is always 1).",
280
288
  )
281
289
  else:
282
290
  params["temperature"] = config.temperature
@@ -293,9 +301,9 @@ class OpenAIAPI(ModelAPI):
293
301
  params["logprobs"] = config.logprobs
294
302
  if config.top_logprobs is not None:
295
303
  params["top_logprobs"] = config.top_logprobs
296
- if tools and config.parallel_tool_calls is not None and not self.is_o1():
304
+ if tools and config.parallel_tool_calls is not None and not self.is_o_series():
297
305
  params["parallel_tool_calls"] = config.parallel_tool_calls
298
- if config.reasoning_effort is not None and self.is_o1_full():
306
+ if config.reasoning_effort is not None and not self.is_gpt():
299
307
  params["reasoning_effort"] = config.reasoning_effort
300
308
 
301
309
  return params
@@ -312,7 +320,11 @@ class OpenAIAPI(ModelAPI):
312
320
  stop_reason: StopReason | None = None
313
321
  if e.code == "context_length_exceeded":
314
322
  stop_reason = "model_length"
315
- elif e.code == "invalid_prompt":
323
+ elif (
324
+ e.code == "invalid_prompt" # seems to happen for o1/o3
325
+ or e.code == "content_policy_violation" # seems to happen for vision
326
+ or e.code == "content_filter" # seems to happen on azure
327
+ ):
316
328
  stop_reason = "content_filter"
317
329
 
318
330
  if stop_reason: