inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (518) hide show
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/main.py +1 -1
  8. inspect_ai/_cli/sandbox.py +4 -1
  9. inspect_ai/_cli/score.py +181 -32
  10. inspect_ai/_cli/trace.py +10 -0
  11. inspect_ai/_cli/view.py +4 -2
  12. inspect_ai/_display/core/active.py +2 -3
  13. inspect_ai/_display/core/config.py +7 -1
  14. inspect_ai/_display/textual/widgets/samples.py +4 -3
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/eval.py +104 -101
  17. inspect_ai/_eval/evalset.py +75 -75
  18. inspect_ai/_eval/loader.py +122 -12
  19. inspect_ai/_eval/registry.py +1 -1
  20. inspect_ai/_eval/run.py +14 -0
  21. inspect_ai/_eval/score.py +125 -36
  22. inspect_ai/_eval/task/log.py +105 -4
  23. inspect_ai/_eval/task/results.py +92 -38
  24. inspect_ai/_eval/task/run.py +9 -2
  25. inspect_ai/_eval/task/sandbox.py +35 -2
  26. inspect_ai/_eval/task/task.py +49 -46
  27. inspect_ai/_util/constants.py +1 -1
  28. inspect_ai/_util/content.py +8 -0
  29. inspect_ai/_util/error.py +2 -0
  30. inspect_ai/_util/file.py +15 -1
  31. inspect_ai/_util/hash.py +1 -1
  32. inspect_ai/_util/logger.py +4 -2
  33. inspect_ai/_util/registry.py +7 -1
  34. inspect_ai/_view/view.py +1 -2
  35. inspect_ai/_view/www/.vscode/extensions.json +3 -0
  36. inspect_ai/_view/www/.vscode/settings.json +8 -0
  37. inspect_ai/_view/www/App.css +97 -29
  38. inspect_ai/_view/www/README.md +1 -1
  39. inspect_ai/_view/www/dist/assets/index.css +16663 -14674
  40. inspect_ai/_view/www/dist/assets/index.js +58808 -51348
  41. inspect_ai/_view/www/dist/index.html +1 -1
  42. inspect_ai/_view/www/index.html +2 -2
  43. inspect_ai/_view/www/log-schema.json +87 -73
  44. inspect_ai/_view/www/package.json +22 -4
  45. inspect_ai/_view/www/postcss.config.cjs +8 -9
  46. inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
  47. inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
  48. inspect_ai/_view/www/src/api/api-browser.ts +2 -2
  49. inspect_ai/_view/www/src/api/api-http.ts +3 -5
  50. inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
  51. inspect_ai/_view/www/src/api/client-api.ts +4 -4
  52. inspect_ai/_view/www/src/api/index.ts +4 -4
  53. inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
  54. inspect_ai/_view/www/src/appearance/colors.ts +9 -0
  55. inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
  56. inspect_ai/_view/www/src/appearance/icons.ts +100 -0
  57. inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
  58. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
  59. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
  60. inspect_ai/_view/www/src/components/Card.css +60 -0
  61. inspect_ai/_view/www/src/components/Card.tsx +109 -0
  62. inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
  63. inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
  64. inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
  65. inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
  66. inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
  67. inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
  68. inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
  69. inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
  70. inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
  71. inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
  72. inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
  73. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
  74. inspect_ai/_view/www/src/components/FindBand.css +49 -0
  75. inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
  76. inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
  77. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
  78. inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
  79. inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
  80. inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
  81. inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
  82. inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
  83. inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
  84. inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
  85. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
  86. inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
  87. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
  88. inspect_ai/_view/www/src/components/MessageBand.css +43 -0
  89. inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
  90. inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
  91. inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
  92. inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
  93. inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
  94. inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
  95. inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
  96. inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
  97. inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
  98. inspect_ai/_view/www/src/components/ToolButton.css +3 -0
  99. inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
  100. inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
  101. inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
  102. inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
  103. inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
  104. inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
  105. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
  106. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
  107. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
  108. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
  109. inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
  110. inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
  111. inspect_ai/_view/www/src/metadata/types.ts +18 -0
  112. inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
  113. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
  114. inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
  115. inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
  116. inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
  117. inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
  118. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
  119. inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
  120. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
  121. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
  122. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
  123. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
  124. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
  125. inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
  126. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
  127. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
  128. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
  129. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
  130. inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
  131. inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
  132. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
  133. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
  134. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
  135. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
  136. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
  137. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
  138. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
  139. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
  140. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
  141. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
  142. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
  143. inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
  144. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
  145. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
  146. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
  147. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
  148. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
  149. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
  150. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
  151. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
  152. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
  153. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
  154. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
  155. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
  156. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
  157. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
  158. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
  159. inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
  160. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
  161. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
  162. inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
  163. inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
  164. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
  165. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
  166. inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
  167. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
  168. inspect_ai/_view/www/src/samples/error/error.ts +15 -0
  169. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
  170. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
  171. inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
  172. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
  173. inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
  174. inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
  175. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
  176. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
  177. inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
  178. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
  179. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
  180. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
  181. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
  182. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
  183. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
  184. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
  185. inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
  186. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
  187. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
  188. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
  189. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
  190. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
  191. inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
  192. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
  193. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
  194. inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
  195. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
  196. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
  197. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
  198. inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
  199. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
  200. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
  201. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
  202. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
  203. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
  204. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
  205. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
  206. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
  207. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
  208. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
  209. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
  210. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
  211. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
  212. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
  213. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
  214. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
  215. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
  216. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
  217. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
  218. inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
  219. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
  220. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
  221. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
  222. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
  223. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
  224. inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
  225. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
  226. inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
  227. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
  228. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
  229. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
  230. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
  231. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
  232. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
  233. inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
  234. inspect_ai/_view/www/src/types/log.d.ts +108 -19
  235. inspect_ai/_view/www/src/types/prism.d.ts +11 -0
  236. inspect_ai/_view/www/src/types.ts +71 -0
  237. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
  238. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
  239. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
  240. inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
  241. inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
  242. inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
  243. inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
  244. inspect_ai/_view/www/src/utils/attachments.ts +42 -0
  245. inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
  246. inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
  247. inspect_ai/_view/www/src/utils/debugging.ts +28 -0
  248. inspect_ai/_view/www/src/utils/dom.ts +30 -0
  249. inspect_ai/_view/www/src/utils/format.ts +194 -0
  250. inspect_ai/_view/www/src/utils/git.ts +7 -0
  251. inspect_ai/_view/www/src/utils/html.ts +6 -0
  252. inspect_ai/_view/www/src/utils/http.ts +14 -0
  253. inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
  254. inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
  255. inspect_ai/_view/www/src/utils/queue.ts +51 -0
  256. inspect_ai/_view/www/src/utils/sync.ts +114 -0
  257. inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
  258. inspect_ai/_view/www/src/utils/vscode.ts +13 -0
  259. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
  260. inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
  261. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
  262. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
  263. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
  264. inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
  265. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
  266. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
  267. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
  268. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
  269. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
  270. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
  271. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
  272. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
  273. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
  274. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
  275. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
  276. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
  277. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
  278. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
  279. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
  280. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
  281. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
  282. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
  283. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
  284. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
  285. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
  286. inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
  287. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
  288. inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
  289. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
  290. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
  291. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
  292. inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
  293. inspect_ai/_view/www/src/workspace/types.ts +10 -0
  294. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  295. inspect_ai/_view/www/tsconfig.json +23 -9
  296. inspect_ai/_view/www/vite.config.js +8 -17
  297. inspect_ai/_view/www/yarn.lock +627 -556
  298. inspect_ai/approval/_approval.py +2 -0
  299. inspect_ai/approval/_approver.py +4 -4
  300. inspect_ai/approval/_auto.py +1 -1
  301. inspect_ai/approval/_human/approver.py +3 -0
  302. inspect_ai/approval/_policy.py +5 -0
  303. inspect_ai/approval/_registry.py +2 -2
  304. inspect_ai/dataset/_dataset.py +64 -37
  305. inspect_ai/dataset/_sources/__init__.py +0 -0
  306. inspect_ai/dataset/_sources/csv.py +20 -12
  307. inspect_ai/dataset/_sources/file.py +4 -0
  308. inspect_ai/dataset/_sources/hf.py +39 -29
  309. inspect_ai/dataset/_sources/json.py +17 -9
  310. inspect_ai/log/__init__.py +2 -0
  311. inspect_ai/log/_convert.py +3 -3
  312. inspect_ai/log/_file.py +24 -9
  313. inspect_ai/log/_log.py +101 -13
  314. inspect_ai/log/_message.py +4 -2
  315. inspect_ai/log/_recorders/file.py +4 -0
  316. inspect_ai/log/_recorders/json.py +5 -7
  317. inspect_ai/log/_recorders/recorder.py +3 -0
  318. inspect_ai/log/_transcript.py +19 -8
  319. inspect_ai/model/__init__.py +2 -0
  320. inspect_ai/model/_cache.py +39 -21
  321. inspect_ai/model/_call_tools.py +4 -3
  322. inspect_ai/model/_chat_message.py +14 -4
  323. inspect_ai/model/_generate_config.py +1 -1
  324. inspect_ai/model/_model.py +31 -24
  325. inspect_ai/model/_model_output.py +14 -1
  326. inspect_ai/model/_openai.py +10 -18
  327. inspect_ai/model/_providers/anthropic.py +3 -3
  328. inspect_ai/model/_providers/google.py +9 -5
  329. inspect_ai/model/_providers/openai.py +5 -9
  330. inspect_ai/model/_providers/openai_o1.py +3 -5
  331. inspect_ai/model/_providers/openrouter.py +86 -0
  332. inspect_ai/model/_providers/providers.py +11 -0
  333. inspect_ai/scorer/__init__.py +6 -1
  334. inspect_ai/scorer/_answer.py +7 -7
  335. inspect_ai/scorer/_classification.py +38 -18
  336. inspect_ai/scorer/_common.py +2 -8
  337. inspect_ai/scorer/_match.py +4 -5
  338. inspect_ai/scorer/_metric.py +87 -28
  339. inspect_ai/scorer/_metrics/__init__.py +3 -3
  340. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  341. inspect_ai/scorer/_metrics/mean.py +3 -17
  342. inspect_ai/scorer/_metrics/std.py +111 -30
  343. inspect_ai/scorer/_model.py +12 -12
  344. inspect_ai/scorer/_pattern.py +3 -3
  345. inspect_ai/scorer/_reducer/reducer.py +36 -21
  346. inspect_ai/scorer/_reducer/registry.py +2 -2
  347. inspect_ai/scorer/_reducer/types.py +7 -1
  348. inspect_ai/scorer/_score.py +11 -1
  349. inspect_ai/scorer/_scorer.py +110 -16
  350. inspect_ai/solver/__init__.py +1 -1
  351. inspect_ai/solver/_basic_agent.py +19 -22
  352. inspect_ai/solver/_bridge/__init__.py +0 -3
  353. inspect_ai/solver/_bridge/bridge.py +3 -3
  354. inspect_ai/solver/_chain.py +1 -2
  355. inspect_ai/solver/_critique.py +3 -3
  356. inspect_ai/solver/_fork.py +2 -2
  357. inspect_ai/solver/_human_agent/__init__.py +0 -0
  358. inspect_ai/solver/_human_agent/agent.py +5 -8
  359. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  360. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  361. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  362. inspect_ai/solver/_multiple_choice.py +38 -26
  363. inspect_ai/solver/_prompt.py +7 -7
  364. inspect_ai/solver/_solver.py +53 -52
  365. inspect_ai/solver/_task_state.py +80 -69
  366. inspect_ai/solver/_use_tools.py +9 -9
  367. inspect_ai/tool/__init__.py +4 -1
  368. inspect_ai/tool/_tool.py +43 -14
  369. inspect_ai/tool/_tool_call.py +6 -2
  370. inspect_ai/tool/_tool_choice.py +3 -1
  371. inspect_ai/tool/_tool_def.py +10 -8
  372. inspect_ai/tool/_tool_params.py +24 -0
  373. inspect_ai/tool/_tool_with.py +7 -7
  374. inspect_ai/tool/_tools/__init__.py +0 -0
  375. inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
  376. inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
  377. inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
  378. inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
  379. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  380. inspect_ai/tool/_tools/_execute.py +23 -11
  381. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  382. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  383. inspect_ai/tool/_tools/_web_search.py +7 -5
  384. inspect_ai/tool/beta.py +3 -0
  385. inspect_ai/util/_concurrency.py +3 -3
  386. inspect_ai/util/_panel.py +2 -0
  387. inspect_ai/util/_resource.py +12 -12
  388. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  389. inspect_ai/util/_sandbox/docker/config.py +2 -1
  390. inspect_ai/util/_sandbox/docker/docker.py +42 -86
  391. inspect_ai/util/_sandbox/docker/service.py +100 -0
  392. inspect_ai/util/_sandbox/environment.py +99 -96
  393. inspect_ai/util/_sandbox/self_check.py +124 -16
  394. inspect_ai/util/_subprocess.py +5 -3
  395. inspect_ai/util/_subtask.py +15 -16
  396. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
  397. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
  398. inspect_ai-0.3.64.dist-info/RECORD +625 -0
  399. inspect_ai/_view/www/src/Register.mjs +0 -3
  400. inspect_ai/_view/www/src/Types.mjs +0 -38
  401. inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
  402. inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
  403. inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
  404. inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
  405. inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
  406. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
  407. inspect_ai/_view/www/src/components/Card.mjs +0 -126
  408. inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
  409. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
  410. inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
  411. inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
  412. inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
  413. inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
  414. inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
  415. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
  416. inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
  417. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
  418. inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
  419. inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
  420. inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
  421. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
  422. inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
  423. inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
  424. inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
  425. inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
  426. inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
  427. inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
  428. inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
  429. inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
  430. inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
  431. inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
  432. inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
  433. inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
  434. inspect_ai/_view/www/src/components/Tools.mjs +0 -376
  435. inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
  436. inspect_ai/_view/www/src/components/ansi-output.js +0 -932
  437. inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
  438. inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
  439. inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
  440. inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
  441. inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
  442. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
  443. inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
  444. inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
  445. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
  446. inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
  447. inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
  448. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
  449. inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
  450. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
  451. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
  452. inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
  453. inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
  454. inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
  455. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
  456. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
  457. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
  458. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
  459. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
  460. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
  461. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
  462. inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
  463. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
  464. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
  465. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
  466. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
  467. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
  468. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
  469. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
  470. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
  471. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
  472. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
  473. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
  474. inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
  475. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
  476. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
  477. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
  478. inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
  479. inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
  480. inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
  481. inspect_ai/_view/www/src/utils/Format.mjs +0 -260
  482. inspect_ai/_view/www/src/utils/Git.mjs +0 -12
  483. inspect_ai/_view/www/src/utils/Html.mjs +0 -21
  484. inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
  485. inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
  486. inspect_ai/_view/www/src/utils/http.mjs +0 -18
  487. inspect_ai/_view/www/src/utils/queue.mjs +0 -67
  488. inspect_ai/_view/www/src/utils/sync.mjs +0 -101
  489. inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
  490. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
  491. inspect_ai/tool/beta/__init__.py +0 -5
  492. inspect_ai-0.3.62.dist-info/RECORD +0 -481
  493. /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
  494. /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
  495. /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
  496. /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
  497. /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
  498. /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
  499. /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
  500. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
  501. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
  502. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
  503. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
  504. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
  505. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  506. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
  507. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
  508. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
  509. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
  510. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
  511. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
  512. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
  513. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
  514. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
  515. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
  516. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
  517. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
  518. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
@@ -40,9 +40,9 @@ def score_reducer(
40
40
  """Decorator for registering Score Reducers.
41
41
 
42
42
  Args:
43
- func (ScoreReducerType | None): Function returning `ScoreReducer` targeted by
43
+ func: Function returning `ScoreReducer` targeted by
44
44
  plain task decorator without attributes (e.g. `@score_reducer`)
45
- name (str | None): Optional name for reducer. If the decorator has no name
45
+ name: Optional name for reducer. If the decorator has no name
46
46
  argument then the name of the function will be used to automatically assign a name.
47
47
 
48
48
  Returns:
@@ -5,7 +5,13 @@ from .._metric import Score
5
5
 
6
6
  @runtime_checkable
7
7
  class ScoreReducer(Protocol):
8
- def __call__(self, scores: list[Score]) -> Score: ...
8
+ def __call__(self, scores: list[Score]) -> Score:
9
+ """Reduce a set of scores to a single score.
10
+
11
+ Args:
12
+ scores: List of scores.
13
+ """
14
+ ...
9
15
 
10
16
  @property
11
17
  def __name__(self) -> str: ...
@@ -23,6 +23,8 @@ async def score(state: TaskState) -> list[Score]:
23
23
  a task that does not have a scorer.
24
24
 
25
25
  """
26
+ from inspect_ai.log._transcript import ScoreEvent, transcript
27
+
26
28
  scorers = _scorers.get(None)
27
29
  target = _target.get(None)
28
30
  if scorers is None or target is None:
@@ -30,7 +32,15 @@ async def score(state: TaskState) -> list[Score]:
30
32
  "The score() function can only be called while executing a task with a scorer."
31
33
  )
32
34
 
33
- return [await scorer(state, target) for scorer in scorers]
35
+ scores: list[Score] = []
36
+ for scorer in scorers:
37
+ score = await scorer(state, target)
38
+ scores.append(score)
39
+ transcript()._event(
40
+ ScoreEvent(score=score, target=target.target, intermediate=True)
41
+ )
42
+
43
+ return scores
34
44
 
35
45
 
36
46
  def init_scoring_context(scorers: list[Scorer], target: Target) -> None:
@@ -1,3 +1,5 @@
1
+ from copy import deepcopy
2
+ from dataclasses import dataclass, field
1
3
  from functools import wraps
2
4
  from typing import (
3
5
  Any,
@@ -9,38 +11,74 @@ from typing import (
9
11
  )
10
12
 
11
13
  from inspect_ai._util._async import is_callable_coroutine
14
+ from inspect_ai._util.error import PrerequisiteError
12
15
  from inspect_ai._util.registry import (
13
16
  RegistryInfo,
17
+ is_registry_object,
14
18
  registry_add,
15
19
  registry_create,
16
20
  registry_info,
17
21
  registry_name,
22
+ registry_params,
18
23
  registry_tag,
19
24
  registry_unqualified_name,
20
25
  )
21
26
  from inspect_ai.solver._task_state import TaskState
22
27
 
23
- from ._metric import Metric, Score
28
+ from ._metric import Metric, MetricSpec, Score, as_metric_spec
24
29
  from ._target import Target
25
30
 
26
31
 
27
32
  @runtime_checkable
28
33
  class Scorer(Protocol):
29
- r"""Score model outputs.
30
-
31
- Evaluate the passed outputs and targets and return a
32
- dictionary with scoring outcomes and context.
33
-
34
- Args:
35
- state (TaskState): Task state
36
- target (Target): Ideal target for the output.
37
- """
38
-
39
34
  async def __call__(
40
35
  self,
41
36
  state: TaskState,
42
37
  target: Target,
43
- ) -> Score: ...
38
+ ) -> Score:
39
+ r"""Score model outputs.
40
+
41
+ Evaluate the passed outputs and targets and return a
42
+ dictionary with scoring outcomes and context.
43
+
44
+ Args:
45
+ state: Task state
46
+ target: Ideal target for the output.
47
+
48
+ Examples:
49
+ ```python
50
+ @scorer
51
+ def custom_scorer() -> Scorer:
52
+ async def score(state: TaskState, target: Target) -> Score:
53
+ # Compare state / model output with target
54
+ # to yield a score
55
+ return Score(value=...)
56
+
57
+ return score
58
+ ```
59
+ """
60
+ ...
61
+
62
+
63
+ @dataclass(frozen=True)
64
+ class ScorerSpec:
65
+ """Scorer specification used to (re-)create scorers."""
66
+
67
+ scorer: str
68
+ """Scorer name"""
69
+
70
+ args: dict[str, Any] = field(default_factory=dict)
71
+ """Scorer arguments."""
72
+
73
+ metadata: dict[str, Any] | None = field(default=None)
74
+ """Scorer metadata"""
75
+
76
+ metrics: (
77
+ list[MetricSpec | dict[str, list[MetricSpec]]]
78
+ | dict[str, list[MetricSpec]]
79
+ | None
80
+ ) = field(default=None)
81
+ """Scorer metrics"""
44
82
 
45
83
 
46
84
  P = ParamSpec("P")
@@ -90,17 +128,28 @@ def scorer(
90
128
  r"""Decorator for registering scorers.
91
129
 
92
130
  Args:
93
- metrics (list[Metric] | dict[str, list[Metric]]): One or more metrics to calculate
131
+ metrics: One or more metrics to calculate
94
132
  over the scores.
95
- name (str | None):
96
- Optional name for scorer. If the decorator has no name
133
+ name: Optional name for scorer. If the decorator has no name
97
134
  argument then the name of the underlying ScorerType
98
135
  object will be used to automatically assign a name.
99
- **metadata (dict[str,Any]): Additional values to serialize
136
+ **metadata: Additional values to serialize
100
137
  in metadata.
101
138
 
102
139
  Returns:
103
140
  Scorer with registry attributes.
141
+
142
+ Examples:
143
+ ```python
144
+ @scorer
145
+ def custom_scorer() -> Scorer:
146
+ async def score(state: TaskState, target: Target) -> Score:
147
+ # Compare state / model output with target
148
+ # to yield a score
149
+ return Score(value=...)
150
+
151
+ return score
152
+ ```
104
153
  """
105
154
 
106
155
  def wrapper(scorer_type: Callable[P, Scorer]) -> Callable[P, Scorer]:
@@ -142,6 +191,51 @@ def scorer(
142
191
  return wrapper
143
192
 
144
193
 
194
+ def as_scorer_spec(scorer: Scorer) -> ScorerSpec:
195
+ if not is_registry_object(scorer):
196
+ raise PrerequisiteError(
197
+ f"The scorer {getattr(scorer, '__name__', '<unknown>')} was not created by a function decorated with @scorer so cannot be recorded."
198
+ )
199
+ name = registry_unqualified_name(scorer)
200
+ metrics = scorer_metrics(scorer)
201
+ resolved_metrics = resolve_metrics(metrics)
202
+
203
+ args = registry_params(scorer)
204
+ metadata = deepcopy(registry_info(scorer).metadata)
205
+ del metadata[SCORER_METRICS]
206
+
207
+ return ScorerSpec(
208
+ scorer=name, args=args, metadata=metadata, metrics=resolved_metrics
209
+ )
210
+
211
+
212
+ def resolve_metrics(
213
+ metrics: list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]],
214
+ ) -> (
215
+ list[MetricSpec | dict[str, list[MetricSpec]]] | dict[str, list[MetricSpec]] | None
216
+ ):
217
+ if isinstance(metrics, list):
218
+ resolved_metrics: list[MetricSpec | dict[str, list[MetricSpec]]] = []
219
+ for metric_item in metrics:
220
+ if isinstance(metric_item, Metric):
221
+ resolved_metrics.append(as_metric_spec(metric_item))
222
+ else:
223
+ resolved_metrics.append(
224
+ {
225
+ metric_group: [
226
+ as_metric_spec(metric) for metric in metrics_list
227
+ ]
228
+ for metric_group, metrics_list in metric_item.items()
229
+ }
230
+ )
231
+ return resolved_metrics
232
+ else:
233
+ return {
234
+ metric_group: [as_metric_spec(metric) for metric in metrics_list]
235
+ for metric_group, metrics_list in metrics.items()
236
+ }
237
+
238
+
145
239
  def scorer_metrics(
146
240
  scorer: Scorer,
147
241
  ) -> list[Metric | dict[str, list[Metric]]] | dict[str, list[Metric]]:
@@ -1,7 +1,7 @@
1
1
  from inspect_ai._util.deprecation import relocated_module_attribute
2
2
 
3
3
  from ._basic_agent import basic_agent
4
- from ._bridge import bridge
4
+ from ._bridge.bridge import bridge
5
5
  from ._chain import chain
6
6
  from ._critique import self_critique
7
7
  from ._fork import fork
@@ -81,31 +81,28 @@ def basic_agent(
81
81
  alternate conversion scheme as required via `score_value`.
82
82
 
83
83
  Args:
84
- init: (Solver | list[Solver] | None): Agent initialisation
85
- (defaults to system_message with basic ReAct prompt)
86
- tools (list[Tool | ToolDef] | Solver | None): Tools available for the agent. Either a
87
- list of tools or a Solver that can yield dynamic tools per-sample.
88
- cache: (bool | CachePolicy): Caching behaviour for generate responses
89
- (defaults to no caching).
90
- max_attempts (int): Maximum number of submissions to accept before terminating.
91
- message_limit (int | None): Limit on messages in sample before terminating agent.
84
+ init: Agent initialisation (defaults to system_message with basic ReAct prompt)
85
+ tools: Tools available for the agent. Either a list of tools or a Solver that
86
+ can yield dynamic tools per-sample.
87
+ cache: Caching behaviour for generate responses (defaults to no caching).
88
+ max_attempts: Maximum number of submissions to accept before terminating.
89
+ message_limit: Limit on messages in sample before terminating agent.
92
90
  If not specified, will use limit_messages defined for the task. If there is none
93
91
  defined for the task, 50 will be used as a default.
94
- token_limit (int | None): Limit on tokens used in sample before terminating agent.
95
- max_tool_output (int | None): Maximum output length (in bytes).
92
+ token_limit: Limit on tokens used in sample before terminating agent.
93
+ max_tool_output: Maximum output length (in bytes).
96
94
  Defaults to max_tool_output from active GenerateConfig.
97
- score_value (ValueToFloat): Function used to extract float from scores (defaults
98
- to standard value_to_float())
99
- incorrect_message (str | Callable[[TaskState, list[Score]], str | Awaitable[str]]):
100
- User message reply for an incorrect submission from the model. Alternatively,
101
- a function which returns a message (function may optionally be async)
102
- continue_message (str): User message to urge the model to continue when it
103
- doesn't make a tool call.
104
- submit_name (str): Name for tool used to make submissions
105
- (defaults to 'submit')
106
- submit_description (str): Description of submit tool (defaults to
107
- 'Submit an answer for evaluation')
108
- **kwargs (Any): Deprecated arguments for backward compatibility.
95
+ score_value: Function used to extract float from scores (defaults
96
+ to standard value_to_float())
97
+ incorrect_message: User message reply for an incorrect submission from the model.
98
+ Alternatively, a function which returns a message (function may optionally be async)
99
+ continue_message: User message to urge the model to continue when it
100
+ doesn't make a tool call.
101
+ submit_name: Name for tool used to make submissions
102
+ (defaults to 'submit')
103
+ submit_description: Description of submit tool (defaults to
104
+ 'Submit an answer for evaluation')
105
+ **kwargs: Deprecated arguments for backward compatibility.
109
106
 
110
107
  Returns:
111
108
  Plan for agent.
@@ -1,3 +0,0 @@
1
- from .bridge import bridge
2
-
3
- __all__ = ["bridge"]
@@ -17,7 +17,7 @@ from .._task_state import TaskState
17
17
  def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver:
18
18
  """Bridge an external agent into an Inspect Solver.
19
19
 
20
- See documentation at https://inspect.ai-safety-institute.org.uk/agent-bridge.html
20
+ See documentation at <https://inspect.ai-safety-institute.org.uk/agent-bridge.html>
21
21
 
22
22
  Args:
23
23
  agent: Callable which takes a sample `dict` and returns a result `dict`.
@@ -63,11 +63,11 @@ def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solv
63
63
  else state.input
64
64
  )
65
65
 
66
- # create sample
66
+ # create sample (use standard gpt-4 message encoding -- i.e. no 'developer' messages)
67
67
  sample = BridgeSample(
68
68
  sample_id=str(state.sample_id),
69
69
  epoch=state.epoch,
70
- input=await openai_chat_messages(input, state.model.name),
70
+ input=await openai_chat_messages(input, model="gpt-4"),
71
71
  metadata=state.metadata,
72
72
  target=list(state.target),
73
73
  )
@@ -15,8 +15,7 @@ def chain(*solvers: Solver | list[Solver]) -> Solver:
15
15
  early.
16
16
 
17
17
  Args:
18
- solvers (*Solver | list[Solver]): One or more solvers
19
- or lists of solvers to chain together.
18
+ *solvers: One or more solvers or lists of solvers to chain together.
20
19
 
21
20
  Returns:
22
21
  Solver that executes the passed solvers as a chain.
@@ -25,15 +25,15 @@ def self_critique(
25
25
  need to use the model being evaluated).
26
26
 
27
27
  Args:
28
- critique_template (str | None): String or path to file
28
+ critique_template: String or path to file
29
29
  containing critique template. The template uses two
30
30
  variables: `question` and `completion`.
31
31
  Variables from sample `metadata` are also available
32
32
  in the template.
33
- completion_template (str | None): String or path to file
33
+ completion_template: String or path to file
34
34
  containing completion template. The template uses
35
35
  three variables: `question`, `completion`, and `critique`
36
- model (str | Model | None): Alternate model to be used
36
+ model: Alternate model to be used
37
37
  for critique (by default the model being evaluated
38
38
  is used).
39
39
  """
@@ -32,8 +32,8 @@ async def fork(
32
32
  Store that doesn't affect the Store of other subtasks or the parent).
33
33
 
34
34
  Args:
35
- state (TaskState): Beginning TaskState
36
- solvers (Solver | list[Solver]): Solvers to apply on the TaskState.
35
+ state: Beginning TaskState
36
+ solvers: Solvers to apply on the TaskState.
37
37
  Each Solver will get a standalone copy of the TaskState.
38
38
 
39
39
  Returns:
File without changes
@@ -30,14 +30,11 @@ def human_agent(
30
30
  using a VS Code Window or Terminal.
31
31
 
32
32
  Args:
33
- answer (bool | str): Is an explicit answer required for this
34
- task or is it scored based on files in the container? Pass a
35
- `str` with a regex to validate that the answer matches
36
- the expected format.
37
- intermediate_scoring (bool): Allow the human agent to
38
- check their score while working.
39
- record_session (bool): Record all user commands and outputs in
40
- the sandbox bash session.
33
+ answer: Is an explicit answer required for this task or is it scored
34
+ based on files in the container? Pass a `str` with a regex to validate
35
+ that the answer matches the expected format.
36
+ intermediate_scoring: Allow the human agent to check their score while working.
37
+ record_session: Record all user commands and outputs in the sandbox bash session.
41
38
 
42
39
  Returns:
43
40
  Solver: Human agent solver.
@@ -27,14 +27,10 @@ class StartCommand(HumanAgentCommand):
27
27
  print(call_human_agent("start"))
28
28
 
29
29
  def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
30
- from inspect_ai.log._transcript import transcript
31
-
32
30
  async def start() -> str:
33
31
  if not state.running:
34
32
  state.running = True
35
- transcript().info(
36
- f"Task started (total time: {format_progress_time(state.time)})"
37
- )
33
+ clock_action_event("start", state)
38
34
  return render_status(state)
39
35
 
40
36
  return start
@@ -57,14 +53,22 @@ class StopCommand(HumanAgentCommand):
57
53
  print(call_human_agent("stop"))
58
54
 
59
55
  def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
60
- from inspect_ai.log._transcript import transcript
61
-
62
56
  async def stop() -> str:
63
57
  if state.running:
64
58
  state.running = False
65
- transcript().info(
66
- f"Task stopped (total time: {format_progress_time(state.time)})"
67
- )
59
+ clock_action_event("stop", state)
68
60
  return render_status(state)
69
61
 
70
62
  return stop
63
+
64
+
65
+ def clock_action_event(action: str, state: HumanAgentState) -> None:
66
+ from inspect_ai.log._transcript import transcript
67
+
68
+ transcript().info(
69
+ {
70
+ "action": action,
71
+ "total_time": format_progress_time(state.time, False),
72
+ },
73
+ source="human_agent",
74
+ )
@@ -37,6 +37,6 @@ class NoteCommand(HumanAgentCommand):
37
37
  from inspect_ai.log._transcript import transcript
38
38
 
39
39
  async def note(content: str) -> None:
40
- transcript().info(content)
40
+ transcript().info(content, source="human_agent")
41
41
 
42
42
  return note
@@ -1,6 +1,5 @@
1
1
  from argparse import Namespace
2
2
  from copy import deepcopy
3
- from textwrap import dedent
4
3
  from typing import Awaitable, Callable, Literal
5
4
 
6
5
  from pydantic import JsonValue
@@ -51,8 +50,6 @@ class ScoreCommand(HumanAgentCommand):
51
50
 
52
51
  def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]:
53
52
  async def score_task(answer: str | None) -> str:
54
- from inspect_ai.log._transcript import transcript
55
-
56
53
  # make a copy of TaskState, add the answer, then score
57
54
  if answer:
58
55
  task_state = deepcopy(self._state)
@@ -64,14 +61,6 @@ class ScoreCommand(HumanAgentCommand):
64
61
  # record the scoring action in our state
65
62
  state.scorings.append(IntermediateScoring(time=state.time, scores=result))
66
63
 
67
- # record to transcript
68
- transcript().info(
69
- dedent(f"""
70
- ### Intermediate Score
71
- **Answer:** {result[0].answer}, **Score:** {result[0].as_str()}
72
- """)
73
- )
74
-
75
64
  # notify user
76
65
  return render_text(
77
66
  f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}"
@@ -1,13 +1,19 @@
1
+ import logging
1
2
  import re
2
3
  from enum import Enum
3
4
  from random import Random
4
- from typing import Match
5
+ from typing import Match, TypedDict
5
6
 
7
+ from typing_extensions import Unpack
8
+
9
+ from inspect_ai._util.logger import warn_once
6
10
  from inspect_ai.util import resource
7
11
 
8
12
  from ._solver import Generate, Solver, solver
9
13
  from ._task_state import Choices, TaskState
10
14
 
15
+ logger = logging.getLogger(__name__)
16
+
11
17
  SINGLE_ANSWER_TEMPLATE = r"""
12
18
  Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
13
19
 
@@ -201,52 +207,58 @@ class MultipleChoiceTemplate(str, Enum):
201
207
  MULTIPLE_ANSWER_COT = MULTIPLE_ANSWER_TEMPLATE_COT
202
208
 
203
209
 
210
+ class DeprecatedArgs(TypedDict, total=False):
211
+ shuffle: bool | Random
212
+
213
+
204
214
  @solver
205
215
  def multiple_choice(
206
216
  *,
207
217
  template: str | None = None,
208
218
  cot: bool = False,
209
219
  multiple_correct: bool = False,
210
- shuffle: bool | Random = False,
220
+ **kwargs: Unpack[DeprecatedArgs],
211
221
  ) -> Solver:
212
- """Multiple choice question solver.
213
-
214
- Formats a multiple choice question prompt, then calls `generate()`
215
-
216
- ### Usage
222
+ """Multiple choice question solver. Formats a multiple choice question prompt, then calls `generate()`.
217
223
 
218
224
  Note that due to the way this solver works, it has some constraints:
219
225
 
220
- 1. The `Sample` must have the `choices` attribute set.
221
- 2. The only built-in compatible scorer is the `choice` scorer.
222
- 3. It calls `generate()` internally, so you don't need to call it again
223
-
224
- ### Shuffling
225
-
226
- If the choices are shuffled, we will unshuffle them in the message history
227
- after the model has been called, essentially rewriting history. It is
228
- something to be aware of if writing custom scorers or solvers that interact
229
- with this scorer.
226
+ 1. The `Sample` must have the `choices` attribute set.
227
+ 2. The only built-in compatible scorer is the `choice` scorer.
228
+ 3. It calls `generate()` internally, so you don't need to call it again
230
229
 
231
230
  Args:
232
- template (str | None): Template to use for the multiple choice question.
231
+ template: Template to use for the multiple choice question.
233
232
  The defaults vary based on the options and are taken from the `MultipleChoiceTemplate` enum. The template will have questions and possible answers substituted into it before being sent to the model. Consequently it requires three specific template variables:
234
- - `{question}`: The question to be asked.
235
- - `{choices}`: The choices available, which will be formatted as a
233
+
234
+ - `{question}`: The question to be asked.
235
+ - `{choices}`: The choices available, which will be formatted as a
236
236
  list of A) ... B) ... etc. before sending to the model.
237
- - `{letters}`: (optional) A string of letters representing the choices, e.g.
237
+ - `{letters}`: (optional) A string of letters representing the choices, e.g.
238
238
  "A,B,C". Used to be explicit to the model about the possible answers.
239
- cot (bool): Default `False`. Whether the solver should perform chain-of-thought
239
+ cot: Default `False`. Whether the solver should perform chain-of-thought
240
240
  reasoning before answering. NOTE: this has no effect if you provide a custom template.
241
- multiple_correct (bool): Default `False`. Whether to allow multiple
241
+ multiple_correct: Default `False`. Whether to allow multiple
242
242
  answers to the multiple choice question. For example, "What numbers are
243
243
  squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave
244
244
  as `False` if there's exactly one correct answer from the choices
245
245
  available. NOTE: this has no effect if you provide a custom template.
246
- shuffle (bool | Random): Default `False`. Whether to shuffle the choices
247
- in the multiple. Passing a `Random` instance will use that for shuffling,
248
- if `True` a new `Random` instance will be created.
246
+ **kwargs: Deprecated arguments for backward compatibility.
247
+
248
+ #### Shuffling
249
+
250
+ You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API.
249
251
  """
252
+ shuffle: bool | Random = False
253
+ if "shuffle" in kwargs:
254
+ shuffle = kwargs["shuffle"]
255
+
256
+ if shuffle:
257
+ warn_once(
258
+ logger,
259
+ "The multiple choice shuffle parameter is deprecated. Please shuffle choices at the time your dataset is read by using the shuffle_choices method/parameter of the datasets API.",
260
+ )
261
+
250
262
  if template and not valid_template(template):
251
263
  raise ValueError(
252
264
  "The template must contain '{question}' and '{choices}' placeholders for string substitution."
@@ -20,8 +20,8 @@ def prompt_template(template: str, **params: Any) -> Solver:
20
20
  `params`.
21
21
 
22
22
  Args:
23
- template: (str): Template for prompt.
24
- **params (dict[str,Any]): Parameters to fill into the template.
23
+ template: Template for prompt.
24
+ **params: Parameters to fill into the template.
25
25
 
26
26
  Returns:
27
27
  A solver that uses the specified prompt template.
@@ -51,8 +51,8 @@ def system_message(template: str, **params: Any) -> Solver:
51
51
  are none it will be inserted at the beginning of the conversation).
52
52
 
53
53
  Args:
54
- template (str): Template for system message.
55
- **params (dict[str,Any]): Parameters to fill into the template.
54
+ template: Template for system message.
55
+ **params: Parameters to fill into the template.
56
56
 
57
57
  Returns:
58
58
  A solver that inserts the parameterised system message.
@@ -80,8 +80,8 @@ def user_message(template: str, **params: Any) -> Solver:
80
80
  included in the `params`.
81
81
 
82
82
  Args:
83
- template (str): Template for user message.
84
- **params (dict[str,Any]): Parameters to fill into the template.
83
+ template: Template for user message.
84
+ **params: Parameters to fill into the template.
85
85
 
86
86
  Returns:
87
87
  A solver that inserts the parameterised user message.
@@ -109,7 +109,7 @@ def chain_of_thought(template: str = DEFAULT_COT_TEMPLATE) -> Solver:
109
109
  """Solver which modifies the user prompt to encourage chain of thought.
110
110
 
111
111
  Args:
112
- template (str): String or path to file containing CoT template.
112
+ template: String or path to file containing CoT template.
113
113
  The template uses a single variable: `prompt`.
114
114
  """
115
115