inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
Files changed (518)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/main.py +1 -1
  8. inspect_ai/_cli/sandbox.py +4 -1
  9. inspect_ai/_cli/score.py +181 -32
  10. inspect_ai/_cli/trace.py +10 -0
  11. inspect_ai/_cli/view.py +4 -2
  12. inspect_ai/_display/core/active.py +2 -3
  13. inspect_ai/_display/core/config.py +7 -1
  14. inspect_ai/_display/textual/widgets/samples.py +4 -3
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/eval.py +104 -101
  17. inspect_ai/_eval/evalset.py +75 -75
  18. inspect_ai/_eval/loader.py +122 -12
  19. inspect_ai/_eval/registry.py +1 -1
  20. inspect_ai/_eval/run.py +14 -0
  21. inspect_ai/_eval/score.py +125 -36
  22. inspect_ai/_eval/task/log.py +105 -4
  23. inspect_ai/_eval/task/results.py +92 -38
  24. inspect_ai/_eval/task/run.py +9 -2
  25. inspect_ai/_eval/task/sandbox.py +35 -2
  26. inspect_ai/_eval/task/task.py +49 -46
  27. inspect_ai/_util/constants.py +1 -1
  28. inspect_ai/_util/content.py +8 -0
  29. inspect_ai/_util/error.py +2 -0
  30. inspect_ai/_util/file.py +15 -1
  31. inspect_ai/_util/hash.py +1 -1
  32. inspect_ai/_util/logger.py +4 -2
  33. inspect_ai/_util/registry.py +7 -1
  34. inspect_ai/_view/view.py +1 -2
  35. inspect_ai/_view/www/.vscode/extensions.json +3 -0
  36. inspect_ai/_view/www/.vscode/settings.json +8 -0
  37. inspect_ai/_view/www/App.css +97 -29
  38. inspect_ai/_view/www/README.md +1 -1
  39. inspect_ai/_view/www/dist/assets/index.css +16663 -14674
  40. inspect_ai/_view/www/dist/assets/index.js +58808 -51348
  41. inspect_ai/_view/www/dist/index.html +1 -1
  42. inspect_ai/_view/www/index.html +2 -2
  43. inspect_ai/_view/www/log-schema.json +87 -73
  44. inspect_ai/_view/www/package.json +22 -4
  45. inspect_ai/_view/www/postcss.config.cjs +8 -9
  46. inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
  47. inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
  48. inspect_ai/_view/www/src/api/api-browser.ts +2 -2
  49. inspect_ai/_view/www/src/api/api-http.ts +3 -5
  50. inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
  51. inspect_ai/_view/www/src/api/client-api.ts +4 -4
  52. inspect_ai/_view/www/src/api/index.ts +4 -4
  53. inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
  54. inspect_ai/_view/www/src/appearance/colors.ts +9 -0
  55. inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
  56. inspect_ai/_view/www/src/appearance/icons.ts +100 -0
  57. inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
  58. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
  59. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
  60. inspect_ai/_view/www/src/components/Card.css +60 -0
  61. inspect_ai/_view/www/src/components/Card.tsx +109 -0
  62. inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
  63. inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
  64. inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
  65. inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
  66. inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
  67. inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
  68. inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
  69. inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
  70. inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
  71. inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
  72. inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
  73. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
  74. inspect_ai/_view/www/src/components/FindBand.css +49 -0
  75. inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
  76. inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
  77. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
  78. inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
  79. inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
  80. inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
  81. inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
  82. inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
  83. inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
  84. inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
  85. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
  86. inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
  87. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
  88. inspect_ai/_view/www/src/components/MessageBand.css +43 -0
  89. inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
  90. inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
  91. inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
  92. inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
  93. inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
  94. inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
  95. inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
  96. inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
  97. inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
  98. inspect_ai/_view/www/src/components/ToolButton.css +3 -0
  99. inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
  100. inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
  101. inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
  102. inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
  103. inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
  104. inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
  105. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
  106. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
  107. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
  108. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
  109. inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
  110. inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
  111. inspect_ai/_view/www/src/metadata/types.ts +18 -0
  112. inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
  113. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
  114. inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
  115. inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
  116. inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
  117. inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
  118. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
  119. inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
  120. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
  121. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
  122. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
  123. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
  124. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
  125. inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
  126. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
  127. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
  128. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
  129. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
  130. inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
  131. inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
  132. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
  133. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
  134. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
  135. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
  136. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
  137. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
  138. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
  139. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
  140. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
  141. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
  142. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
  143. inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
  144. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
  145. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
  146. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
  147. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
  148. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
  149. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
  150. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
  151. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
  152. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
  153. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
  154. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
  155. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
  156. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
  157. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
  158. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
  159. inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
  160. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
  161. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
  162. inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
  163. inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
  164. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
  165. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
  166. inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
  167. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
  168. inspect_ai/_view/www/src/samples/error/error.ts +15 -0
  169. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
  170. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
  171. inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
  172. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
  173. inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
  174. inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
  175. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
  176. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
  177. inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
  178. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
  179. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
  180. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
  181. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
  182. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
  183. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
  184. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
  185. inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
  186. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
  187. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
  188. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
  189. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
  190. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
  191. inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
  192. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
  193. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
  194. inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
  195. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
  196. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
  197. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
  198. inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
  199. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
  200. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
  201. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
  202. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
  203. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
  204. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
  205. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
  206. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
  207. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
  208. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
  209. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
  210. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
  211. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
  212. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
  213. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
  214. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
  215. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
  216. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
  217. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
  218. inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
  219. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
  220. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
  221. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
  222. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
  223. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
  224. inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
  225. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
  226. inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
  227. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
  228. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
  229. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
  230. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
  231. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
  232. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
  233. inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
  234. inspect_ai/_view/www/src/types/log.d.ts +108 -19
  235. inspect_ai/_view/www/src/types/prism.d.ts +11 -0
  236. inspect_ai/_view/www/src/types.ts +71 -0
  237. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
  238. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
  239. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
  240. inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
  241. inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
  242. inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
  243. inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
  244. inspect_ai/_view/www/src/utils/attachments.ts +42 -0
  245. inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
  246. inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
  247. inspect_ai/_view/www/src/utils/debugging.ts +28 -0
  248. inspect_ai/_view/www/src/utils/dom.ts +30 -0
  249. inspect_ai/_view/www/src/utils/format.ts +194 -0
  250. inspect_ai/_view/www/src/utils/git.ts +7 -0
  251. inspect_ai/_view/www/src/utils/html.ts +6 -0
  252. inspect_ai/_view/www/src/utils/http.ts +14 -0
  253. inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
  254. inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
  255. inspect_ai/_view/www/src/utils/queue.ts +51 -0
  256. inspect_ai/_view/www/src/utils/sync.ts +114 -0
  257. inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
  258. inspect_ai/_view/www/src/utils/vscode.ts +13 -0
  259. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
  260. inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
  261. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
  262. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
  263. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
  264. inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
  265. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
  266. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
  267. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
  268. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
  269. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
  270. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
  271. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
  272. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
  273. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
  274. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
  275. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
  276. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
  277. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
  278. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
  279. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
  280. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
  281. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
  282. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
  283. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
  284. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
  285. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
  286. inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
  287. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
  288. inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
  289. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
  290. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
  291. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
  292. inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
  293. inspect_ai/_view/www/src/workspace/types.ts +10 -0
  294. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  295. inspect_ai/_view/www/tsconfig.json +23 -9
  296. inspect_ai/_view/www/vite.config.js +8 -17
  297. inspect_ai/_view/www/yarn.lock +627 -556
  298. inspect_ai/approval/_approval.py +2 -0
  299. inspect_ai/approval/_approver.py +4 -4
  300. inspect_ai/approval/_auto.py +1 -1
  301. inspect_ai/approval/_human/approver.py +3 -0
  302. inspect_ai/approval/_policy.py +5 -0
  303. inspect_ai/approval/_registry.py +2 -2
  304. inspect_ai/dataset/_dataset.py +64 -37
  305. inspect_ai/dataset/_sources/__init__.py +0 -0
  306. inspect_ai/dataset/_sources/csv.py +20 -12
  307. inspect_ai/dataset/_sources/file.py +4 -0
  308. inspect_ai/dataset/_sources/hf.py +39 -29
  309. inspect_ai/dataset/_sources/json.py +17 -9
  310. inspect_ai/log/__init__.py +2 -0
  311. inspect_ai/log/_convert.py +3 -3
  312. inspect_ai/log/_file.py +24 -9
  313. inspect_ai/log/_log.py +101 -13
  314. inspect_ai/log/_message.py +4 -2
  315. inspect_ai/log/_recorders/file.py +4 -0
  316. inspect_ai/log/_recorders/json.py +5 -7
  317. inspect_ai/log/_recorders/recorder.py +3 -0
  318. inspect_ai/log/_transcript.py +19 -8
  319. inspect_ai/model/__init__.py +2 -0
  320. inspect_ai/model/_cache.py +39 -21
  321. inspect_ai/model/_call_tools.py +4 -3
  322. inspect_ai/model/_chat_message.py +14 -4
  323. inspect_ai/model/_generate_config.py +1 -1
  324. inspect_ai/model/_model.py +31 -24
  325. inspect_ai/model/_model_output.py +14 -1
  326. inspect_ai/model/_openai.py +10 -18
  327. inspect_ai/model/_providers/anthropic.py +3 -3
  328. inspect_ai/model/_providers/google.py +9 -5
  329. inspect_ai/model/_providers/openai.py +5 -9
  330. inspect_ai/model/_providers/openai_o1.py +3 -5
  331. inspect_ai/model/_providers/openrouter.py +86 -0
  332. inspect_ai/model/_providers/providers.py +11 -0
  333. inspect_ai/scorer/__init__.py +6 -1
  334. inspect_ai/scorer/_answer.py +7 -7
  335. inspect_ai/scorer/_classification.py +38 -18
  336. inspect_ai/scorer/_common.py +2 -8
  337. inspect_ai/scorer/_match.py +4 -5
  338. inspect_ai/scorer/_metric.py +87 -28
  339. inspect_ai/scorer/_metrics/__init__.py +3 -3
  340. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  341. inspect_ai/scorer/_metrics/mean.py +3 -17
  342. inspect_ai/scorer/_metrics/std.py +111 -30
  343. inspect_ai/scorer/_model.py +12 -12
  344. inspect_ai/scorer/_pattern.py +3 -3
  345. inspect_ai/scorer/_reducer/reducer.py +36 -21
  346. inspect_ai/scorer/_reducer/registry.py +2 -2
  347. inspect_ai/scorer/_reducer/types.py +7 -1
  348. inspect_ai/scorer/_score.py +11 -1
  349. inspect_ai/scorer/_scorer.py +110 -16
  350. inspect_ai/solver/__init__.py +1 -1
  351. inspect_ai/solver/_basic_agent.py +19 -22
  352. inspect_ai/solver/_bridge/__init__.py +0 -3
  353. inspect_ai/solver/_bridge/bridge.py +3 -3
  354. inspect_ai/solver/_chain.py +1 -2
  355. inspect_ai/solver/_critique.py +3 -3
  356. inspect_ai/solver/_fork.py +2 -2
  357. inspect_ai/solver/_human_agent/__init__.py +0 -0
  358. inspect_ai/solver/_human_agent/agent.py +5 -8
  359. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  360. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  361. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  362. inspect_ai/solver/_multiple_choice.py +38 -26
  363. inspect_ai/solver/_prompt.py +7 -7
  364. inspect_ai/solver/_solver.py +53 -52
  365. inspect_ai/solver/_task_state.py +80 -69
  366. inspect_ai/solver/_use_tools.py +9 -9
  367. inspect_ai/tool/__init__.py +4 -1
  368. inspect_ai/tool/_tool.py +43 -14
  369. inspect_ai/tool/_tool_call.py +6 -2
  370. inspect_ai/tool/_tool_choice.py +3 -1
  371. inspect_ai/tool/_tool_def.py +10 -8
  372. inspect_ai/tool/_tool_params.py +24 -0
  373. inspect_ai/tool/_tool_with.py +7 -7
  374. inspect_ai/tool/_tools/__init__.py +0 -0
  375. inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
  376. inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
  377. inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
  378. inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
  379. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  380. inspect_ai/tool/_tools/_execute.py +23 -11
  381. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  382. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  383. inspect_ai/tool/_tools/_web_search.py +7 -5
  384. inspect_ai/tool/beta.py +3 -0
  385. inspect_ai/util/_concurrency.py +3 -3
  386. inspect_ai/util/_panel.py +2 -0
  387. inspect_ai/util/_resource.py +12 -12
  388. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  389. inspect_ai/util/_sandbox/docker/config.py +2 -1
  390. inspect_ai/util/_sandbox/docker/docker.py +42 -86
  391. inspect_ai/util/_sandbox/docker/service.py +100 -0
  392. inspect_ai/util/_sandbox/environment.py +99 -96
  393. inspect_ai/util/_sandbox/self_check.py +124 -16
  394. inspect_ai/util/_subprocess.py +5 -3
  395. inspect_ai/util/_subtask.py +15 -16
  396. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
  397. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
  398. inspect_ai-0.3.64.dist-info/RECORD +625 -0
  399. inspect_ai/_view/www/src/Register.mjs +0 -3
  400. inspect_ai/_view/www/src/Types.mjs +0 -38
  401. inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
  402. inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
  403. inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
  404. inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
  405. inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
  406. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
  407. inspect_ai/_view/www/src/components/Card.mjs +0 -126
  408. inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
  409. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
  410. inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
  411. inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
  412. inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
  413. inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
  414. inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
  415. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
  416. inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
  417. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
  418. inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
  419. inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
  420. inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
  421. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
  422. inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
  423. inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
  424. inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
  425. inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
  426. inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
  427. inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
  428. inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
  429. inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
  430. inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
  431. inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
  432. inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
  433. inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
  434. inspect_ai/_view/www/src/components/Tools.mjs +0 -376
  435. inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
  436. inspect_ai/_view/www/src/components/ansi-output.js +0 -932
  437. inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
  438. inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
  439. inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
  440. inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
  441. inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
  442. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
  443. inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
  444. inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
  445. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
  446. inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
  447. inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
  448. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
  449. inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
  450. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
  451. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
  452. inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
  453. inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
  454. inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
  455. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
  456. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
  457. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
  458. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
  459. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
  460. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
  461. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
  462. inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
  463. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
  464. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
  465. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
  466. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
  467. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
  468. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
  469. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
  470. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
  471. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
  472. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
  473. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
  474. inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
  475. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
  476. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
  477. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
  478. inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
  479. inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
  480. inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
  481. inspect_ai/_view/www/src/utils/Format.mjs +0 -260
  482. inspect_ai/_view/www/src/utils/Git.mjs +0 -12
  483. inspect_ai/_view/www/src/utils/Html.mjs +0 -21
  484. inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
  485. inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
  486. inspect_ai/_view/www/src/utils/http.mjs +0 -18
  487. inspect_ai/_view/www/src/utils/queue.mjs +0 -67
  488. inspect_ai/_view/www/src/utils/sync.mjs +0 -101
  489. inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
  490. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
  491. inspect_ai/tool/beta/__init__.py +0 -5
  492. inspect_ai-0.3.62.dist-info/RECORD +0 -481
  493. /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
  494. /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
  495. /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
  496. /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
  497. /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
  498. /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
  499. /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
  500. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
  501. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
  502. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
  503. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
  504. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
  505. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  506. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
  507. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
  508. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
  509. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
  510. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
  511. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
  512. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
  513. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
  514. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
  515. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
  516. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
  517. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
  518. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/score.py CHANGED
@@ -1,15 +1,16 @@
1
1
  import asyncio
2
2
  from copy import deepcopy
3
- from typing import Callable, cast
3
+ from pathlib import Path
4
+ from typing import Any, Callable, Literal, cast
4
5
 
5
6
  from inspect_ai._display import display
6
- from inspect_ai._util.path import chdir_python
7
+ from inspect_ai._eval.loader import scorer_from_spec
7
8
  from inspect_ai._util.platform import platform_init
8
9
  from inspect_ai._util.registry import registry_create, registry_unqualified_name
9
10
  from inspect_ai.log import (
10
11
  EvalLog,
11
- EvalMetric,
12
12
  )
13
+ from inspect_ai.log._log import EvalMetricDefinition
13
14
  from inspect_ai.model import ModelName
14
15
  from inspect_ai.scorer import Metric, Scorer, Target
15
16
  from inspect_ai.scorer._metric import SampleScore
@@ -19,18 +20,19 @@ from inspect_ai.scorer._reducer import (
19
20
  create_reducers,
20
21
  reducer_log_names,
21
22
  )
22
- from inspect_ai.scorer._scorer import unique_scorer_name
23
+ from inspect_ai.scorer._scorer import ScorerSpec, unique_scorer_name
23
24
  from inspect_ai.solver import TaskState
24
25
 
25
- from .task import Task
26
26
  from .task.results import eval_results
27
- from .task.util import task_run_dir
27
+
28
+ ScoreAction = Literal["append", "overwrite"]
28
29
 
29
30
 
30
31
  def score(
31
32
  log: EvalLog,
32
33
  scorers: Scorer | list[Scorer],
33
34
  epochs_reducer: ScoreReducers | None = None,
35
+ action: ScoreAction | None = None,
34
36
  ) -> EvalLog:
35
37
  """Score an evaluation log.
36
38
 
@@ -40,6 +42,7 @@ def score(
40
42
  epochs_reducer (ScoreReducers | None):
41
43
  Reducer function(s) for aggregating scores in each sample.
42
44
  Defaults to previously used reducer(s).
45
+ action: Whether to append or overwrite this score
43
46
 
44
47
  Returns:
45
48
  Log with scores yielded by scorer.
@@ -50,13 +53,14 @@ def score(
50
53
  # resolve scorers into a list
51
54
  scorers = [scorers] if isinstance(scorers, Scorer) else scorers
52
55
 
53
- return asyncio.run(score_async(log, scorers, epochs_reducer))
56
+ return asyncio.run(score_async(log, scorers, epochs_reducer, action))
54
57
 
55
58
 
56
59
  async def score_async(
57
60
  log: EvalLog,
58
61
  scorers: list[Scorer],
59
62
  epochs_reducer: ScoreReducers | None = None,
63
+ action: ScoreAction | None = None,
60
64
  ) -> EvalLog:
61
65
  """Score an evaluation log.
62
66
 
@@ -66,6 +70,8 @@ async def score_async(
66
70
  epochs_reducer (ScoreReducers | None):
67
71
  Reducer function(s) for aggregating scores in each sample.
68
72
  Defaults to previously used reducer(s).
73
+ action: Whether to append or overwrite this score
74
+
69
75
 
70
76
 
71
77
  Returns:
@@ -109,7 +115,22 @@ async def score_async(
109
115
 
110
116
  # write them back (gather ensures that they come back in the same order)
111
117
  for index, score in enumerate(scores):
112
- log.samples[index].scores = {k: v.score for k, v in score.items()}
118
+ if action == "overwrite":
119
+ log.samples[index].scores = {k: v.score for k, v in score.items()}
120
+ else:
121
+ existing_scores = log.samples[index].scores or {}
122
+ new_scores = {k: v.score for k, v in score.items()}
123
+
124
+ for key, value in new_scores.items():
125
+ if key not in existing_scores:
126
+ existing_scores[key] = value
127
+ else:
128
+ # This key already exists, dedupe its name
129
+ count = 1
130
+ while f"{key}-{count}" in existing_scores.keys():
131
+ count = count + 1
132
+ existing_scores[f"{key}-{count}"] = value
133
+ log.samples[index].scores = existing_scores
113
134
 
114
135
  # collect metrics from EvalLog (they may overlap w/ the scorer metrics,
115
136
  # that will be taken care of in eval_results)
@@ -130,30 +151,37 @@ async def score_async(
130
151
  return log
131
152
 
132
153
 
133
- async def task_score(task: Task, log: EvalLog) -> EvalLog:
134
- with chdir_python(task_run_dir(task)):
135
- # confirm we have a scorer
136
- if task.scorer is None:
137
- raise ValueError("You must specify a scorer for evals to be scored.")
154
+ async def task_score(
155
+ log: EvalLog,
156
+ scorer: str | None = None,
157
+ scorer_args: dict[str, Any] | None = None,
158
+ action: ScoreAction | None = None,
159
+ ) -> EvalLog:
160
+ # confirm we have a scorer
161
+ scorers = resolve_scorers(log, scorer, scorer_args)
162
+ if len(scorers) == 0:
163
+ raise ValueError(
164
+ "Unable to resolve any scorers for this log. Please specify a scorer using the '--scorer' param."
165
+ )
138
166
 
139
- # confirm we have samples
140
- if log.samples is None or len(log.samples) == 0:
141
- raise ValueError("There are no samples to score in the log.")
167
+ # confirm we have samples
168
+ if log.samples is None or len(log.samples) == 0:
169
+ raise ValueError("There are no samples to score in the log.")
142
170
 
143
- task_name = task.name
144
- display().print(f"Scoring {len(log.samples)} samples for task: {task_name}")
171
+ task_name = log.eval.task
172
+ display().print(f"\nScoring {task_name} ({len(log.samples)} samples)")
145
173
 
146
- # perform scoring
147
- log = await score_async(log, task.scorer)
174
+ # perform scoring
175
+ log = await score_async(log=log, scorers=scorers, action=action)
148
176
 
149
177
  # compute and log metrics
150
- display().print(f"Aggregating scores for task: {task_name}")
151
- if task.scorer and log.samples:
178
+ if log.samples:
152
179
  sample_scores = [
153
180
  {
154
181
  score_key: SampleScore(
155
182
  score=score,
156
183
  sample_id=sample.id,
184
+ sample_metadata=sample.metadata,
157
185
  )
158
186
  for score_key, score in sample.scores.items()
159
187
  }
@@ -161,12 +189,15 @@ async def task_score(task: Task, log: EvalLog) -> EvalLog:
161
189
  if sample.scores is not None
162
190
  ]
163
191
 
192
+ epochs_reducer = reducers_from_log(log)
193
+ metrics = metrics_from_log(log)
194
+
164
195
  log.results, log.reductions = eval_results(
165
196
  log.results.total_samples if log.results else 0,
166
197
  sample_scores,
167
- task.epochs_reducer,
168
- task.scorer,
169
- task.metrics,
198
+ epochs_reducer,
199
+ scorers,
200
+ metrics,
170
201
  )
171
202
  return log
172
203
 
@@ -185,6 +216,7 @@ async def run_score_task(
185
216
  results[scorer_name] = SampleScore(
186
217
  score=result,
187
218
  sample_id=state.sample_id,
219
+ sample_metadata=state.metadata,
188
220
  scorer=registry_unqualified_name(scorer),
189
221
  )
190
222
 
@@ -192,21 +224,78 @@ async def run_score_task(
192
224
  return results
193
225
 
194
226
 
195
- def metrics_from_log(log: EvalLog) -> list[Metric]:
227
+ def metrics_from_log(log: EvalLog) -> list[Metric] | dict[str, list[Metric]] | None:
228
+ # See if we have metrics in the eval itself
229
+ if log.eval.metrics:
230
+ if isinstance(log.eval.metrics, list):
231
+ return [metric_from_log(metric) for metric in log.eval.metrics]
232
+ else:
233
+ return {
234
+ key: [metric_from_log(metric) for metric in metrics]
235
+ for key, metrics in log.eval.metrics.items()
236
+ }
237
+ return None
238
+
239
+
240
+ def metric_from_log(metric: EvalMetricDefinition) -> Metric:
241
+ return cast(
242
+ Metric, registry_create("metric", metric.name, **(metric.options or {}))
243
+ )
244
+
245
+
246
+ def reducers_from_log(log: EvalLog) -> list[ScoreReducer] | None:
247
+ return create_reducers(log.eval.config.epochs_reducer)
248
+
249
+
250
+ def resolve_scorers(
251
+ log: EvalLog, scorer: str | None, scorer_args: dict[str, Any] | None
252
+ ) -> list[Scorer]:
253
+ """
254
+ Create a list of Scorer objects from an evaluation log.
255
+
256
+ Args:
257
+ log: EvalLog object containing evaluation configuration and results
258
+ scorer:: Scorer name (simple name or file.py@name).
259
+ scorer_args: Dictionary of scorer arguments
260
+
261
+ Returns:
262
+ list[Scorer]: List of initialized scorers
263
+ """
264
+ # resolve the scorer path
265
+ task_path = Path(log.eval.task_file) if log.eval.task_file else None
266
+
267
+ # If there is an explicit scorer
268
+ if scorer:
269
+ return [
270
+ scorer_from_spec(
271
+ spec=ScorerSpec(scorer=scorer),
272
+ task_path=task_path,
273
+ **(scorer_args or {}),
274
+ )
275
+ ]
276
+ # See if we can create scorers from the eval itself
277
+ elif log.eval.scorers is not None:
278
+ return (
279
+ [
280
+ scorer_from_spec(
281
+ spec=ScorerSpec(scorer=score.name),
282
+ task_path=task_path,
283
+ **(score.options or {}),
284
+ )
285
+ for score in log.eval.scorers
286
+ ]
287
+ if log.results
288
+ else []
289
+ )
290
+
291
+ # Otherwise, perhaps we can re-create them from the results
196
292
  return (
197
293
  [
198
- metric_from_log(metric)
294
+ scorer_from_spec(
295
+ spec=ScorerSpec(scorer=score.name), task_path=task_path, **score.params
296
+ )
199
297
  for score in log.results.scores
200
- for metric in score.metrics.values()
201
298
  ]
202
299
  if log.results
203
300
  else []
204
301
  )
205
-
206
-
207
- def metric_from_log(metric: EvalMetric) -> Metric:
208
- return cast(Metric, registry_create("metric", metric.name, **metric.options))
209
-
210
-
211
- def reducers_from_log(log: EvalLog) -> list[ScoreReducer] | None:
212
- return create_reducers(log.eval.config.epochs_reducer)
@@ -4,9 +4,7 @@ from typing import Any, Literal, cast
4
4
  from shortuuid import uuid
5
5
 
6
6
  from inspect_ai._eval.task.util import slice_dataset
7
- from inspect_ai._util.constants import (
8
- PKG_NAME,
9
- )
7
+ from inspect_ai._util.constants import PKG_NAME
10
8
  from inspect_ai._util.datetime import iso_now
11
9
  from inspect_ai._util.git import git_context
12
10
  from inspect_ai._util.path import cwd_relative_path
@@ -27,7 +25,13 @@ from inspect_ai.log import (
27
25
  EvalSpec,
28
26
  EvalStats,
29
27
  )
30
- from inspect_ai.log._log import EvalLog, EvalSampleReductions
28
+ from inspect_ai.log._log import (
29
+ EvalLog,
30
+ EvalMetricDefinition,
31
+ EvalSampleReductions,
32
+ EvalScorer,
33
+ eval_config_defaults,
34
+ )
31
35
  from inspect_ai.log._recorders import Recorder
32
36
  from inspect_ai.model import (
33
37
  GenerateConfig,
@@ -35,6 +39,8 @@ from inspect_ai.model import (
35
39
  ModelName,
36
40
  )
37
41
  from inspect_ai.model._model import model_usage
42
+ from inspect_ai.scorer._metric import MetricSpec
43
+ from inspect_ai.scorer._scorer import ScorerSpec
38
44
  from inspect_ai.solver._plan import Plan
39
45
  from inspect_ai.solver._solver import Solver, SolverSpec
40
46
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
@@ -52,6 +58,8 @@ class TaskLogger:
52
58
  tags: list[str] | None,
53
59
  model: Model,
54
60
  dataset: Dataset,
61
+ scorer: list[ScorerSpec] | None,
62
+ metrics: list[MetricSpec] | dict[str, list[MetricSpec]] | None,
55
63
  sandbox: SandboxEnvironmentSpec | None,
56
64
  task_attribs: dict[str, Any],
57
65
  task_args: dict[str, Any],
@@ -92,6 +100,17 @@ class TaskLogger:
92
100
  ],
93
101
  )
94
102
 
103
+ # write defaults for unspecified config
104
+ for name, value in eval_config_defaults().items():
105
+ if getattr(eval_config, name, None) is None:
106
+ setattr(eval_config, name, value)
107
+
108
+ # resolve scorers
109
+ eval_scorers = resolve_eval_scorers(scorer)
110
+
111
+ # resolve metrics
112
+ eval_metrics = resolve_eval_metrics(metrics)
113
+
95
114
  # create eval spec
96
115
  self.eval = EvalSpec(
97
116
  run_id=run_id,
@@ -114,6 +133,8 @@ class TaskLogger:
114
133
  sample_ids=sample_ids,
115
134
  shuffled=dataset.shuffled,
116
135
  ),
136
+ scorers=eval_scorers,
137
+ metrics=eval_metrics,
117
138
  sandbox=sandbox,
118
139
  model_args=model_args,
119
140
  config=eval_config,
@@ -200,3 +221,83 @@ def collect_eval_data(stats: EvalStats) -> None:
200
221
  # collect stats
201
222
  stats.completed_at = iso_now()
202
223
  stats.model_usage = model_usage()
224
+
225
+
226
+ def resolve_eval_metrics(
227
+ metrics: list[MetricSpec] | dict[str, list[MetricSpec]] | None,
228
+ ) -> list[EvalMetricDefinition] | dict[str, list[EvalMetricDefinition]] | None:
229
+ if metrics is None:
230
+ return None
231
+ elif isinstance(metrics, list):
232
+ return [EvalMetricDefinition(name=m.metric, options=m.args) for m in metrics]
233
+ else:
234
+ return {
235
+ k: [
236
+ EvalMetricDefinition(name=v.metric, options=v.args) for v in metric_list
237
+ ]
238
+ for k, metric_list in metrics.items()
239
+ }
240
+
241
+
242
+ def resolve_eval_scorers(scorers: list[ScorerSpec] | None) -> list[EvalScorer] | None:
243
+ if scorers is None:
244
+ return None
245
+ else:
246
+ results = []
247
+ for scorer in scorers:
248
+ results.append(
249
+ EvalScorer(
250
+ name=scorer.scorer,
251
+ metrics=resolve_scorer_metrics(scorer.metrics),
252
+ options=scorer.args,
253
+ metadata=scorer.metadata,
254
+ )
255
+ )
256
+ return results
257
+
258
+
259
+ def resolve_scorer_metrics(
260
+ metrics: list[MetricSpec | dict[str, list[MetricSpec]]]
261
+ | dict[str, list[MetricSpec]]
262
+ | None,
263
+ ) -> (
264
+ list[EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]]
265
+ | dict[str, list[EvalMetricDefinition]]
266
+ | None
267
+ ):
268
+ if metrics is None:
269
+ return None
270
+ elif isinstance(metrics, list):
271
+ resolved_metrics: list[
272
+ EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]
273
+ ] = []
274
+ for metric_item in metrics:
275
+ if isinstance(metric_item, MetricSpec):
276
+ resolved_metrics.append(
277
+ EvalMetricDefinition(
278
+ name=metric_item.metric, options=metric_item.args
279
+ )
280
+ )
281
+ elif isinstance(metric_item, dict):
282
+ resolved_metrics.append(
283
+ {
284
+ metric_group: [
285
+ EvalMetricDefinition(
286
+ name=metric_spec.metric, options=metric_spec.args
287
+ )
288
+ for metric_spec in metric_specs
289
+ ]
290
+ for metric_group, metric_specs in metric_item.items()
291
+ }
292
+ )
293
+ else:
294
+ raise TypeError(f"Unexpected item in list: {metric_item}")
295
+ return resolved_metrics
296
+ else:
297
+ return {
298
+ metric_group: [
299
+ EvalMetricDefinition(name=metric_spec.metric, options=metric_spec.args)
300
+ for metric_spec in metric_specs
301
+ ]
302
+ for metric_group, metric_specs in metrics.items()
303
+ }
@@ -1,10 +1,13 @@
1
1
  import fnmatch
2
+ import inspect
3
+ import logging
2
4
  import re
3
5
  from collections import defaultdict
4
6
  from copy import deepcopy
5
7
  from dataclasses import dataclass, field
6
- from typing import Any, Tuple, cast
8
+ from typing import Any, Tuple, TypeGuard, cast, get_args, get_origin, get_type_hints
7
9
 
10
+ from inspect_ai._util.logger import warn_once
8
11
  from inspect_ai._util.registry import (
9
12
  registry_info,
10
13
  registry_log_name,
@@ -19,7 +22,12 @@ from inspect_ai.log import (
19
22
  )
20
23
  from inspect_ai.log._log import EvalSampleReductions
21
24
  from inspect_ai.scorer import Metric, Score, Scorer
22
- from inspect_ai.scorer._metric import SampleScore
25
+ from inspect_ai.scorer._metric import (
26
+ MetricDeprecated,
27
+ MetricProtocol,
28
+ SampleScore,
29
+ Value,
30
+ )
23
31
  from inspect_ai.scorer._metrics.accuracy import accuracy
24
32
  from inspect_ai.scorer._metrics.std import stderr
25
33
  from inspect_ai.scorer._reducer import ScoreReducer, mean_score, reducer_log_name
@@ -29,6 +37,8 @@ from inspect_ai.scorer._scorer import (
29
37
  unique_scorer_name,
30
38
  )
31
39
 
40
+ logger = logging.getLogger(__name__)
41
+
32
42
 
33
43
  @dataclass
34
44
  class ScorerInfo:
@@ -99,12 +109,14 @@ def eval_results(
99
109
  reduced_samples = EvalSampleReductions(
100
110
  scorer=scorer_name,
101
111
  reducer=reducer_display_nm,
102
- samples=reduced_scores,
112
+ samples=[
113
+ EvalSampleScore(**ss.score.__dict__, sample_id=ss.sample_id)
114
+ for ss in reduced_scores
115
+ ],
103
116
  )
104
117
  sample_reductions.append(reduced_samples)
105
118
 
106
119
  # Compute metrics for this scorer
107
- simple_scores = cast(list[Score], reduced_scores)
108
120
  targets = metrics if metrics is not None else scorer_info.metrics
109
121
  if isinstance(targets, list):
110
122
  ## split the metrics into the simple metrics and any dictionary
@@ -119,7 +131,7 @@ def eval_results(
119
131
  scorer_for_metrics(
120
132
  scorer_name=scorer_name,
121
133
  scorer_info=scorer_info,
122
- scores=simple_scores,
134
+ sample_scores=reduced_scores,
123
135
  metrics=simple_metrics,
124
136
  reducer_name=reducer_display_nm,
125
137
  )
@@ -129,7 +141,7 @@ def eval_results(
129
141
  scorers_from_metric_dict(
130
142
  scorer_name=scorer_name,
131
143
  scorer_info=scorer_info,
132
- scores=simple_scores,
144
+ sample_scores=reduced_scores,
133
145
  metrics=dict_metric,
134
146
  reducer_name=reducer_display_nm,
135
147
  )
@@ -145,7 +157,7 @@ def eval_results(
145
157
  scorers_from_metric_dict(
146
158
  scorer_name=scorer_name,
147
159
  scorer_info=scorer_info,
148
- scores=simple_scores,
160
+ sample_scores=reduced_scores,
149
161
  metrics=targets,
150
162
  reducer_name=reducer_display_nm,
151
163
  )
@@ -184,7 +196,7 @@ def split_metrics(
184
196
  def scorer_for_metrics(
185
197
  scorer_name: str,
186
198
  scorer_info: ScorerInfo,
187
- scores: list[Score],
199
+ sample_scores: list[SampleScore],
188
200
  metrics: list[Metric],
189
201
  reducer_name: str | None = None,
190
202
  ) -> list[EvalScore]:
@@ -200,10 +212,10 @@ def scorer_for_metrics(
200
212
  key = metrics_unique_key(
201
213
  registry_unqualified_name(metric), list(list_metrics.keys())
202
214
  )
203
-
215
+ params = registry_params(metric)
204
216
  # process metric values
205
- if len(scores) > 0:
206
- metric_value = metric(scores)
217
+ if len(sample_scores) > 0:
218
+ metric_value = call_metric(metric, sample_scores)
207
219
  else:
208
220
  metric_value = float("Nan")
209
221
  base_metric_name = registry_log_name(metric)
@@ -215,8 +227,7 @@ def scorer_for_metrics(
215
227
  if value is not None:
216
228
  name = metrics_unique_key(metric_key, list(list_metrics.keys()))
217
229
  list_metrics[name] = EvalMetric(
218
- name=name,
219
- value=float(value),
230
+ name=name, value=float(value), params=params
220
231
  )
221
232
 
222
233
  # If the metric value is a list, turn each element in the list
@@ -229,13 +240,14 @@ def scorer_for_metrics(
229
240
  with_suffix(key, count), list(list_metrics.keys())
230
241
  )
231
242
 
232
- list_metrics[name] = EvalMetric(name=name, value=float(value))
243
+ list_metrics[name] = EvalMetric(
244
+ name=name, value=float(value), params=params
245
+ )
233
246
 
234
247
  # the metric is a float, str, or int
235
248
  else:
236
249
  list_metrics[key] = EvalMetric(
237
- name=base_metric_name,
238
- value=float(metric_value),
250
+ name=base_metric_name, value=float(metric_value), params=params
239
251
  )
240
252
 
241
253
  # build results
@@ -257,7 +269,7 @@ def scorer_for_metrics(
257
269
  def scorers_from_metric_dict(
258
270
  scorer_name: str,
259
271
  scorer_info: ScorerInfo,
260
- scores: list[Score],
272
+ sample_scores: list[SampleScore],
261
273
  metrics: dict[str, list[Metric]],
262
274
  reducer_name: str | None = None,
263
275
  ) -> list[EvalScore]:
@@ -265,18 +277,22 @@ def scorers_from_metric_dict(
265
277
 
266
278
  # Expand any metric keys
267
279
  resolved_metrics = (
268
- resolve_glob_metric_keys(metrics, scores[0]) if len(scores) > 0 else metrics
280
+ resolve_glob_metric_keys(metrics, sample_scores[0].score)
281
+ if len(sample_scores) > 0
282
+ else metrics
269
283
  )
270
284
 
271
285
  for metric_key, metric_list in resolved_metrics.items():
272
286
  # filter scores to a list of scalars with the value of the metric name
273
- metric_scores: list[Score] = []
274
- for score in scores:
275
- if isinstance(score.value, dict):
276
- if metric_key in score.value:
287
+ metric_scores: list[SampleScore] = []
288
+ for sample_score in sample_scores:
289
+ if isinstance(sample_score.score.value, dict):
290
+ if metric_key in sample_score.score.value:
277
291
  # Convert the score into a simple scalar value to apply metrics
278
- metric_score = deepcopy(score)
279
- metric_score.value = cast(float, score.value[metric_key])
292
+ metric_score = deepcopy(sample_score)
293
+ metric_score.score.value = cast(
294
+ float, sample_score.score.value[metric_key]
295
+ )
280
296
  metric_scores.append(metric_score)
281
297
  else:
282
298
  raise TypeError(
@@ -291,8 +307,9 @@ def scorers_from_metric_dict(
291
307
  for target_metric in metric_list:
292
308
  # compute the metric value
293
309
  metric_name = registry_log_name(target_metric)
310
+ metric_params = registry_params(target_metric)
294
311
  if len(metric_scores) > 0:
295
- value = target_metric(metric_scores)
312
+ value = call_metric(target_metric, metric_scores)
296
313
  else:
297
314
  value = float("Nan")
298
315
 
@@ -302,20 +319,17 @@ def scorers_from_metric_dict(
302
319
  for key, val in value.items():
303
320
  name = f"{metric_name}_{key}"
304
321
  result_metrics[name] = EvalMetric(
305
- name=name,
306
- value=cast(float, val),
322
+ name=name, value=cast(float, val), params=metric_params
307
323
  )
308
324
  elif isinstance(value, list):
309
325
  for idx, item in enumerate(value):
310
326
  name = f"{metric_name}_{idx}"
311
327
  result_metrics[name] = EvalMetric(
312
- name=name,
313
- value=cast(float, item),
328
+ name=name, value=cast(float, item), params=metric_params
314
329
  )
315
330
  else:
316
331
  result_metrics[metric_name] = EvalMetric(
317
- name=metric_name,
318
- value=cast(float, value),
332
+ name=metric_name, value=cast(float, value), params=metric_params
319
333
  )
320
334
 
321
335
  # create a scorer result for this metric
@@ -336,6 +350,48 @@ def scorers_from_metric_dict(
336
350
  return results
337
351
 
338
352
 
353
+ def call_metric(metric: Metric, sample_scores: list[SampleScore]) -> Value:
354
+ if is_metric_deprecated(metric):
355
+ warn_once(
356
+ logger,
357
+ f"Metric {registry_log_name(metric)} should be updated to take list[SampleScore]. "
358
+ f"Metrics with list[Score] are deprecated.",
359
+ )
360
+ scores = [sample_score.score for sample_score in sample_scores]
361
+ return metric(scores)
362
+ else:
363
+ metric = cast(MetricProtocol, metric)
364
+ return metric(sample_scores)
365
+
366
+
367
+ def is_metric_deprecated(metric: Metric) -> TypeGuard[MetricDeprecated]:
368
+ """Type guard to check if a metric follows the deprecated signature."""
369
+ try:
370
+ # signature and params
371
+ sig = inspect.signature(metric)
372
+ param_types = get_type_hints(metric)
373
+
374
+ # there should be only one param, check it
375
+ first_param = next(iter(sig.parameters.values()), None)
376
+ if first_param is None:
377
+ # No parameters, who knows what this is, treat it as deprecated
378
+ return True
379
+
380
+ expected_type: Any = param_types.get(first_param.name, None)
381
+
382
+ if expected_type is None or expected_type is Any:
383
+ # no helpful type info, treat it as deprecated
384
+ return True
385
+
386
+ # Extract generic base type and arguments to check if it matches list[Score]
387
+ origin = get_origin(expected_type)
388
+ args = get_args(expected_type)
389
+
390
+ return origin is list and args == (Score,)
391
+ except (AttributeError, ValueError, TypeError):
392
+ return False
393
+
394
+
339
395
  def resolve_glob_metric_keys(
340
396
  metrics: dict[str, list[Metric]], base_score: Score
341
397
  ) -> dict[str, list[Metric]]:
@@ -375,7 +431,7 @@ def resolve_glob_metric_keys(
375
431
 
376
432
  def reduce_scores(
377
433
  scores: list[SampleScore], reducer: ScoreReducer
378
- ) -> list[EvalSampleScore]:
434
+ ) -> list[SampleScore]:
379
435
  # Group the scores by sample_id
380
436
  grouped_scores: dict[str, list[SampleScore]] = defaultdict(list)
381
437
  for sample_score in scores:
@@ -383,16 +439,14 @@ def reduce_scores(
383
439
  grouped_scores[str(sample_score.sample_id)].append(sample_score)
384
440
 
385
441
  # reduce the scores
386
- reduced_scores: list[EvalSampleScore] = []
442
+ reduced_scores: list[SampleScore] = []
387
443
  for scores in grouped_scores.values():
388
444
  reduced = reducer([score.score for score in scores])
389
445
  reduced_scores.append(
390
- EvalSampleScore(
446
+ SampleScore(
391
447
  sample_id=scores[0].sample_id,
392
- value=reduced.value,
393
- answer=reduced.answer,
394
- explanation=reduced.explanation,
395
- metadata=reduced.metadata,
448
+ sample_metadata=scores[0].sample_metadata,
449
+ score=reduced,
396
450
  )
397
451
  )
398
452