inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (518) hide show
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/main.py +1 -1
  8. inspect_ai/_cli/sandbox.py +4 -1
  9. inspect_ai/_cli/score.py +181 -32
  10. inspect_ai/_cli/trace.py +10 -0
  11. inspect_ai/_cli/view.py +4 -2
  12. inspect_ai/_display/core/active.py +2 -3
  13. inspect_ai/_display/core/config.py +7 -1
  14. inspect_ai/_display/textual/widgets/samples.py +4 -3
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/eval.py +104 -101
  17. inspect_ai/_eval/evalset.py +75 -75
  18. inspect_ai/_eval/loader.py +122 -12
  19. inspect_ai/_eval/registry.py +1 -1
  20. inspect_ai/_eval/run.py +14 -0
  21. inspect_ai/_eval/score.py +125 -36
  22. inspect_ai/_eval/task/log.py +105 -4
  23. inspect_ai/_eval/task/results.py +92 -38
  24. inspect_ai/_eval/task/run.py +9 -2
  25. inspect_ai/_eval/task/sandbox.py +35 -2
  26. inspect_ai/_eval/task/task.py +49 -46
  27. inspect_ai/_util/constants.py +1 -1
  28. inspect_ai/_util/content.py +8 -0
  29. inspect_ai/_util/error.py +2 -0
  30. inspect_ai/_util/file.py +15 -1
  31. inspect_ai/_util/hash.py +1 -1
  32. inspect_ai/_util/logger.py +4 -2
  33. inspect_ai/_util/registry.py +7 -1
  34. inspect_ai/_view/view.py +1 -2
  35. inspect_ai/_view/www/.vscode/extensions.json +3 -0
  36. inspect_ai/_view/www/.vscode/settings.json +8 -0
  37. inspect_ai/_view/www/App.css +97 -29
  38. inspect_ai/_view/www/README.md +1 -1
  39. inspect_ai/_view/www/dist/assets/index.css +16663 -14674
  40. inspect_ai/_view/www/dist/assets/index.js +58808 -51348
  41. inspect_ai/_view/www/dist/index.html +1 -1
  42. inspect_ai/_view/www/index.html +2 -2
  43. inspect_ai/_view/www/log-schema.json +87 -73
  44. inspect_ai/_view/www/package.json +22 -4
  45. inspect_ai/_view/www/postcss.config.cjs +8 -9
  46. inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
  47. inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
  48. inspect_ai/_view/www/src/api/api-browser.ts +2 -2
  49. inspect_ai/_view/www/src/api/api-http.ts +3 -5
  50. inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
  51. inspect_ai/_view/www/src/api/client-api.ts +4 -4
  52. inspect_ai/_view/www/src/api/index.ts +4 -4
  53. inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
  54. inspect_ai/_view/www/src/appearance/colors.ts +9 -0
  55. inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
  56. inspect_ai/_view/www/src/appearance/icons.ts +100 -0
  57. inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
  58. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
  59. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
  60. inspect_ai/_view/www/src/components/Card.css +60 -0
  61. inspect_ai/_view/www/src/components/Card.tsx +109 -0
  62. inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
  63. inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
  64. inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
  65. inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
  66. inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
  67. inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
  68. inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
  69. inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
  70. inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
  71. inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
  72. inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
  73. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
  74. inspect_ai/_view/www/src/components/FindBand.css +49 -0
  75. inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
  76. inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
  77. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
  78. inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
  79. inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
  80. inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
  81. inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
  82. inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
  83. inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
  84. inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
  85. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
  86. inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
  87. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
  88. inspect_ai/_view/www/src/components/MessageBand.css +43 -0
  89. inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
  90. inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
  91. inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
  92. inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
  93. inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
  94. inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
  95. inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
  96. inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
  97. inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
  98. inspect_ai/_view/www/src/components/ToolButton.css +3 -0
  99. inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
  100. inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
  101. inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
  102. inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
  103. inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
  104. inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
  105. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
  106. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
  107. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
  108. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
  109. inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
  110. inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
  111. inspect_ai/_view/www/src/metadata/types.ts +18 -0
  112. inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
  113. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
  114. inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
  115. inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
  116. inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
  117. inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
  118. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
  119. inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
  120. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
  121. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
  122. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
  123. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
  124. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
  125. inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
  126. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
  127. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
  128. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
  129. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
  130. inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
  131. inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
  132. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
  133. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
  134. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
  135. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
  136. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
  137. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
  138. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
  139. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
  140. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
  141. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
  142. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
  143. inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
  144. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
  145. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
  146. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
  147. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
  148. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
  149. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
  150. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
  151. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
  152. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
  153. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
  154. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
  155. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
  156. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
  157. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
  158. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
  159. inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
  160. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
  161. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
  162. inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
  163. inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
  164. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
  165. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
  166. inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
  167. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
  168. inspect_ai/_view/www/src/samples/error/error.ts +15 -0
  169. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
  170. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
  171. inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
  172. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
  173. inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
  174. inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
  175. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
  176. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
  177. inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
  178. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
  179. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
  180. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
  181. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
  182. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
  183. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
  184. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
  185. inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
  186. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
  187. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
  188. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
  189. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
  190. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
  191. inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
  192. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
  193. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
  194. inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
  195. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
  196. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
  197. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
  198. inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
  199. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
  200. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
  201. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
  202. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
  203. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
  204. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
  205. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
  206. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
  207. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
  208. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
  209. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
  210. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
  211. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
  212. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
  213. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
  214. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
  215. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
  216. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
  217. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
  218. inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
  219. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
  220. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
  221. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
  222. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
  223. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
  224. inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
  225. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
  226. inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
  227. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
  228. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
  229. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
  230. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
  231. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
  232. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
  233. inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
  234. inspect_ai/_view/www/src/types/log.d.ts +108 -19
  235. inspect_ai/_view/www/src/types/prism.d.ts +11 -0
  236. inspect_ai/_view/www/src/types.ts +71 -0
  237. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
  238. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
  239. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
  240. inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
  241. inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
  242. inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
  243. inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
  244. inspect_ai/_view/www/src/utils/attachments.ts +42 -0
  245. inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
  246. inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
  247. inspect_ai/_view/www/src/utils/debugging.ts +28 -0
  248. inspect_ai/_view/www/src/utils/dom.ts +30 -0
  249. inspect_ai/_view/www/src/utils/format.ts +194 -0
  250. inspect_ai/_view/www/src/utils/git.ts +7 -0
  251. inspect_ai/_view/www/src/utils/html.ts +6 -0
  252. inspect_ai/_view/www/src/utils/http.ts +14 -0
  253. inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
  254. inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
  255. inspect_ai/_view/www/src/utils/queue.ts +51 -0
  256. inspect_ai/_view/www/src/utils/sync.ts +114 -0
  257. inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
  258. inspect_ai/_view/www/src/utils/vscode.ts +13 -0
  259. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
  260. inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
  261. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
  262. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
  263. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
  264. inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
  265. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
  266. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
  267. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
  268. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
  269. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
  270. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
  271. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
  272. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
  273. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
  274. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
  275. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
  276. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
  277. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
  278. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
  279. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
  280. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
  281. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
  282. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
  283. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
  284. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
  285. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
  286. inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
  287. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
  288. inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
  289. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
  290. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
  291. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
  292. inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
  293. inspect_ai/_view/www/src/workspace/types.ts +10 -0
  294. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  295. inspect_ai/_view/www/tsconfig.json +23 -9
  296. inspect_ai/_view/www/vite.config.js +8 -17
  297. inspect_ai/_view/www/yarn.lock +627 -556
  298. inspect_ai/approval/_approval.py +2 -0
  299. inspect_ai/approval/_approver.py +4 -4
  300. inspect_ai/approval/_auto.py +1 -1
  301. inspect_ai/approval/_human/approver.py +3 -0
  302. inspect_ai/approval/_policy.py +5 -0
  303. inspect_ai/approval/_registry.py +2 -2
  304. inspect_ai/dataset/_dataset.py +64 -37
  305. inspect_ai/dataset/_sources/__init__.py +0 -0
  306. inspect_ai/dataset/_sources/csv.py +20 -12
  307. inspect_ai/dataset/_sources/file.py +4 -0
  308. inspect_ai/dataset/_sources/hf.py +39 -29
  309. inspect_ai/dataset/_sources/json.py +17 -9
  310. inspect_ai/log/__init__.py +2 -0
  311. inspect_ai/log/_convert.py +3 -3
  312. inspect_ai/log/_file.py +24 -9
  313. inspect_ai/log/_log.py +101 -13
  314. inspect_ai/log/_message.py +4 -2
  315. inspect_ai/log/_recorders/file.py +4 -0
  316. inspect_ai/log/_recorders/json.py +5 -7
  317. inspect_ai/log/_recorders/recorder.py +3 -0
  318. inspect_ai/log/_transcript.py +19 -8
  319. inspect_ai/model/__init__.py +2 -0
  320. inspect_ai/model/_cache.py +39 -21
  321. inspect_ai/model/_call_tools.py +4 -3
  322. inspect_ai/model/_chat_message.py +14 -4
  323. inspect_ai/model/_generate_config.py +1 -1
  324. inspect_ai/model/_model.py +31 -24
  325. inspect_ai/model/_model_output.py +14 -1
  326. inspect_ai/model/_openai.py +10 -18
  327. inspect_ai/model/_providers/anthropic.py +3 -3
  328. inspect_ai/model/_providers/google.py +9 -5
  329. inspect_ai/model/_providers/openai.py +5 -9
  330. inspect_ai/model/_providers/openai_o1.py +3 -5
  331. inspect_ai/model/_providers/openrouter.py +86 -0
  332. inspect_ai/model/_providers/providers.py +11 -0
  333. inspect_ai/scorer/__init__.py +6 -1
  334. inspect_ai/scorer/_answer.py +7 -7
  335. inspect_ai/scorer/_classification.py +38 -18
  336. inspect_ai/scorer/_common.py +2 -8
  337. inspect_ai/scorer/_match.py +4 -5
  338. inspect_ai/scorer/_metric.py +87 -28
  339. inspect_ai/scorer/_metrics/__init__.py +3 -3
  340. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  341. inspect_ai/scorer/_metrics/mean.py +3 -17
  342. inspect_ai/scorer/_metrics/std.py +111 -30
  343. inspect_ai/scorer/_model.py +12 -12
  344. inspect_ai/scorer/_pattern.py +3 -3
  345. inspect_ai/scorer/_reducer/reducer.py +36 -21
  346. inspect_ai/scorer/_reducer/registry.py +2 -2
  347. inspect_ai/scorer/_reducer/types.py +7 -1
  348. inspect_ai/scorer/_score.py +11 -1
  349. inspect_ai/scorer/_scorer.py +110 -16
  350. inspect_ai/solver/__init__.py +1 -1
  351. inspect_ai/solver/_basic_agent.py +19 -22
  352. inspect_ai/solver/_bridge/__init__.py +0 -3
  353. inspect_ai/solver/_bridge/bridge.py +3 -3
  354. inspect_ai/solver/_chain.py +1 -2
  355. inspect_ai/solver/_critique.py +3 -3
  356. inspect_ai/solver/_fork.py +2 -2
  357. inspect_ai/solver/_human_agent/__init__.py +0 -0
  358. inspect_ai/solver/_human_agent/agent.py +5 -8
  359. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  360. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  361. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  362. inspect_ai/solver/_multiple_choice.py +38 -26
  363. inspect_ai/solver/_prompt.py +7 -7
  364. inspect_ai/solver/_solver.py +53 -52
  365. inspect_ai/solver/_task_state.py +80 -69
  366. inspect_ai/solver/_use_tools.py +9 -9
  367. inspect_ai/tool/__init__.py +4 -1
  368. inspect_ai/tool/_tool.py +43 -14
  369. inspect_ai/tool/_tool_call.py +6 -2
  370. inspect_ai/tool/_tool_choice.py +3 -1
  371. inspect_ai/tool/_tool_def.py +10 -8
  372. inspect_ai/tool/_tool_params.py +24 -0
  373. inspect_ai/tool/_tool_with.py +7 -7
  374. inspect_ai/tool/_tools/__init__.py +0 -0
  375. inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
  376. inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
  377. inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
  378. inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
  379. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  380. inspect_ai/tool/_tools/_execute.py +23 -11
  381. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  382. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  383. inspect_ai/tool/_tools/_web_search.py +7 -5
  384. inspect_ai/tool/beta.py +3 -0
  385. inspect_ai/util/_concurrency.py +3 -3
  386. inspect_ai/util/_panel.py +2 -0
  387. inspect_ai/util/_resource.py +12 -12
  388. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  389. inspect_ai/util/_sandbox/docker/config.py +2 -1
  390. inspect_ai/util/_sandbox/docker/docker.py +42 -86
  391. inspect_ai/util/_sandbox/docker/service.py +100 -0
  392. inspect_ai/util/_sandbox/environment.py +99 -96
  393. inspect_ai/util/_sandbox/self_check.py +124 -16
  394. inspect_ai/util/_subprocess.py +5 -3
  395. inspect_ai/util/_subtask.py +15 -16
  396. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
  397. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
  398. inspect_ai-0.3.64.dist-info/RECORD +625 -0
  399. inspect_ai/_view/www/src/Register.mjs +0 -3
  400. inspect_ai/_view/www/src/Types.mjs +0 -38
  401. inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
  402. inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
  403. inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
  404. inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
  405. inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
  406. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
  407. inspect_ai/_view/www/src/components/Card.mjs +0 -126
  408. inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
  409. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
  410. inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
  411. inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
  412. inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
  413. inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
  414. inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
  415. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
  416. inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
  417. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
  418. inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
  419. inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
  420. inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
  421. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
  422. inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
  423. inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
  424. inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
  425. inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
  426. inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
  427. inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
  428. inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
  429. inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
  430. inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
  431. inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
  432. inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
  433. inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
  434. inspect_ai/_view/www/src/components/Tools.mjs +0 -376
  435. inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
  436. inspect_ai/_view/www/src/components/ansi-output.js +0 -932
  437. inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
  438. inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
  439. inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
  440. inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
  441. inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
  442. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
  443. inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
  444. inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
  445. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
  446. inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
  447. inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
  448. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
  449. inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
  450. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
  451. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
  452. inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
  453. inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
  454. inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
  455. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
  456. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
  457. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
  458. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
  459. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
  460. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
  461. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
  462. inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
  463. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
  464. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
  465. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
  466. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
  467. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
  468. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
  469. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
  470. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
  471. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
  472. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
  473. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
  474. inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
  475. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
  476. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
  477. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
  478. inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
  479. inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
  480. inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
  481. inspect_ai/_view/www/src/utils/Format.mjs +0 -260
  482. inspect_ai/_view/www/src/utils/Git.mjs +0 -12
  483. inspect_ai/_view/www/src/utils/Html.mjs +0 -21
  484. inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
  485. inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
  486. inspect_ai/_view/www/src/utils/http.mjs +0 -18
  487. inspect_ai/_view/www/src/utils/queue.mjs +0 -67
  488. inspect_ai/_view/www/src/utils/sync.mjs +0 -101
  489. inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
  490. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
  491. inspect_ai/tool/beta/__init__.py +0 -5
  492. inspect_ai-0.3.62.dist-info/RECORD +0 -481
  493. /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
  494. /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
  495. /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
  496. /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
  497. /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
  498. /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
  499. /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
  500. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
  501. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
  502. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
  503. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
  504. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
  505. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  506. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
  507. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
  508. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
  509. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
  510. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
  511. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
  512. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
  513. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
  514. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
  515. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
  516. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
  517. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
  518. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
1
+ from dataclasses import dataclass, field
1
2
  from logging import getLogger
2
3
  from typing import (
3
4
  Any,
@@ -12,11 +13,15 @@ from typing import (
12
13
 
13
14
  from pydantic import BaseModel, Field
14
15
 
16
+ from inspect_ai._util.error import PrerequisiteError
15
17
  from inspect_ai._util.registry import (
16
18
  RegistryInfo,
19
+ is_registry_object,
17
20
  registry_add,
18
21
  registry_create,
22
+ registry_info,
19
23
  registry_name,
24
+ registry_params,
20
25
  registry_tag,
21
26
  )
22
27
 
@@ -43,19 +48,12 @@ Value = Union[
43
48
  """Value provided by a score.
44
49
 
45
50
  Use the methods of `Score` to easily treat
46
- the Value as a simple scalar of various types.
51
+ the `Value` as a simple scalar of various types.
47
52
  """
48
53
 
49
54
 
50
55
  class Score(BaseModel):
51
- """Score generated by a scorer.
52
-
53
- Args:
54
- value (Value): Score value.
55
- answer (str | None): Answer extracted from model output (optional).
56
- explanation (str | None): Explanation of score (optional).
57
- metadata (dict[str,Any]): Additional metadata related to the score.
58
- """
56
+ """Score generated by a scorer."""
59
57
 
60
58
  value: Value
61
59
  """Score value."""
@@ -112,12 +110,7 @@ class Score(BaseModel):
112
110
 
113
111
 
114
112
  class SampleScore(BaseModel):
115
- """Score for a Sample
116
-
117
- Args:
118
- score: Score
119
- sample_id: (str | int | None) Unique id of a sample
120
- """
113
+ """Score for a Sample."""
121
114
 
122
115
  score: Score
123
116
  """A score"""
@@ -125,6 +118,9 @@ class SampleScore(BaseModel):
125
118
  sample_id: str | int | None = Field(default=None)
126
119
  """A sample id"""
127
120
 
121
+ sample_metadata: dict[str, Any] | None = Field(default=None)
122
+ """Metadata from the sample"""
123
+
128
124
  scorer: str | None = Field(default=None)
129
125
  """Registry name of scorer that created this score."""
130
126
 
@@ -188,22 +184,57 @@ def value_to_float(
188
184
 
189
185
 
190
186
  @runtime_checkable
191
- class Metric(Protocol):
192
- r"""Evaluate scores using a metric.
193
-
194
- Args:
195
- scores (list[Score]): List of scores.
187
+ class MetricDeprecated(Protocol):
188
+ def __call__(self, scores: list[Score]) -> Value: ...
196
189
 
197
- Returns:
198
- Metric value
199
- """
200
190
 
201
- def __call__(self, scores: list[Score]) -> Value: ...
191
+ @runtime_checkable
192
+ class MetricProtocol(Protocol):
193
+ def __call__(self, scores: list[SampleScore]) -> Value:
194
+ r"""Compute a metric on a list of scores.
195
+
196
+ Args:
197
+ scores: List of scores.
198
+
199
+ Returns:
200
+ Metric value
201
+
202
+ Examples:
203
+ ```python
204
+ @metric
205
+ def mean() -> Metric:
206
+ def metric(scores: list[SampleScore]) -> Value:
207
+ return np.mean([score.score.as_float() for score in scores]).item()
208
+ return metric
209
+ ```
210
+ """
211
+ ...
212
+
213
+
214
+ Metric = MetricProtocol | MetricDeprecated
215
+ """Metric protocol.
216
+
217
+ The Metric signature changed in release v0.3.64. Both
218
+ the previous and new signatures are supported -- you
219
+ should use `MetricProtocol` for new code as the
220
+ deprecated signature will eventually be removed.
221
+ """
202
222
 
203
223
 
204
224
  P = ParamSpec("P")
205
225
 
206
226
 
227
+ @dataclass(frozen=True)
228
+ class MetricSpec:
229
+ """Metric specification used to (re-)create metrics."""
230
+
231
+ metric: str
232
+ """Metric name"""
233
+
234
+ args: dict[str, Any] = field(default_factory=dict)
235
+ """Metric arguments."""
236
+
237
+
207
238
  def metric_register(metric: Callable[P, Metric], name: str = "") -> Callable[P, Metric]:
208
239
  r"""Register a function or class as a metric.
209
240
 
@@ -237,6 +268,26 @@ def metric_create(name: str, **kwargs: Any) -> Metric:
237
268
  return cast(Metric, registry_create("metric", name, **kwargs))
238
269
 
239
270
 
271
+ def to_metric_specs(
272
+ metrics: list[Metric] | dict[str, list[Metric]],
273
+ ) -> list[MetricSpec] | dict[str, list[MetricSpec]]:
274
+ if isinstance(metrics, list):
275
+ return [as_metric_spec(m) for m in metrics]
276
+ else:
277
+ return {
278
+ k: [as_metric_spec(v) for v in metric_list]
279
+ for k, metric_list in metrics.items()
280
+ }
281
+
282
+
283
+ def as_metric_spec(metric: Metric) -> MetricSpec:
284
+ if not is_registry_object(metric):
285
+ raise PrerequisiteError(
286
+ f"The metric {getattr(metric, '__name__', '<unknown>')} was not created by a function decorated with @metric so cannot be recorded."
287
+ )
288
+ return MetricSpec(metric=registry_info(metric).name, args=registry_params(metric))
289
+
290
+
240
291
  @overload
241
292
  def metric(name: str) -> Callable[[Callable[P, Metric]], Callable[P, Metric]]: ...
242
293
 
@@ -252,10 +303,18 @@ def metric(
252
303
  r"""Decorator for registering metrics.
253
304
 
254
305
  Args:
255
- name: (str | MetricType):
256
- Optional name for metric. If the decorator has no name
257
- argument then the name of the underlying MetricType
258
- will be used to automatically assign a name.
306
+ name: Optional name for metric. If the decorator has no name
307
+ argument then the name of the underlying MetricType
308
+ will be used to automatically assign a name.
309
+
310
+ Examples:
311
+ ```python
312
+ @metric
313
+ def mean() -> Metric:
314
+ def metric(scores: list[SampleScore]) -> Value:
315
+ return np.mean([score.score.as_float() for score in scores]).item()
316
+ return metric
317
+ ```
259
318
  """
260
319
 
261
320
  # create_metric_wrapper:
@@ -1,12 +1,12 @@
1
1
  from .accuracy import accuracy
2
- from .mean import mean, var
3
- from .std import bootstrap_stderr, std, stderr
2
+ from .mean import mean
3
+ from .std import bootstrap_stderr, std, stderr, var
4
4
 
5
5
  __all__ = [
6
6
  "accuracy",
7
7
  "mean",
8
- "var",
9
8
  "bootstrap_stderr",
10
9
  "std",
11
10
  "stderr",
11
+ "var",
12
12
  ]
@@ -2,7 +2,7 @@ from logging import getLogger
2
2
 
3
3
  from .._metric import (
4
4
  Metric,
5
- Score,
5
+ SampleScore,
6
6
  ValueToFloat,
7
7
  metric,
8
8
  value_to_float,
@@ -16,22 +16,20 @@ def accuracy(to_float: ValueToFloat = value_to_float()) -> Metric:
16
16
  r"""Compute proportion of total answers which are correct.
17
17
 
18
18
  Args:
19
- to_float (ValueToFloat): Function for mapping
20
- Value to float for computing metrics. The default
21
- `value_to_float()` maps CORRECT ("C") to 1.0,
22
- INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and
23
- NOANSWER ("N") to 0, casts numeric values to
24
- float directly, and prints a warning and returns
25
- 0 if the Value is a complex object (list or dict).
19
+ to_float: Function for mapping `Value` to float for computing
20
+ metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0,
21
+ INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0,
22
+ casts numeric values to float directly, and prints a warning and returns
23
+ 0 if the Value is a complex object (list or dict).
26
24
 
27
25
  Returns:
28
26
  Accuracy metric
29
27
  """
30
28
 
31
- def metric(scores: list[Score]) -> float:
29
+ def metric(scores: list[SampleScore]) -> float:
32
30
  total = 0.0
33
31
  for item in scores:
34
- total += to_float(item.value)
32
+ total += to_float(item.score.value)
35
33
  return total / float(len(scores))
36
34
 
37
35
  return metric
@@ -1,6 +1,6 @@
1
1
  import numpy as np
2
2
 
3
- from .._metric import Metric, Score, metric
3
+ from .._metric import Metric, SampleScore, metric
4
4
 
5
5
 
6
6
  @metric
@@ -11,21 +11,7 @@ def mean() -> Metric:
11
11
  mean metric
12
12
  """
13
13
 
14
- def metric(scores: list[Score]) -> float:
15
- return np.mean([score.as_float() for score in scores]).item()
16
-
17
- return metric
18
-
19
-
20
- @metric
21
- def var() -> Metric:
22
- """Compute variance over all scores.
23
-
24
- Returns:
25
- var metric
26
- """
27
-
28
- def metric(scores: list[Score]) -> float:
29
- return np.var([score.as_float() for score in scores]).item()
14
+ def metric(scores: list[SampleScore]) -> float:
15
+ return np.mean([score.score.as_float() for score in scores]).item()
30
16
 
31
17
  return metric
@@ -5,7 +5,7 @@ import numpy as np
5
5
 
6
6
  from .._metric import (
7
7
  Metric,
8
- Score,
8
+ SampleScore,
9
9
  ValueToFloat,
10
10
  metric,
11
11
  value_to_float,
@@ -21,21 +21,21 @@ def bootstrap_stderr(
21
21
  """Standard error of the mean using bootstrap.
22
22
 
23
23
  Args:
24
- num_samples (int): Number of bootstrap samples to take.
25
- to_float (ValueToFloat): Function for mapping
26
- Value to float for computing metrics. The default
27
- `value_to_float()` maps CORRECT ("C") to 1.0,
28
- INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and
29
- NOANSWER ("N") to 0, casts numeric values to
30
- float directly, and prints a warning and returns
31
- 0 if the Value is a complex object (list or dict).
24
+ num_samples: Number of bootstrap samples to take.
25
+ to_float: Function for mapping
26
+ Value to float for computing metrics. The default
27
+ `value_to_float()` maps CORRECT ("C") to 1.0,
28
+ INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and
29
+ NOANSWER ("N") to 0, casts numeric values to
30
+ float directly, and prints a warning and returns
31
+ 0 if the Value is a complex object (list or dict).
32
32
 
33
33
  Returns:
34
34
  bootstrap_stderr metric
35
35
  """
36
36
 
37
- def metric(scores: list[Score]) -> float:
38
- values = [to_float(score.value) for score in scores]
37
+ def metric(scores: list[SampleScore]) -> float:
38
+ values = [to_float(score.score.value) for score in scores]
39
39
  std = np.std(
40
40
  [
41
41
  np.mean(np.random.choice(values, len(values), replace=True))
@@ -48,24 +48,71 @@ def bootstrap_stderr(
48
48
 
49
49
 
50
50
  @metric
51
- def stderr(to_float: ValueToFloat = value_to_float()) -> Metric:
51
+ def stderr(
52
+ to_float: ValueToFloat = value_to_float(), cluster: str | None = None
53
+ ) -> Metric:
52
54
  """Standard error of the mean using Central Limit Theorem.
53
55
 
54
56
  Args:
55
- to_float (ValueToFloat): Function for mapping
56
- Value to float for computing metrics. The default
57
- `value_to_float()` maps CORRECT ("C") to 1.0,
58
- INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and
59
- NOANSWER ("N") to 0, casts numeric values to
60
- float directly, and prints a warning and returns
61
- 0 if the Value is a complex object (list or dict).
57
+ to_float: Function for mapping `Value` to float for computing
58
+ metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0,
59
+ INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0,
60
+ casts numeric values to float directly, and prints a warning and returns
61
+ 0 if the Value is a complex object (list or dict).
62
+ cluster (str | None): The key from the Sample metadata
63
+ corresponding to a cluster identifier for computing
64
+ [clustered standard errors](https://en.wikipedia.org/wiki/Clustered_standard_errors).
62
65
 
63
66
  Returns:
64
- stderr metric
67
+ stderr metric
65
68
  """
66
69
 
67
- def metric(scores: list[Score]) -> float:
68
- values = [to_float(score.value) for score in scores]
70
+ def clustered_metric(scores: list[SampleScore]) -> float:
71
+ """Computes a clustered standard error.
72
+
73
+ For details, see Appendix A of https://arxiv.org/pdf/2411.00640.
74
+ The version here uses a finite cluster correction (unlike the paper)
75
+ """
76
+ assert cluster is not None
77
+ cluster_list = []
78
+ value_list = []
79
+ for sample_score in scores:
80
+ if (
81
+ sample_score.sample_metadata is None
82
+ or cluster not in sample_score.sample_metadata
83
+ ):
84
+ raise ValueError(
85
+ f"Sample {sample_score.sample_id} has no cluster metadata. To compute `stderr` with clustering, each sample metadata must have a value for '{cluster}'"
86
+ )
87
+ cluster_list.append(sample_score.sample_metadata[cluster])
88
+ value_list.append(to_float(sample_score.score.value))
89
+ clusters = np.array(cluster_list)
90
+ values = np.array(value_list)
91
+ mean = float(np.mean(values))
92
+
93
+ # Convert to numpy arrays and get unique clusters
94
+ unique_clusters = np.unique(clusters)
95
+ cluster_count = len(unique_clusters)
96
+
97
+ # Compute clustered variance using NumPy operations
98
+ clustered_variance = 0.0
99
+ for cluster_id in unique_clusters:
100
+ # get a data vector for this cluster
101
+ cluster_data = values[clusters == cluster_id]
102
+ # this computes X' \Omega X = \sum_i \sum_j (s_{i,c} - mean) * (s_{j,c} - mean)
103
+ clustered_variance += np.outer(
104
+ cluster_data - mean, cluster_data - mean
105
+ ).sum()
106
+
107
+ # Multiply by C / (C - 1) to unbias the variance estimate
108
+ standard_error = np.sqrt(
109
+ clustered_variance * cluster_count / (cluster_count - 1)
110
+ ) / len(scores)
111
+
112
+ return cast(float, standard_error)
113
+
114
+ def metric(scores: list[SampleScore]) -> float:
115
+ values = [to_float(score.score.value) for score in scores]
69
116
  n = len(values)
70
117
 
71
118
  # standard deviation is calculated by dividing by n-ddof so ensure
@@ -81,6 +128,9 @@ def stderr(to_float: ValueToFloat = value_to_float()) -> Metric:
81
128
 
82
129
  return cast(float, standard_error)
83
130
 
131
+ if cluster is not None:
132
+ return clustered_metric
133
+
84
134
  return metric
85
135
 
86
136
 
@@ -88,6 +138,39 @@ def stderr(to_float: ValueToFloat = value_to_float()) -> Metric:
88
138
  def std(to_float: ValueToFloat = value_to_float()) -> Metric:
89
139
  """Calculates the sample standard deviation of a list of scores.
90
140
 
141
+ Args:
142
+ to_float: Function for mapping `Value` to float for computing
143
+ metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0,
144
+ INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0,
145
+ casts numeric values to float directly, and prints a warning and returns
146
+ 0 if the Value is a complex object (list or dict).
147
+
148
+
149
+ Returns:
150
+ std metric
151
+ """
152
+
153
+ def metric(scores: list[SampleScore]) -> float:
154
+ values = [to_float(score.score.value) for score in scores]
155
+ n = len(values)
156
+
157
+ # standard deviation is calculated by dividing by n-ddof so ensure
158
+ # that we won't divide by zero
159
+ if (n - 1) < 1:
160
+ return 0
161
+
162
+ # Calculate the sample standard deviation
163
+ sample_std = np.std(values, ddof=1)
164
+
165
+ return cast(float, sample_std)
166
+
167
+ return metric
168
+
169
+
170
+ @metric
171
+ def var(to_float: ValueToFloat = value_to_float()) -> Metric:
172
+ """Compute the sample variance of a list of scores.
173
+
91
174
  Args:
92
175
  to_float (ValueToFloat): Function for mapping
93
176
  Value to float for computing metrics. The default
@@ -98,21 +181,19 @@ def std(to_float: ValueToFloat = value_to_float()) -> Metric:
98
181
  0 if the Value is a complex object (list or dict).
99
182
 
100
183
  Returns:
101
- std metric
184
+ var metric
102
185
  """
103
186
 
104
- def metric(scores: list[Score]) -> float:
105
- values = [to_float(score.value) for score in scores]
187
+ def metric(scores: list[SampleScore]) -> float:
188
+ values = [to_float(score.score.value) for score in scores]
106
189
  n = len(values)
107
-
108
- # standard deviation is calculated by dividing by n-ddof so ensure
190
+ # variance is calculated by dividing by n-ddof so ensure
109
191
  # that we won't divide by zero
110
192
  if (n - 1) < 1:
111
193
  return 0
112
194
 
113
- # Calculate the sample standard deviation
114
- sample_std = np.std(values, ddof=1)
195
+ variance = np.var(values, ddof=1)
115
196
 
116
- return cast(float, sample_std)
197
+ return cast(float, variance)
117
198
 
118
199
  return metric
@@ -35,31 +35,31 @@ def model_graded_fact(
35
35
  """Score a question/answer task with a fact response using a model.
36
36
 
37
37
  Args:
38
- template (str): Template for grading prompt. This template uses
38
+ template: Template for grading prompt. This template uses
39
39
  four variables: `question`, `criterion`, `answer`, and
40
40
  `instructions` (which is fed from the `instructions` parameter).
41
41
  Variables from sample `metadata` are also available in the template.
42
- instructions (str): Grading instructions. This should
42
+ instructions: Grading instructions. This should
43
43
  include a prompt for the model to answer (e.g. with
44
44
  chain of thought reasoning) in a way that matches
45
45
  the specified `grade_pattern`, for example, the default
46
46
  `grade_pattern` looks for one of GRADE: C, GRADE: P, or
47
47
  GRADE: I).
48
- grade_pattern (str): Regex to extract the grade from the
48
+ grade_pattern: Regex to extract the grade from the
49
49
  model response. Defaults to looking for e.g. GRADE: C
50
50
  The regex should have a single capture group that
51
51
  extracts exactly the letter C, P, or I.
52
- include_history (bool | Callable[[TaskState], str]):
52
+ include_history:
53
53
  Whether to include the full chat history in the presented
54
54
  question. Defaults to `False`, which presents only the
55
55
  original sample input. Optionally provide a function to
56
56
  customise how the chat history is presented.
57
- partial_credit (bool): Whether to allow for "partial" credit for
57
+ partial_credit: Whether to allow for "partial" credit for
58
58
  answers (by default assigned a score of 0.5). Defaults
59
59
  to `False`. Note that this parameter is only used
60
60
  with the default `instructions` (as custom instructions
61
61
  provide their own prompts for grades).
62
- model (list[str | Model] | str | Model | None): Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
62
+ model: Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
63
63
  """
64
64
  return model_graded_qa(
65
65
  template=template if template else DEFAULT_MODEL_GRADED_FACT_TEMPLATE,
@@ -83,32 +83,32 @@ def model_graded_qa(
83
83
  """Score a question/answer task using a model.
84
84
 
85
85
  Args:
86
- template (str): Template for grading prompt. This template has
86
+ template: Template for grading prompt. This template has
87
87
  four variables:
88
88
  - `question`, `criterion`, `answer`, and
89
89
  `instructions` (which is fed from the `instructions` parameter).
90
90
  Variables from sample `metadata` are also available in the template.
91
- instructions (str): Grading instructions. This should
91
+ instructions: Grading instructions. This should
92
92
  include a prompt for the model to answer (e.g. with
93
93
  chain of thought reasoning) in a way that matches
94
94
  the specified `grade_pattern`, for example, the default
95
95
  `grade_pattern` looks for one of GRADE: C, GRADE: P, or
96
96
  GRADE: I.
97
- grade_pattern (str): Regex to extract the grade from the
97
+ grade_pattern: Regex to extract the grade from the
98
98
  model response. Defaults to looking for e.g. GRADE: C
99
99
  The regex should have a single capture group that
100
100
  extracts exactly the letter C, P, I.
101
- include_history (bool | Callable[[TaskState], str]):
101
+ include_history:
102
102
  Whether to include the full chat history in the presented
103
103
  question. Defaults to `False`, which presents only the
104
104
  original sample input. Optionally provide a function to
105
105
  customise how the chat history is presented.
106
- partial_credit (bool): Whether to allow for "partial" credit for
106
+ partial_credit: Whether to allow for "partial" credit for
107
107
  answers (by default assigned a score of 0.5). Defaults
108
108
  to `False`. Note that this parameter is only used
109
109
  with the default `instructions` (as custom instructions
110
110
  provide their own prompts for grades).
111
- model (list[str | Model] | str | Model | None): Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
111
+ model: Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used.
112
112
  """
113
113
  # bind variables
114
114
  get_scorer = partial(
@@ -55,11 +55,11 @@ def pattern(pattern: str, ignore_case: bool = True, match_all: bool = False) ->
55
55
  to match either one or all of the extracted groups
56
56
 
57
57
  Args:
58
- pattern (str): Regular expression for extracting the
58
+ pattern: Regular expression for extracting the
59
59
  answer from model output.
60
- ignore_case (bool): Ignore case when comparing
60
+ ignore_case: Ignore case when comparing
61
61
  the extracted answer to the targets. (Default: True)
62
- match_all (bool): With multiple captures, do all captured
62
+ match_all: With multiple captures, do all captured
63
63
  values need to match the target? (Default: False)
64
64
  """
65
65
 
@@ -12,6 +12,8 @@ from .types import ScoreReducer
12
12
 
13
13
  @score_reducer(name="mode")
14
14
  def mode_score() -> ScoreReducer:
15
+ r"""Take the mode from a list of scores."""
16
+
15
17
  def reduce(scores: list[Score]) -> Score:
16
18
  r"""A utility function for the most common score in a list of scores.
17
19
 
@@ -36,12 +38,13 @@ def mode_score() -> ScoreReducer:
36
38
 
37
39
  @score_reducer(name="mean")
38
40
  def mean_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
39
- def reduce(scores: list[Score]) -> Score:
40
- r"""A utility function for taking a mean value over a list of scores.
41
+ r"""Take the mean of a list of scores.
41
42
 
42
- Args:
43
- scores: a list of Scores.
44
- """
43
+ Args:
44
+ value_to_float: Function to convert the value to a float
45
+ """
46
+
47
+ def reduce(scores: list[Score]) -> Score:
45
48
  if isinstance(scores[0].value, dict):
46
49
  return _compute_dict_stat(scores, value_to_float, statistics.mean)
47
50
  elif isinstance(scores[0].value, list):
@@ -54,12 +57,13 @@ def mean_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
54
57
 
55
58
  @score_reducer(name="median")
56
59
  def median_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
57
- def reduce(scores: list[Score]) -> Score:
58
- r"""A utility function for taking a median value over a list of scores.
60
+ r"""Take the median value from a list of scores.
59
61
 
60
- Args:
61
- scores: a list of Scores.
62
- """
62
+ Args:
63
+ value_to_float: Function to convert the value to a float
64
+ """
65
+
66
+ def reduce(scores: list[Score]) -> Score:
63
67
  if isinstance(scores[0].value, dict):
64
68
  return _compute_dict_stat(scores, value_to_float, statistics.median)
65
69
  elif isinstance(scores[0].value, list):
@@ -74,13 +78,15 @@ def median_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReduce
74
78
  def at_least(
75
79
  k: int, value: float = 1.0, value_to_float: ValueToFloat = value_to_float()
76
80
  ) -> ScoreReducer:
77
- def reduce(scores: list[Score]) -> Score:
78
- r"""A utility function for scoring a value as correct if there are at least n score values greater than or equal to the value
81
+ r"""Score correct if there are at least k score values greater than or equal to the value.
79
82
 
80
- Args:
81
- scores: a list of Scores.
82
- """
83
+ Args:
84
+ k: Number of score values that must exceed `value`.
85
+ value: Score value threshold.
86
+ value_to_float: Function to convert score values to float.
87
+ """
83
88
 
89
+ def reduce(scores: list[Score]) -> Score:
84
90
  def gte_n(
85
91
  counter: Counter[str | int | float | bool],
86
92
  ) -> str | int | float | bool:
@@ -104,6 +110,14 @@ def at_least(
104
110
  def pass_at(
105
111
  k: int, value: float = 1.0, value_to_float: ValueToFloat = value_to_float()
106
112
  ) -> ScoreReducer:
113
+ r"""Probability of at least 1 correct sample given `k` epochs (<https://arxiv.org/pdf/2107.03374>).
114
+
115
+ Args:
116
+ k: Epochs to compute probability for.
117
+ value: Score value threshold.
118
+ value_to_float: Function to convert score values to float.
119
+ """
120
+
107
121
  def reduce(scores: list[Score]) -> Score:
108
122
  def pass_at_k(values: list[float]) -> float:
109
123
  total = len(scores)
@@ -129,12 +143,13 @@ def pass_at(
129
143
 
130
144
  @score_reducer(name="max")
131
145
  def max_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer:
132
- def reduce(scores: list[Score]) -> Score:
133
- r"""A utility function for taking the maximum value from a list of scores
146
+ r"""Take the maximum value from a list of scores.
134
147
 
135
- Args:
136
- scores: a list of Scores.
137
- """
148
+ Args:
149
+ value_to_float: Function to convert the value to a float
150
+ """
151
+
152
+ def reduce(scores: list[Score]) -> Score:
138
153
  if isinstance(scores[0].value, dict):
139
154
  dict_result: dict[str, str | int | float | bool | None] = {}
140
155
  keys = scores[0].value.keys() # type: ignore
@@ -238,7 +253,7 @@ def _compute_dict_stat(
238
253
 
239
254
  Args:
240
255
  scores: a list of Scores.
241
- value_to_float: function to convert the value to a float
256
+ value_to_float: Function to convert the value to a float
242
257
  statistic: the statistic to apply
243
258
  """
244
259
  # Make sure these are all dictionaries before we proceed