inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (518) hide show
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/main.py +1 -1
  8. inspect_ai/_cli/sandbox.py +4 -1
  9. inspect_ai/_cli/score.py +181 -32
  10. inspect_ai/_cli/trace.py +10 -0
  11. inspect_ai/_cli/view.py +4 -2
  12. inspect_ai/_display/core/active.py +2 -3
  13. inspect_ai/_display/core/config.py +7 -1
  14. inspect_ai/_display/textual/widgets/samples.py +4 -3
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/eval.py +104 -101
  17. inspect_ai/_eval/evalset.py +75 -75
  18. inspect_ai/_eval/loader.py +122 -12
  19. inspect_ai/_eval/registry.py +1 -1
  20. inspect_ai/_eval/run.py +14 -0
  21. inspect_ai/_eval/score.py +125 -36
  22. inspect_ai/_eval/task/log.py +105 -4
  23. inspect_ai/_eval/task/results.py +92 -38
  24. inspect_ai/_eval/task/run.py +9 -2
  25. inspect_ai/_eval/task/sandbox.py +35 -2
  26. inspect_ai/_eval/task/task.py +49 -46
  27. inspect_ai/_util/constants.py +1 -1
  28. inspect_ai/_util/content.py +8 -0
  29. inspect_ai/_util/error.py +2 -0
  30. inspect_ai/_util/file.py +15 -1
  31. inspect_ai/_util/hash.py +1 -1
  32. inspect_ai/_util/logger.py +4 -2
  33. inspect_ai/_util/registry.py +7 -1
  34. inspect_ai/_view/view.py +1 -2
  35. inspect_ai/_view/www/.vscode/extensions.json +3 -0
  36. inspect_ai/_view/www/.vscode/settings.json +8 -0
  37. inspect_ai/_view/www/App.css +97 -29
  38. inspect_ai/_view/www/README.md +1 -1
  39. inspect_ai/_view/www/dist/assets/index.css +16663 -14674
  40. inspect_ai/_view/www/dist/assets/index.js +58808 -51348
  41. inspect_ai/_view/www/dist/index.html +1 -1
  42. inspect_ai/_view/www/index.html +2 -2
  43. inspect_ai/_view/www/log-schema.json +87 -73
  44. inspect_ai/_view/www/package.json +22 -4
  45. inspect_ai/_view/www/postcss.config.cjs +8 -9
  46. inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
  47. inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
  48. inspect_ai/_view/www/src/api/api-browser.ts +2 -2
  49. inspect_ai/_view/www/src/api/api-http.ts +3 -5
  50. inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
  51. inspect_ai/_view/www/src/api/client-api.ts +4 -4
  52. inspect_ai/_view/www/src/api/index.ts +4 -4
  53. inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
  54. inspect_ai/_view/www/src/appearance/colors.ts +9 -0
  55. inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
  56. inspect_ai/_view/www/src/appearance/icons.ts +100 -0
  57. inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
  58. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
  59. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
  60. inspect_ai/_view/www/src/components/Card.css +60 -0
  61. inspect_ai/_view/www/src/components/Card.tsx +109 -0
  62. inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
  63. inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
  64. inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
  65. inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
  66. inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
  67. inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
  68. inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
  69. inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
  70. inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
  71. inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
  72. inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
  73. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
  74. inspect_ai/_view/www/src/components/FindBand.css +49 -0
  75. inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
  76. inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
  77. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
  78. inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
  79. inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
  80. inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
  81. inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
  82. inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
  83. inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
  84. inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
  85. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
  86. inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
  87. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
  88. inspect_ai/_view/www/src/components/MessageBand.css +43 -0
  89. inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
  90. inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
  91. inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
  92. inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
  93. inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
  94. inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
  95. inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
  96. inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
  97. inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
  98. inspect_ai/_view/www/src/components/ToolButton.css +3 -0
  99. inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
  100. inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
  101. inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
  102. inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
  103. inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
  104. inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
  105. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
  106. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
  107. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
  108. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
  109. inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
  110. inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
  111. inspect_ai/_view/www/src/metadata/types.ts +18 -0
  112. inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
  113. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
  114. inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
  115. inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
  116. inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
  117. inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
  118. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
  119. inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
  120. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
  121. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
  122. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
  123. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
  124. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
  125. inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
  126. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
  127. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
  128. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
  129. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
  130. inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
  131. inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
  132. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
  133. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
  134. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
  135. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
  136. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
  137. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
  138. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
  139. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
  140. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
  141. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
  142. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
  143. inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
  144. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
  145. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
  146. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
  147. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
  148. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
  149. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
  150. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
  151. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
  152. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
  153. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
  154. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
  155. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
  156. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
  157. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
  158. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
  159. inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
  160. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
  161. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
  162. inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
  163. inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
  164. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
  165. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
  166. inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
  167. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
  168. inspect_ai/_view/www/src/samples/error/error.ts +15 -0
  169. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
  170. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
  171. inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
  172. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
  173. inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
  174. inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
  175. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
  176. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
  177. inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
  178. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
  179. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
  180. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
  181. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
  182. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
  183. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
  184. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
  185. inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
  186. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
  187. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
  188. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
  189. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
  190. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
  191. inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
  192. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
  193. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
  194. inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
  195. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
  196. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
  197. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
  198. inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
  199. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
  200. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
  201. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
  202. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
  203. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
  204. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
  205. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
  206. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
  207. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
  208. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
  209. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
  210. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
  211. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
  212. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
  213. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
  214. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
  215. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
  216. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
  217. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
  218. inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
  219. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
  220. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
  221. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
  222. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
  223. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
  224. inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
  225. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
  226. inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
  227. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
  228. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
  229. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
  230. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
  231. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
  232. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
  233. inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
  234. inspect_ai/_view/www/src/types/log.d.ts +108 -19
  235. inspect_ai/_view/www/src/types/prism.d.ts +11 -0
  236. inspect_ai/_view/www/src/types.ts +71 -0
  237. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
  238. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
  239. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
  240. inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
  241. inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
  242. inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
  243. inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
  244. inspect_ai/_view/www/src/utils/attachments.ts +42 -0
  245. inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
  246. inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
  247. inspect_ai/_view/www/src/utils/debugging.ts +28 -0
  248. inspect_ai/_view/www/src/utils/dom.ts +30 -0
  249. inspect_ai/_view/www/src/utils/format.ts +194 -0
  250. inspect_ai/_view/www/src/utils/git.ts +7 -0
  251. inspect_ai/_view/www/src/utils/html.ts +6 -0
  252. inspect_ai/_view/www/src/utils/http.ts +14 -0
  253. inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
  254. inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
  255. inspect_ai/_view/www/src/utils/queue.ts +51 -0
  256. inspect_ai/_view/www/src/utils/sync.ts +114 -0
  257. inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
  258. inspect_ai/_view/www/src/utils/vscode.ts +13 -0
  259. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
  260. inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
  261. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
  262. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
  263. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
  264. inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
  265. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
  266. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
  267. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
  268. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
  269. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
  270. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
  271. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
  272. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
  273. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
  274. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
  275. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
  276. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
  277. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
  278. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
  279. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
  280. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
  281. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
  282. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
  283. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
  284. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
  285. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
  286. inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
  287. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
  288. inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
  289. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
  290. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
  291. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
  292. inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
  293. inspect_ai/_view/www/src/workspace/types.ts +10 -0
  294. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  295. inspect_ai/_view/www/tsconfig.json +23 -9
  296. inspect_ai/_view/www/vite.config.js +8 -17
  297. inspect_ai/_view/www/yarn.lock +627 -556
  298. inspect_ai/approval/_approval.py +2 -0
  299. inspect_ai/approval/_approver.py +4 -4
  300. inspect_ai/approval/_auto.py +1 -1
  301. inspect_ai/approval/_human/approver.py +3 -0
  302. inspect_ai/approval/_policy.py +5 -0
  303. inspect_ai/approval/_registry.py +2 -2
  304. inspect_ai/dataset/_dataset.py +64 -37
  305. inspect_ai/dataset/_sources/__init__.py +0 -0
  306. inspect_ai/dataset/_sources/csv.py +20 -12
  307. inspect_ai/dataset/_sources/file.py +4 -0
  308. inspect_ai/dataset/_sources/hf.py +39 -29
  309. inspect_ai/dataset/_sources/json.py +17 -9
  310. inspect_ai/log/__init__.py +2 -0
  311. inspect_ai/log/_convert.py +3 -3
  312. inspect_ai/log/_file.py +24 -9
  313. inspect_ai/log/_log.py +101 -13
  314. inspect_ai/log/_message.py +4 -2
  315. inspect_ai/log/_recorders/file.py +4 -0
  316. inspect_ai/log/_recorders/json.py +5 -7
  317. inspect_ai/log/_recorders/recorder.py +3 -0
  318. inspect_ai/log/_transcript.py +19 -8
  319. inspect_ai/model/__init__.py +2 -0
  320. inspect_ai/model/_cache.py +39 -21
  321. inspect_ai/model/_call_tools.py +4 -3
  322. inspect_ai/model/_chat_message.py +14 -4
  323. inspect_ai/model/_generate_config.py +1 -1
  324. inspect_ai/model/_model.py +31 -24
  325. inspect_ai/model/_model_output.py +14 -1
  326. inspect_ai/model/_openai.py +10 -18
  327. inspect_ai/model/_providers/anthropic.py +3 -3
  328. inspect_ai/model/_providers/google.py +9 -5
  329. inspect_ai/model/_providers/openai.py +5 -9
  330. inspect_ai/model/_providers/openai_o1.py +3 -5
  331. inspect_ai/model/_providers/openrouter.py +86 -0
  332. inspect_ai/model/_providers/providers.py +11 -0
  333. inspect_ai/scorer/__init__.py +6 -1
  334. inspect_ai/scorer/_answer.py +7 -7
  335. inspect_ai/scorer/_classification.py +38 -18
  336. inspect_ai/scorer/_common.py +2 -8
  337. inspect_ai/scorer/_match.py +4 -5
  338. inspect_ai/scorer/_metric.py +87 -28
  339. inspect_ai/scorer/_metrics/__init__.py +3 -3
  340. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  341. inspect_ai/scorer/_metrics/mean.py +3 -17
  342. inspect_ai/scorer/_metrics/std.py +111 -30
  343. inspect_ai/scorer/_model.py +12 -12
  344. inspect_ai/scorer/_pattern.py +3 -3
  345. inspect_ai/scorer/_reducer/reducer.py +36 -21
  346. inspect_ai/scorer/_reducer/registry.py +2 -2
  347. inspect_ai/scorer/_reducer/types.py +7 -1
  348. inspect_ai/scorer/_score.py +11 -1
  349. inspect_ai/scorer/_scorer.py +110 -16
  350. inspect_ai/solver/__init__.py +1 -1
  351. inspect_ai/solver/_basic_agent.py +19 -22
  352. inspect_ai/solver/_bridge/__init__.py +0 -3
  353. inspect_ai/solver/_bridge/bridge.py +3 -3
  354. inspect_ai/solver/_chain.py +1 -2
  355. inspect_ai/solver/_critique.py +3 -3
  356. inspect_ai/solver/_fork.py +2 -2
  357. inspect_ai/solver/_human_agent/__init__.py +0 -0
  358. inspect_ai/solver/_human_agent/agent.py +5 -8
  359. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  360. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  361. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  362. inspect_ai/solver/_multiple_choice.py +38 -26
  363. inspect_ai/solver/_prompt.py +7 -7
  364. inspect_ai/solver/_solver.py +53 -52
  365. inspect_ai/solver/_task_state.py +80 -69
  366. inspect_ai/solver/_use_tools.py +9 -9
  367. inspect_ai/tool/__init__.py +4 -1
  368. inspect_ai/tool/_tool.py +43 -14
  369. inspect_ai/tool/_tool_call.py +6 -2
  370. inspect_ai/tool/_tool_choice.py +3 -1
  371. inspect_ai/tool/_tool_def.py +10 -8
  372. inspect_ai/tool/_tool_params.py +24 -0
  373. inspect_ai/tool/_tool_with.py +7 -7
  374. inspect_ai/tool/_tools/__init__.py +0 -0
  375. inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
  376. inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
  377. inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
  378. inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
  379. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  380. inspect_ai/tool/_tools/_execute.py +23 -11
  381. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  382. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  383. inspect_ai/tool/_tools/_web_search.py +7 -5
  384. inspect_ai/tool/beta.py +3 -0
  385. inspect_ai/util/_concurrency.py +3 -3
  386. inspect_ai/util/_panel.py +2 -0
  387. inspect_ai/util/_resource.py +12 -12
  388. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  389. inspect_ai/util/_sandbox/docker/config.py +2 -1
  390. inspect_ai/util/_sandbox/docker/docker.py +42 -86
  391. inspect_ai/util/_sandbox/docker/service.py +100 -0
  392. inspect_ai/util/_sandbox/environment.py +99 -96
  393. inspect_ai/util/_sandbox/self_check.py +124 -16
  394. inspect_ai/util/_subprocess.py +5 -3
  395. inspect_ai/util/_subtask.py +15 -16
  396. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
  397. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
  398. inspect_ai-0.3.64.dist-info/RECORD +625 -0
  399. inspect_ai/_view/www/src/Register.mjs +0 -3
  400. inspect_ai/_view/www/src/Types.mjs +0 -38
  401. inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
  402. inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
  403. inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
  404. inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
  405. inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
  406. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
  407. inspect_ai/_view/www/src/components/Card.mjs +0 -126
  408. inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
  409. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
  410. inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
  411. inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
  412. inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
  413. inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
  414. inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
  415. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
  416. inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
  417. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
  418. inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
  419. inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
  420. inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
  421. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
  422. inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
  423. inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
  424. inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
  425. inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
  426. inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
  427. inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
  428. inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
  429. inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
  430. inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
  431. inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
  432. inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
  433. inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
  434. inspect_ai/_view/www/src/components/Tools.mjs +0 -376
  435. inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
  436. inspect_ai/_view/www/src/components/ansi-output.js +0 -932
  437. inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
  438. inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
  439. inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
  440. inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
  441. inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
  442. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
  443. inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
  444. inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
  445. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
  446. inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
  447. inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
  448. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
  449. inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
  450. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
  451. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
  452. inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
  453. inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
  454. inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
  455. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
  456. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
  457. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
  458. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
  459. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
  460. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
  461. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
  462. inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
  463. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
  464. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
  465. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
  466. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
  467. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
  468. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
  469. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
  470. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
  471. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
  472. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
  473. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
  474. inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
  475. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
  476. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
  477. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
  478. inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
  479. inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
  480. inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
  481. inspect_ai/_view/www/src/utils/Format.mjs +0 -260
  482. inspect_ai/_view/www/src/utils/Git.mjs +0 -12
  483. inspect_ai/_view/www/src/utils/Html.mjs +0 -21
  484. inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
  485. inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
  486. inspect_ai/_view/www/src/utils/http.mjs +0 -18
  487. inspect_ai/_view/www/src/utils/queue.mjs +0 -67
  488. inspect_ai/_view/www/src/utils/sync.mjs +0 -101
  489. inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
  490. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
  491. inspect_ai/tool/beta/__init__.py +0 -5
  492. inspect_ai-0.3.62.dist-info/RECORD +0 -481
  493. /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
  494. /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
  495. /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
  496. /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
  497. /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
  498. /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
  499. /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
  500. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
  501. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
  502. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
  503. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
  504. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
  505. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  506. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
  507. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
  508. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
  509. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
  510. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
  511. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
  512. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
  513. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
  514. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
  515. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
  516. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
  517. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
  518. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/_eval/eval.py CHANGED
@@ -89,67 +89,67 @@ def eval(
89
89
  r"""Evaluate tasks using a Model.
90
90
 
91
91
  Args:
92
- tasks: (Tasks): Task(s) to evaluate. If None, attempt
92
+ tasks: Task(s) to evaluate. If None, attempt
93
93
  to evaluate a task in the current working directory
94
- model (str | Model | list[str] | list[Model] | None): Model(s) for
94
+ model: Model(s) for
95
95
  evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
96
96
  environment variable.
97
- model_base_url: (str | None): Base URL for communicating
97
+ model_base_url: Base URL for communicating
98
98
  with the model API.
99
- model_args (dict[str,Any] | str): Model creation args
99
+ model_args: Model creation args
100
100
  (as a dictionary or as a path to a JSON or YAML config file)
101
- task_args (dict[str,Any] | str): Task creation arguments
101
+ task_args: Task creation arguments
102
102
  (as a dictionary or as a path to a JSON or YAML config file)
103
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
104
- (or optionally a str or tuple with a shorthand spec)
105
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
106
- (defaults to True)
107
- solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s).
108
- Optional (uses task solver by default).
109
- tags (list[str] | None): Tags to associate with this evaluation run.
110
- trace (bool | None): Trace message interactions with evaluated model to terminal.
111
- display (DisplayType | None): Task display type (defaults to 'full').
112
- approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
113
- Either a path to an approval policy config file or a list of approval policies.
114
- Defaults to no approval policy.
115
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
116
- "info", "warning", "error", or "critical" (defaults to "warning")
117
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
118
- log_dir (str | None): Output path for logging results
119
- (defaults to file log in ./logs directory).
120
- log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
121
- to "eval", the native high-performance format).
122
- limit (int | tuple[int, int] | None): Limit evaluated samples
123
- (defaults to all samples).
124
- sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset.
125
- epochs (int | Epochs | None): Epochs to repeat samples for and optional score
126
- reducer function(s) used to combine sample scores (defaults to "mean")
127
- fail_on_error (bool | float | None): `True` to fail on first sample error
128
- (default); `False` to never fail on sample errors; Value between 0 and 1
129
- to fail if a proportion of total samples fails. Value greater than 1 to fail
130
- eval if a count of samples fails.
131
- debug_errors (bool | None): Raise task errors (rather than logging them)
132
- so they can be debugged (defaults to False).
133
- message_limit (int | None): Limit on total messages used for each sample.
134
- token_limit (int | None): Limit on total tokens used for each sample.
135
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
136
- max_samples (int | None): Maximum number of samples to run in parallel
137
- (default is max_connections)
138
- max_tasks (int | None): Maximum number of tasks to run in parallel
139
- (default is 1)
140
- max_subprocesses (int | None): Maximum number of subprocesses to
141
- run in parallel (default is os.cpu_count())
142
- max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
143
- to run in parallel.
144
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
145
- log_images: (bool | None): Log base64 encoded version of images,
146
- even if specified as a filename or URL (defaults to False)
147
- log_buffer: (int | None): Number of samples to buffer before writing log file.
148
- If not specified, an appropriate default for the format and filesystem is
149
- chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
150
- score (bool): Score output (defaults to True)
151
- score_display (bool | None): Show scoring metrics in realtime (defaults to True)
152
- **kwargs (GenerateConfigArgs): Model generation options.
103
+ sandbox: Sandbox environment type
104
+ (or optionally a str or tuple with a shorthand spec)
105
+ sandbox_cleanup: Cleanup sandbox environments after task completes
106
+ (defaults to True)
107
+ solver: Alternative solver for task(s).
108
+ Optional (uses task solver by default).
109
+ tags: Tags to associate with this evaluation run.
110
+ trace: Trace message interactions with evaluated model to terminal.
111
+ display: Task display type (defaults to 'full').
112
+ approval: Tool use approval policies.
113
+ Either a path to an approval policy config file or a list of approval policies.
114
+ Defaults to no approval policy.
115
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
116
+ "info", "warning", "error", or "critical" (defaults to "warning")
117
+ log_level_transcript: Level for logging to the log file (defaults to "info")
118
+ log_dir: Output path for logging results
119
+ (defaults to file log in ./logs directory).
120
+ log_format: Format for writing log files (defaults
121
+ to "eval", the native high-performance format).
122
+ limit: Limit evaluated samples
123
+ (defaults to all samples).
124
+ sample_id: Evaluate specific sample(s) from the dataset.
125
+ epochs: Epochs to repeat samples for and optional score
126
+ reducer function(s) used to combine sample scores (defaults to "mean")
127
+ fail_on_error: `True` to fail on first sample error
128
+ (default); `False` to never fail on sample errors; Value between 0 and 1
129
+ to fail if a proportion of total samples fails. Value greater than 1 to fail
130
+ eval if a count of samples fails.
131
+ debug_errors: Raise task errors (rather than logging them)
132
+ so they can be debugged (defaults to False).
133
+ message_limit: Limit on total messages used for each sample.
134
+ token_limit: Limit on total tokens used for each sample.
135
+ time_limit: Limit on time (in seconds) for execution of each sample.
136
+ max_samples: Maximum number of samples to run in parallel
137
+ (default is max_connections)
138
+ max_tasks: Maximum number of tasks to run in parallel
139
+ (default is 1)
140
+ max_subprocesses: Maximum number of subprocesses to
141
+ run in parallel (default is os.cpu_count())
142
+ max_sandboxes: Maximum number of sandboxes (per-provider)
143
+ to run in parallel.
144
+ log_samples: Log detailed samples and scores (defaults to True)
145
+ log_images: Log base64 encoded version of images,
146
+ even if specified as a filename or URL (defaults to False)
147
+ log_buffer: Number of samples to buffer before writing log file.
148
+ If not specified, an appropriate default for the format and filesystem is
149
+ chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
150
+ score: Score output (defaults to True)
151
+ score_display: Show scoring metrics in realtime (defaults to True)
152
+ **kwargs: Model generation options.
153
153
 
154
154
  Returns:
155
155
  List of EvalLog (one for each task)
@@ -200,6 +200,10 @@ def eval(
200
200
  )
201
201
 
202
202
 
203
+ # single call to eval_async at a time
204
+ _eval_async_running = False
205
+
206
+
203
207
  async def eval_async(
204
208
  tasks: Tasks,
205
209
  model: str | Model | list[str] | list[Model] | None = None,
@@ -355,10 +359,14 @@ async def eval_async(
355
359
  "Trace mode cannot be used when evaluating multiple models."
356
360
  )
357
361
 
358
- # resolve recorder
362
+ # resolve recorder (confirm writeable)
359
363
  log_dir = log_dir if log_dir else os.environ.get("INSPECT_LOG_DIR", "./logs")
360
364
  log_dir = absolute_file_path(log_dir)
361
365
  recorder = create_recorder_for_format(log_format or DEFAULT_LOG_FORMAT, log_dir)
366
+ if not recorder.is_writeable():
367
+ raise PrerequisiteError(
368
+ f"ERROR: You do not have write permission for the log_dir '{log_dir}'"
369
+ )
362
370
 
363
371
  # resolve solver
364
372
  solver = chain(solver) if isinstance(solver, list) else solver
@@ -461,10 +469,6 @@ async def eval_async(
461
469
  return logs
462
470
 
463
471
 
464
- # single call to eval_async at a time
465
- _eval_async_running = False
466
-
467
-
468
472
  def eval_retry(
469
473
  tasks: str | EvalLogInfo | EvalLog | list[str] | list[EvalLogInfo] | list[EvalLog],
470
474
  log_level: str | None = None,
@@ -492,47 +496,46 @@ def eval_retry(
492
496
  """Retry a previously failed evaluation task.
493
497
 
494
498
  Args:
495
- tasks: (str | EvalLogInfo | EvalLog | list[str] | list[EvalLogInfo] | list[EvalLog]):
496
- Log files for task(s) to retry.
497
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
498
- "info", "warning", "error", or "critical" (defaults to "warning")
499
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
500
- log_dir (str | None): Output path for logging results
501
- (defaults to file log in ./logs directory).
502
- log_format (Literal["eval", "json"] | None): Format for writing log files (defaults
503
- to "eval", the native high-performance format).
504
- max_samples (int | None): Maximum number of samples to run in parallel
505
- (default is max_connections)
506
- max_tasks (int | None): Maximum number of tasks to run in parallel
507
- (default is 1)
508
- max_subprocesses (int | None): Maximum number of subprocesses to
509
- run in parallel (default is os.cpu_count())
510
- max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
511
- to run in parallel.
512
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
513
- (defaults to True)
514
- trace (bool | None): Trace message interactions with evaluated model to terminal.
515
- display (DisplayType | None): Task display type (defaults to 'full').
516
- fail_on_error (bool | float | None): `True` to fail on first sample error
517
- (default); `False` to never fail on sample errors; Value between 0 and 1
518
- to fail if a proportion of total samples fails. Value greater than 1 to fail
519
- eval if a count of samples fails.
520
- debug_errors (bool | None): Raise task errors (rather than logging them)
521
- so they can be debugged (defaults to False).
522
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
523
- log_images: (bool | None): Log base64 encoded version of images,
524
- even if specified as a filename or URL (defaults to False)
525
- log_buffer: (int | None): Number of samples to buffer before writing log file.
526
- If not specified, an appropriate default for the format and filesystem is
527
- chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
528
- score (bool): Score output (defaults to True)
529
- score_display (bool | None): Show scoring metrics in realtime (defaults to True)
530
- max_retries (int | None):
531
- Maximum number of times to retry request.
532
- timeout: (int | None):
533
- Request timeout (in seconds)
534
- max_connections (int | None):
535
- Maximum number of concurrent connections to Model API (default is per Model API)
499
+ tasks: Log files for task(s) to retry.
500
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
501
+ "info", "warning", "error", or "critical" (defaults to "warning")
502
+ log_level_transcript: Level for logging to the log file (defaults to "info")
503
+ log_dir: Output path for logging results
504
+ (defaults to file log in ./logs directory).
505
+ log_format: Format for writing log files (defaults
506
+ to "eval", the native high-performance format).
507
+ max_samples: Maximum number of samples to run in parallel
508
+ (default is max_connections)
509
+ max_tasks: Maximum number of tasks to run in parallel
510
+ (default is 1)
511
+ max_subprocesses: Maximum number of subprocesses to
512
+ run in parallel (default is os.cpu_count())
513
+ max_sandboxes: Maximum number of sandboxes (per-provider)
514
+ to run in parallel.
515
+ sandbox_cleanup: Cleanup sandbox environments after task completes
516
+ (defaults to True)
517
+ trace: Trace message interactions with evaluated model to terminal.
518
+ display: Task display type (defaults to 'full').
519
+ fail_on_error: `True` to fail on first sample error
520
+ (default); `False` to never fail on sample errors; Value between 0 and 1
521
+ to fail if a proportion of total samples fails. Value greater than 1 to fail
522
+ eval if a count of samples fails.
523
+ debug_errors: Raise task errors (rather than logging them)
524
+ so they can be debugged (defaults to False).
525
+ log_samples: Log detailed samples and scores (defaults to True)
526
+ log_images: Log base64 encoded version of images,
527
+ even if specified as a filename or URL (defaults to False)
528
+ log_buffer: Number of samples to buffer before writing log file.
529
+ If not specified, an appropriate default for the format and filesystem is
530
+ chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
531
+ score: Score output (defaults to True)
532
+ score_display: Show scoring metrics in realtime (defaults to True)
533
+ max_retries:
534
+ Maximum number of times to retry request.
535
+ timeout:
536
+ Request timeout (in seconds)
537
+ max_connections:
538
+ Maximum number of concurrent connections to Model API (default is per Model API)
536
539
 
537
540
  Returns:
538
541
  List of EvalLog (one for each task)
@@ -43,6 +43,12 @@ from .task.task import PreviousTask, Task
43
43
  logger = logging.getLogger(__name__)
44
44
 
45
45
 
46
+ class Log(NamedTuple):
47
+ info: EvalLogInfo
48
+ header: EvalLog
49
+ task_identifier: str
50
+
51
+
46
52
  def eval_set(
47
53
  tasks: Tasks,
48
54
  log_dir: str,
@@ -87,79 +93,79 @@ def eval_set(
87
93
  r"""Evaluate a set of tasks.
88
94
 
89
95
  Args:
90
- tasks: (Tasks): Task(s) to evaluate. If None, attempt
96
+ tasks: Task(s) to evaluate. If None, attempt
91
97
  to evaluate a task in the current working directory
92
- log_dir (str): Output path for logging results
93
- (required to ensure that a unique storage scope is assigned for the set).
94
- retry_attempts: (int | None): Maximum number of retry attempts before giving up
95
- (defaults to 10).
96
- retry_wait (float | None): Time to wait between attempts, increased exponentially.
97
- (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
98
- per-retry will in no case by longer than 1 hour.
99
- retry_connections (float | None): Reduce max_connections at this rate with each retry
100
- (defaults to 0.5)
101
- retry_cleanup (bool | None): Cleanup failed log files after retries
102
- (defaults to True)
103
- model (str | Model | list[str] | list[Model] | None): Model(s) for
104
- evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
105
- environment variable.
106
- model_base_url: (str | None): Base URL for communicating
107
- with the model API.
108
- model_args (dict[str,Any] | str): Model creation args
109
- (as a dictionary or as a path to a JSON or YAML config file)
110
- task_args (dict[str,Any] | str): Task creation arguments
111
- (as a dictionary or as a path to a JSON or YAML config file)
112
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
113
- (or optionally a str or tuple with a shorthand spec)
114
- sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes
115
- (defaults to True)
116
- solver (Solver | list[Solver] | SolverSpec | None): Alternative solver(s) for
117
- evaluating task(s). ptional (uses task solver by default).
118
- tags (list[str] | None): Tags to associate with this evaluation run.
119
- trace: (bool | None): Trace message interactions with evaluated model to terminal.
120
- display (DisplayType | None): Task display type (defaults to 'full').
121
- approval: (str | list[ApprovalPolicy] | None): Tool use approval policies.
122
- Either a path to an approval policy config file or a list of approval policies.
123
- Defaults to no approval policy.
124
- score (bool): Score output (defaults to True)
125
- log_level (str | None): Level for logging to the console: "debug", "http", "sandbox",
126
- "info", "warning", "error", or "critical" (defaults to "warning")
127
- log_level_transcript (str | None): Level for logging to the log file (defaults to "info")
128
- log_format (Literal["eval", "json"] | None): Format for writing
129
- log files (defaults to "eval", the native high-performance format).
130
- limit (int | tuple[int, int] | None): Limit evaluated samples
131
- (defaults to all samples).
132
- sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset.
133
- epochs (int | Epochs | None): Epochs to repeat samples for and optional score
134
- reducer function(s) used to combine sample scores (defaults to "mean")
135
- fail_on_error (bool | float | None): `True` to fail on first sample error
136
- (default); `False` to never fail on sample errors; Value between 0 and 1
137
- to fail if a proportion of total samples fails. Value greater than 1 to fail
138
- eval if a count of samples fails.
139
- debug_errors (bool | None): Raise task errors (rather than logging them)
140
- so they can be debugged (defaults to False).
141
- message_limit (int | None): Limit on total messages used for each sample.
142
- token_limit (int | None): Limit on total tokens used for each sample.
143
- time_limit (int | None): Limit on time (in seconds) for execution of each sample.
144
- max_samples (int | None): Maximum number of samples to run in parallel
145
- (default is max_connections)
146
- max_tasks (int | None): Maximum number of tasks to run in parallel
147
- (default is 1)
148
- max_subprocesses (int | None): Maximum number of subprocesses to
149
- run in parallel (default is os.cpu_count())
150
- max_sandboxes (int | None): Maximum number of sandboxes (per-provider)
151
- to run in parallel.
152
- log_samples: (bool | None): Log detailed samples and scores (defaults to True)
153
- log_images: (bool | None): Log base64 encoded version of images,
98
+ log_dir: Output path for logging results
99
+ (required to ensure that a unique storage scope is assigned for the set).
100
+ retry_attempts: Maximum number of retry attempts before giving up
101
+ (defaults to 10).
102
+ retry_wait: Time to wait between attempts, increased exponentially.
103
+ (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time
104
+ per-retry will in no case be longer than 1 hour.
105
+ retry_connections: Reduce max_connections at this rate with each retry
106
+ (defaults to 0.5)
107
+ retry_cleanup: Cleanup failed log files after retries
108
+ (defaults to True)
109
+ model: Model(s) for
110
+ evaluation. If not specified use the value of the INSPECT_EVAL_MODEL
111
+ environment variable.
112
+ model_base_url: Base URL for communicating
113
+ with the model API.
114
+ model_args: Model creation args
115
+ (as a dictionary or as a path to a JSON or YAML config file)
116
+ task_args: Task creation arguments
117
+ (as a dictionary or as a path to a JSON or YAML config file)
118
+ sandbox: Sandbox environment type
119
+ (or optionally a str or tuple with a shorthand spec)
120
+ sandbox_cleanup: Cleanup sandbox environments after task completes
121
+ (defaults to True)
122
+ solver: Alternative solver(s) for
123
+ evaluating task(s). Optional (uses task solver by default).
124
+ tags: Tags to associate with this evaluation run.
125
+ trace: Trace message interactions with evaluated model to terminal.
126
+ display: Task display type (defaults to 'full').
127
+ approval: Tool use approval policies.
128
+ Either a path to an approval policy config file or a list of approval policies.
129
+ Defaults to no approval policy.
130
+ score: Score output (defaults to True)
131
+ log_level: Level for logging to the console: "debug", "http", "sandbox",
132
+ "info", "warning", "error", or "critical" (defaults to "warning")
133
+ log_level_transcript: Level for logging to the log file (defaults to "info")
134
+ log_format: Format for writing
135
+ log files (defaults to "eval", the native high-performance format).
136
+ limit: Limit evaluated samples
137
+ (defaults to all samples).
138
+ sample_id: Evaluate specific sample(s) from the dataset.
139
+ epochs: Epochs to repeat samples for and optional score
140
+ reducer function(s) used to combine sample scores (defaults to "mean")
141
+ fail_on_error: `True` to fail on first sample error
142
+ (default); `False` to never fail on sample errors; Value between 0 and 1
143
+ to fail if a proportion of total samples fails. Value greater than 1 to fail
144
+ eval if a count of samples fails.
145
+ debug_errors: Raise task errors (rather than logging them)
146
+ so they can be debugged (defaults to False).
147
+ message_limit: Limit on total messages used for each sample.
148
+ token_limit: Limit on total tokens used for each sample.
149
+ time_limit: Limit on time (in seconds) for execution of each sample.
150
+ max_samples: Maximum number of samples to run in parallel
151
+ (default is max_connections)
152
+ max_tasks: Maximum number of tasks to run in parallel
153
+ (default is 1)
154
+ max_subprocesses: Maximum number of subprocesses to
155
+ run in parallel (default is os.cpu_count())
156
+ max_sandboxes: Maximum number of sandboxes (per-provider)
157
+ to run in parallel.
158
+ log_samples: Log detailed samples and scores (defaults to True)
159
+ log_images: Log base64 encoded version of images,
154
160
  even if specified as a filename or URL (defaults to False)
155
- log_buffer: (int | None): Number of samples to buffer before writing log file.
156
- If not specified, an appropriate default for the format and filesystem is
157
- chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
158
- bundle_dir: (str | None): If specified, the log viewer and logs generated
161
+ log_buffer: Number of samples to buffer before writing log file.
162
+ If not specified, an appropriate default for the format and filesystem is
163
+ chosen (10 for most all cases, 100 for JSON logs on remote filesystems).
164
+ bundle_dir: If specified, the log viewer and logs generated
159
165
  by this eval set will be bundled into this directory.
160
- bundle_overwrite (bool): Whether to overwrite files in the bundle_dir.
166
+ bundle_overwrite: Whether to overwrite files in the bundle_dir.
161
167
  (defaults to False).
162
- **kwargs (GenerateConfigArgs): Model generation options.
168
+ **kwargs: Model generation options.
163
169
 
164
170
  Returns:
165
171
  Tuple of bool (whether all tasks completed successfully) and list of EvalLog
@@ -452,12 +458,6 @@ def return_last_value(retry_state: RetryCallState) -> list[EvalLog]:
452
458
  return []
453
459
 
454
460
 
455
- class Log(NamedTuple):
456
- info: EvalLogInfo
457
- header: EvalLog
458
- task_identifier: str
459
-
460
-
461
461
  # list all eval logs
462
462
  def list_all_eval_logs(log_dir: str) -> list[Log]:
463
463
  log_files = list_eval_logs(log_dir)
@@ -8,7 +8,7 @@ from importlib.util import module_from_spec, spec_from_loader
8
8
  from logging import getLogger
9
9
  from pathlib import Path
10
10
  from types import ModuleType
11
- from typing import Any, Callable, cast
11
+ from typing import Any, Callable, Tuple, cast
12
12
 
13
13
  from typing_extensions import overload
14
14
 
@@ -26,6 +26,7 @@ from inspect_ai._util.registry import (
26
26
  registry_params,
27
27
  )
28
28
  from inspect_ai.model import Model, ModelName
29
+ from inspect_ai.scorer._scorer import Scorer, ScorerSpec, scorer_create
29
30
  from inspect_ai.solver._bridge import bridge
30
31
  from inspect_ai.solver._solver import Solver, SolverSpec
31
32
  from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
@@ -421,16 +422,7 @@ def as_solver_spec(solver: Solver) -> SolverSpec:
421
422
 
422
423
  def solver_from_spec(spec: SolverSpec) -> Solver:
423
424
  # resolve @ reference
424
- spec_split = split_spec(spec.solver)
425
- if spec_split[1] is not None:
426
- solver_file: Path | None = Path(spec_split[0]).resolve()
427
- solver_name: str | None = spec_split[1]
428
- elif Path(spec_split[0]).exists():
429
- solver_file = Path(spec_split[0]).resolve()
430
- solver_name = None
431
- else:
432
- solver_file = None
433
- solver_name = spec_split[0]
425
+ solver_file, solver_name = parse_spec_str(spec.solver)
434
426
 
435
427
  # switch contexts if we are loading from a file
436
428
  create_cm = (
@@ -501,7 +493,7 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
501
493
  else:
502
494
  agent_fn = getattr(solver_module, solver_name, None)
503
495
  if inspect.isfunction(agent_fn):
504
- return bridge(agent_fn(**spec.args))
496
+ return bridge.bridge(agent_fn(**spec.args))
505
497
  elif agent_fn is not None:
506
498
  raise PrerequisiteError(
507
499
  f"The object {solver_name} in file {pretty_solver_file} is not a Python function."
@@ -510,3 +502,121 @@ def solver_from_spec(spec: SolverSpec) -> Solver:
510
502
  raise PrerequisiteError(
511
503
  f"The function {solver_name} was not found in file {pretty_solver_file}."
512
504
  )
505
+
506
+
507
+ def scorer_from_spec(spec: ScorerSpec, task_path: Path | None, **kwargs: Any) -> Scorer:
508
+ """
509
+ Load a scorer
510
+
511
+ Args:
512
+ spec: The scorer spec
513
+ task_path: An optional path to the task file
514
+ **kwargs: Additional keyword arguments passed to the scorer initialization
515
+
516
+ Returns:
517
+ Scorer: the loaded scorer
518
+
519
+ Raises:
520
+ PrerequisiteError: If the scorer cannot be found, loaded, or lacks required type annotations
521
+ """
522
+ # resolve @ reference
523
+ scorer_file, scorer_name = parse_spec_str(spec.scorer)
524
+
525
+ # switch contexts if we are loading from a file
526
+ create_cm = (
527
+ chdir_python(scorer_file.parent.as_posix())
528
+ if scorer_file is not None
529
+ else contextlib.nullcontext()
530
+ )
531
+
532
+ # pretty solver name for error messages
533
+ pretty_scorer_file = (
534
+ cwd_relative_path(scorer_file.as_posix()) if scorer_file else None
535
+ )
536
+
537
+ with create_cm:
538
+ # is there a scorer file being provided? if not, load from registry
539
+ if scorer_file is None:
540
+ if scorer_name is None:
541
+ raise ValueError(f"Unable to resolve scorer name from {spec.scorer}")
542
+
543
+ try:
544
+ return scorer_create(scorer_name, **kwargs)
545
+ except ValueError:
546
+ # We need a valid path to a scorer file to try to load the scorer from there
547
+ if not task_path:
548
+ raise PrerequisiteError(
549
+ f"The scorer '{scorer_name}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter"
550
+ )
551
+
552
+ task_pretty_path = task_path.as_posix()
553
+ if not task_path.exists():
554
+ raise PrerequisiteError(
555
+ f"The scorer `{scorer_name}` couldn't be loaded. The file '{task_pretty_path}' was not found. Please provide a path to the file containing the scorer using the '--scorer' parameter"
556
+ )
557
+
558
+ # We have the path to a file, so load that and try again
559
+ try:
560
+ load_module(task_path)
561
+ scorer_fn = scorer_create(scorer_name, **kwargs)
562
+
563
+ # See if the scorer doesn't have type annotations. Currently the registry will not load
564
+ # the function without type annotations.
565
+ # TODO: We could consider calling this ourselves if we're certain it is what we're looking for
566
+ signature = inspect.signature(scorer_fn)
567
+ if signature.return_annotation is inspect.Signature.empty:
568
+ raise PrerequisiteError(
569
+ f"The scorer '{scorer_name}' in the file '{task_pretty_path}' requires return type annotations. Please add type annotations to load the scorer."
570
+ )
571
+ return scorer_fn
572
+ except ValueError:
573
+ # we still couldn't load this, request the user provide a path
574
+ raise PrerequisiteError(
575
+ f"The scorer '{scorer_name}' in the file '{task_pretty_path}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter."
576
+ )
577
+ except ModuleNotFoundError:
578
+ # we still couldn't load this, request the user provide a path
579
+ raise PrerequisiteError(
580
+ f"The scorer '{scorer_name}' in the file '{task_pretty_path}' couldn't be loaded. Please provide a path to the file containing the scorer using the '--scorer' parameter."
581
+ )
582
+
583
+ # solver is a path, so load it that way
584
+ else:
585
+ load_module(scorer_file)
586
+ decorators = parse_decorators(scorer_file, "scorer")
587
+
588
+ # if there is no scorer_name see if we can discover it
589
+ if scorer_name is None:
590
+ if len(decorators) == 1:
591
+ # decorator based scorer
592
+ scorer_name = decorators[0][0]
593
+ elif len(decorators) == 0:
594
+ raise PrerequisiteError(
595
+ f"The source file {pretty_scorer_file} does not contain any @scorer functions."
596
+ )
597
+ else:
598
+ raise PrerequisiteError(
599
+ f"The source file {pretty_scorer_file} has more than one @scorer function (qualify which scorer using e.g. '{scorer_file.name}@scorer_fn')"
600
+ )
601
+
602
+ # create decorator based scorers using the registry
603
+ if any(solver[0] == scorer_name for solver in decorators):
604
+ return scorer_create(scorer_name, **kwargs)
605
+ else:
606
+ raise PrerequisiteError(
607
+ f"The function {scorer_name} was not found in file {pretty_scorer_file}."
608
+ )
609
+
610
+
611
+ def parse_spec_str(spec_str: str) -> Tuple[Path | None, str | None]:
612
+ spec_split = split_spec(spec_str)
613
+ if spec_split[1] is not None:
614
+ file: Path | None = Path(spec_split[0]).resolve()
615
+ name: str | None = spec_split[1]
616
+ elif Path(spec_split[0]).exists():
617
+ file = Path(spec_split[0]).resolve()
618
+ name = None
619
+ else:
620
+ file = None
621
+ name = spec_split[0]
622
+ return file, name
@@ -148,7 +148,7 @@ def task(*args: Any, name: str | None = None, **attribs: Any) -> Any:
148
148
  # module import, so set its task file and run dir
149
149
  if get_installed_package_name(task_type) is None:
150
150
  module = inspect.getmodule(task_type)
151
- if module and hasattr(module, "__file__"):
151
+ if module and hasattr(module, "__file__") and module.__file__:
152
152
  file = Path(getattr(module, "__file__"))
153
153
  setattr(task_instance, TASK_FILE_ATTR, file.as_posix())
154
154
  setattr(task_instance, TASK_RUN_DIR_ATTR, file.parent.as_posix())
inspect_ai/_eval/run.py CHANGED
@@ -20,8 +20,10 @@ from inspect_ai.log import EvalConfig, EvalLog
20
20
  from inspect_ai.log._recorders import Recorder
21
21
  from inspect_ai.model import GenerateConfigArgs
22
22
  from inspect_ai.model._model import ModelName
23
+ from inspect_ai.scorer._metric import to_metric_specs
23
24
  from inspect_ai.scorer._reducer import ScoreReducer, reducer_log_names
24
25
  from inspect_ai.scorer._reducer.registry import validate_reducer
26
+ from inspect_ai.scorer._scorer import as_scorer_spec
25
27
  from inspect_ai.solver._solver import Solver, SolverSpec
26
28
  from inspect_ai.util._sandbox.environment import (
27
29
  SandboxEnvironmentConfigType,
@@ -100,6 +102,16 @@ async def eval_run(
100
102
  eval_solver = None
101
103
  eval_solver_spec = None
102
104
 
105
+ # resolve the task scorers
106
+ eval_scorer_specs = (
107
+ [as_scorer_spec(scorer) for scorer in task.scorer]
108
+ if task.scorer is not None
109
+ else None
110
+ )
111
+
112
+ # resolve task metrics
113
+ eval_metrics = to_metric_specs(task.metrics) if task.metrics is not None else None
114
+
103
115
  try:
104
116
  # create run tasks
105
117
  task_run_options: list[TaskRunOptions] = []
@@ -168,6 +180,8 @@ async def eval_run(
168
180
  tags=tags,
169
181
  model=resolved_task.model,
170
182
  dataset=task.dataset,
183
+ scorer=eval_scorer_specs,
184
+ metrics=eval_metrics,
171
185
  sandbox=resolved_task.sandbox,
172
186
  task_attribs=task.attribs,
173
187
  task_args=resolved_task.task_args,