inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (518)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/main.py +1 -1
  8. inspect_ai/_cli/sandbox.py +4 -1
  9. inspect_ai/_cli/score.py +181 -32
  10. inspect_ai/_cli/trace.py +10 -0
  11. inspect_ai/_cli/view.py +4 -2
  12. inspect_ai/_display/core/active.py +2 -3
  13. inspect_ai/_display/core/config.py +7 -1
  14. inspect_ai/_display/textual/widgets/samples.py +4 -3
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/eval.py +104 -101
  17. inspect_ai/_eval/evalset.py +75 -75
  18. inspect_ai/_eval/loader.py +122 -12
  19. inspect_ai/_eval/registry.py +1 -1
  20. inspect_ai/_eval/run.py +14 -0
  21. inspect_ai/_eval/score.py +125 -36
  22. inspect_ai/_eval/task/log.py +105 -4
  23. inspect_ai/_eval/task/results.py +92 -38
  24. inspect_ai/_eval/task/run.py +9 -2
  25. inspect_ai/_eval/task/sandbox.py +35 -2
  26. inspect_ai/_eval/task/task.py +49 -46
  27. inspect_ai/_util/constants.py +1 -1
  28. inspect_ai/_util/content.py +8 -0
  29. inspect_ai/_util/error.py +2 -0
  30. inspect_ai/_util/file.py +15 -1
  31. inspect_ai/_util/hash.py +1 -1
  32. inspect_ai/_util/logger.py +4 -2
  33. inspect_ai/_util/registry.py +7 -1
  34. inspect_ai/_view/view.py +1 -2
  35. inspect_ai/_view/www/.vscode/extensions.json +3 -0
  36. inspect_ai/_view/www/.vscode/settings.json +8 -0
  37. inspect_ai/_view/www/App.css +97 -29
  38. inspect_ai/_view/www/README.md +1 -1
  39. inspect_ai/_view/www/dist/assets/index.css +16663 -14674
  40. inspect_ai/_view/www/dist/assets/index.js +58808 -51348
  41. inspect_ai/_view/www/dist/index.html +1 -1
  42. inspect_ai/_view/www/index.html +2 -2
  43. inspect_ai/_view/www/log-schema.json +87 -73
  44. inspect_ai/_view/www/package.json +22 -4
  45. inspect_ai/_view/www/postcss.config.cjs +8 -9
  46. inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
  47. inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
  48. inspect_ai/_view/www/src/api/api-browser.ts +2 -2
  49. inspect_ai/_view/www/src/api/api-http.ts +3 -5
  50. inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
  51. inspect_ai/_view/www/src/api/client-api.ts +4 -4
  52. inspect_ai/_view/www/src/api/index.ts +4 -4
  53. inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
  54. inspect_ai/_view/www/src/appearance/colors.ts +9 -0
  55. inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
  56. inspect_ai/_view/www/src/appearance/icons.ts +100 -0
  57. inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
  58. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
  59. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
  60. inspect_ai/_view/www/src/components/Card.css +60 -0
  61. inspect_ai/_view/www/src/components/Card.tsx +109 -0
  62. inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
  63. inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
  64. inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
  65. inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
  66. inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
  67. inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
  68. inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
  69. inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
  70. inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
  71. inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
  72. inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
  73. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
  74. inspect_ai/_view/www/src/components/FindBand.css +49 -0
  75. inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
  76. inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
  77. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
  78. inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
  79. inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
  80. inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
  81. inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
  82. inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
  83. inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
  84. inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
  85. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
  86. inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
  87. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
  88. inspect_ai/_view/www/src/components/MessageBand.css +43 -0
  89. inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
  90. inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
  91. inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
  92. inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
  93. inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
  94. inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
  95. inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
  96. inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
  97. inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
  98. inspect_ai/_view/www/src/components/ToolButton.css +3 -0
  99. inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
  100. inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
  101. inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
  102. inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
  103. inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
  104. inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
  105. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
  106. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
  107. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
  108. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
  109. inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
  110. inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
  111. inspect_ai/_view/www/src/metadata/types.ts +18 -0
  112. inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
  113. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
  114. inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
  115. inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
  116. inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
  117. inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
  118. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
  119. inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
  120. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
  121. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
  122. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
  123. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
  124. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
  125. inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
  126. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
  127. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
  128. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
  129. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
  130. inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
  131. inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
  132. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
  133. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
  134. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
  135. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
  136. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
  137. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
  138. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
  139. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
  140. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
  141. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
  142. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
  143. inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
  144. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
  145. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
  146. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
  147. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
  148. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
  149. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
  150. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
  151. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
  152. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
  153. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
  154. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
  155. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
  156. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
  157. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
  158. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
  159. inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
  160. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
  161. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
  162. inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
  163. inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
  164. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
  165. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
  166. inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
  167. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
  168. inspect_ai/_view/www/src/samples/error/error.ts +15 -0
  169. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
  170. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
  171. inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
  172. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
  173. inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
  174. inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
  175. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
  176. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
  177. inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
  178. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
  179. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
  180. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
  181. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
  182. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
  183. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
  184. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
  185. inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
  186. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
  187. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
  188. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
  189. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
  190. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
  191. inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
  192. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
  193. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
  194. inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
  195. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
  196. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
  197. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
  198. inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
  199. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
  200. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
  201. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
  202. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
  203. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
  204. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
  205. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
  206. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
  207. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
  208. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
  209. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
  210. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
  211. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
  212. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
  213. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
  214. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
  215. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
  216. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
  217. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
  218. inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
  219. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
  220. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
  221. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
  222. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
  223. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
  224. inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
  225. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
  226. inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
  227. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
  228. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
  229. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
  230. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
  231. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
  232. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
  233. inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
  234. inspect_ai/_view/www/src/types/log.d.ts +108 -19
  235. inspect_ai/_view/www/src/types/prism.d.ts +11 -0
  236. inspect_ai/_view/www/src/types.ts +71 -0
  237. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
  238. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
  239. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
  240. inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
  241. inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
  242. inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
  243. inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
  244. inspect_ai/_view/www/src/utils/attachments.ts +42 -0
  245. inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
  246. inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
  247. inspect_ai/_view/www/src/utils/debugging.ts +28 -0
  248. inspect_ai/_view/www/src/utils/dom.ts +30 -0
  249. inspect_ai/_view/www/src/utils/format.ts +194 -0
  250. inspect_ai/_view/www/src/utils/git.ts +7 -0
  251. inspect_ai/_view/www/src/utils/html.ts +6 -0
  252. inspect_ai/_view/www/src/utils/http.ts +14 -0
  253. inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
  254. inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
  255. inspect_ai/_view/www/src/utils/queue.ts +51 -0
  256. inspect_ai/_view/www/src/utils/sync.ts +114 -0
  257. inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
  258. inspect_ai/_view/www/src/utils/vscode.ts +13 -0
  259. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
  260. inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
  261. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
  262. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
  263. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
  264. inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
  265. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
  266. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
  267. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
  268. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
  269. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
  270. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
  271. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
  272. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
  273. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
  274. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
  275. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
  276. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
  277. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
  278. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
  279. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
  280. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
  281. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
  282. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
  283. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
  284. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
  285. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
  286. inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
  287. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
  288. inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
  289. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
  290. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
  291. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
  292. inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
  293. inspect_ai/_view/www/src/workspace/types.ts +10 -0
  294. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  295. inspect_ai/_view/www/tsconfig.json +23 -9
  296. inspect_ai/_view/www/vite.config.js +8 -17
  297. inspect_ai/_view/www/yarn.lock +627 -556
  298. inspect_ai/approval/_approval.py +2 -0
  299. inspect_ai/approval/_approver.py +4 -4
  300. inspect_ai/approval/_auto.py +1 -1
  301. inspect_ai/approval/_human/approver.py +3 -0
  302. inspect_ai/approval/_policy.py +5 -0
  303. inspect_ai/approval/_registry.py +2 -2
  304. inspect_ai/dataset/_dataset.py +64 -37
  305. inspect_ai/dataset/_sources/__init__.py +0 -0
  306. inspect_ai/dataset/_sources/csv.py +20 -12
  307. inspect_ai/dataset/_sources/file.py +4 -0
  308. inspect_ai/dataset/_sources/hf.py +39 -29
  309. inspect_ai/dataset/_sources/json.py +17 -9
  310. inspect_ai/log/__init__.py +2 -0
  311. inspect_ai/log/_convert.py +3 -3
  312. inspect_ai/log/_file.py +24 -9
  313. inspect_ai/log/_log.py +101 -13
  314. inspect_ai/log/_message.py +4 -2
  315. inspect_ai/log/_recorders/file.py +4 -0
  316. inspect_ai/log/_recorders/json.py +5 -7
  317. inspect_ai/log/_recorders/recorder.py +3 -0
  318. inspect_ai/log/_transcript.py +19 -8
  319. inspect_ai/model/__init__.py +2 -0
  320. inspect_ai/model/_cache.py +39 -21
  321. inspect_ai/model/_call_tools.py +4 -3
  322. inspect_ai/model/_chat_message.py +14 -4
  323. inspect_ai/model/_generate_config.py +1 -1
  324. inspect_ai/model/_model.py +31 -24
  325. inspect_ai/model/_model_output.py +14 -1
  326. inspect_ai/model/_openai.py +10 -18
  327. inspect_ai/model/_providers/anthropic.py +3 -3
  328. inspect_ai/model/_providers/google.py +9 -5
  329. inspect_ai/model/_providers/openai.py +5 -9
  330. inspect_ai/model/_providers/openai_o1.py +3 -5
  331. inspect_ai/model/_providers/openrouter.py +86 -0
  332. inspect_ai/model/_providers/providers.py +11 -0
  333. inspect_ai/scorer/__init__.py +6 -1
  334. inspect_ai/scorer/_answer.py +7 -7
  335. inspect_ai/scorer/_classification.py +38 -18
  336. inspect_ai/scorer/_common.py +2 -8
  337. inspect_ai/scorer/_match.py +4 -5
  338. inspect_ai/scorer/_metric.py +87 -28
  339. inspect_ai/scorer/_metrics/__init__.py +3 -3
  340. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  341. inspect_ai/scorer/_metrics/mean.py +3 -17
  342. inspect_ai/scorer/_metrics/std.py +111 -30
  343. inspect_ai/scorer/_model.py +12 -12
  344. inspect_ai/scorer/_pattern.py +3 -3
  345. inspect_ai/scorer/_reducer/reducer.py +36 -21
  346. inspect_ai/scorer/_reducer/registry.py +2 -2
  347. inspect_ai/scorer/_reducer/types.py +7 -1
  348. inspect_ai/scorer/_score.py +11 -1
  349. inspect_ai/scorer/_scorer.py +110 -16
  350. inspect_ai/solver/__init__.py +1 -1
  351. inspect_ai/solver/_basic_agent.py +19 -22
  352. inspect_ai/solver/_bridge/__init__.py +0 -3
  353. inspect_ai/solver/_bridge/bridge.py +3 -3
  354. inspect_ai/solver/_chain.py +1 -2
  355. inspect_ai/solver/_critique.py +3 -3
  356. inspect_ai/solver/_fork.py +2 -2
  357. inspect_ai/solver/_human_agent/__init__.py +0 -0
  358. inspect_ai/solver/_human_agent/agent.py +5 -8
  359. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  360. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  361. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  362. inspect_ai/solver/_multiple_choice.py +38 -26
  363. inspect_ai/solver/_prompt.py +7 -7
  364. inspect_ai/solver/_solver.py +53 -52
  365. inspect_ai/solver/_task_state.py +80 -69
  366. inspect_ai/solver/_use_tools.py +9 -9
  367. inspect_ai/tool/__init__.py +4 -1
  368. inspect_ai/tool/_tool.py +43 -14
  369. inspect_ai/tool/_tool_call.py +6 -2
  370. inspect_ai/tool/_tool_choice.py +3 -1
  371. inspect_ai/tool/_tool_def.py +10 -8
  372. inspect_ai/tool/_tool_params.py +24 -0
  373. inspect_ai/tool/_tool_with.py +7 -7
  374. inspect_ai/tool/_tools/__init__.py +0 -0
  375. inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
  376. inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
  377. inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
  378. inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
  379. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  380. inspect_ai/tool/_tools/_execute.py +23 -11
  381. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  382. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  383. inspect_ai/tool/_tools/_web_search.py +7 -5
  384. inspect_ai/tool/beta.py +3 -0
  385. inspect_ai/util/_concurrency.py +3 -3
  386. inspect_ai/util/_panel.py +2 -0
  387. inspect_ai/util/_resource.py +12 -12
  388. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  389. inspect_ai/util/_sandbox/docker/config.py +2 -1
  390. inspect_ai/util/_sandbox/docker/docker.py +42 -86
  391. inspect_ai/util/_sandbox/docker/service.py +100 -0
  392. inspect_ai/util/_sandbox/environment.py +99 -96
  393. inspect_ai/util/_sandbox/self_check.py +124 -16
  394. inspect_ai/util/_subprocess.py +5 -3
  395. inspect_ai/util/_subtask.py +15 -16
  396. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
  397. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
  398. inspect_ai-0.3.64.dist-info/RECORD +625 -0
  399. inspect_ai/_view/www/src/Register.mjs +0 -3
  400. inspect_ai/_view/www/src/Types.mjs +0 -38
  401. inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
  402. inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
  403. inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
  404. inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
  405. inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
  406. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
  407. inspect_ai/_view/www/src/components/Card.mjs +0 -126
  408. inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
  409. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
  410. inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
  411. inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
  412. inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
  413. inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
  414. inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
  415. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
  416. inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
  417. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
  418. inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
  419. inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
  420. inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
  421. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
  422. inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
  423. inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
  424. inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
  425. inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
  426. inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
  427. inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
  428. inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
  429. inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
  430. inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
  431. inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
  432. inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
  433. inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
  434. inspect_ai/_view/www/src/components/Tools.mjs +0 -376
  435. inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
  436. inspect_ai/_view/www/src/components/ansi-output.js +0 -932
  437. inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
  438. inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
  439. inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
  440. inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
  441. inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
  442. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
  443. inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
  444. inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
  445. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
  446. inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
  447. inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
  448. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
  449. inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
  450. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
  451. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
  452. inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
  453. inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
  454. inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
  455. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
  456. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
  457. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
  458. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
  459. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
  460. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
  461. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
  462. inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
  463. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
  464. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
  465. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
  466. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
  467. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
  468. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
  469. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
  470. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
  471. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
  472. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
  473. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
  474. inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
  475. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
  476. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
  477. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
  478. inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
  479. inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
  480. inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
  481. inspect_ai/_view/www/src/utils/Format.mjs +0 -260
  482. inspect_ai/_view/www/src/utils/Git.mjs +0 -12
  483. inspect_ai/_view/www/src/utils/Html.mjs +0 -21
  484. inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
  485. inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
  486. inspect_ai/_view/www/src/utils/http.mjs +0 -18
  487. inspect_ai/_view/www/src/utils/queue.mjs +0 -67
  488. inspect_ai/_view/www/src/utils/sync.mjs +0 -101
  489. inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
  490. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
  491. inspect_ai/tool/beta/__init__.py +0 -5
  492. inspect_ai-0.3.62.dist-info/RECORD +0 -481
  493. /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
  494. /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
  495. /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
  496. /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
  497. /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
  498. /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
  499. /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
  500. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
  501. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
  502. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
  503. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
  504. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
  505. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  506. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
  507. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
  508. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
  509. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
  510. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
  511. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
  512. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
  513. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
  514. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
  515. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
  516. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
  517. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
  518. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
inspect_ai/log/_log.py CHANGED
@@ -4,11 +4,17 @@ import sys
4
4
  import traceback
5
5
  from logging import getLogger
6
6
  from types import TracebackType
7
- from typing import Any, Literal, Type
7
+ from typing import Any, Literal, Type, TypedDict
8
8
 
9
9
  import click
10
10
  import tenacity
11
- from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator
11
+ from pydantic import (
12
+ BaseModel,
13
+ ConfigDict,
14
+ Field,
15
+ PrivateAttr,
16
+ model_validator,
17
+ )
12
18
  from rich.console import Console, RenderableType
13
19
  from rich.traceback import Traceback
14
20
 
@@ -17,12 +23,7 @@ from inspect_ai._util.error import EvalError, exception_message
17
23
  from inspect_ai._util.logger import warn_once
18
24
  from inspect_ai.approval._policy import ApprovalPolicyConfig
19
25
  from inspect_ai.dataset._dataset import MT, metadata_as
20
- from inspect_ai.model import (
21
- ChatMessage,
22
- GenerateConfig,
23
- ModelOutput,
24
- ModelUsage,
25
- )
26
+ from inspect_ai.model import ChatMessage, GenerateConfig, ModelOutput, ModelUsage
26
27
  from inspect_ai.scorer import Score
27
28
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
28
29
  from inspect_ai.util._store import Store
@@ -35,7 +36,31 @@ logger = getLogger(__name__)
35
36
  SCORER_PLACEHOLDER = "88F74D2C"
36
37
 
37
38
 
39
+ class EvalConfigDefaults(TypedDict):
40
+ epochs: int
41
+ epochs_reducer: list[str]
42
+ fail_on_error: bool
43
+ sandbox_cleanup: bool
44
+ log_samples: bool
45
+ log_images: bool
46
+ score_display: bool
47
+
48
+
49
+ def eval_config_defaults() -> EvalConfigDefaults:
50
+ return {
51
+ "epochs": 1,
52
+ "epochs_reducer": ["mean"],
53
+ "fail_on_error": True,
54
+ "sandbox_cleanup": True,
55
+ "log_samples": True,
56
+ "log_images": True,
57
+ "score_display": True,
58
+ }
59
+
60
+
38
61
  class EvalConfig(BaseModel):
62
+ """Configuration used for evaluation."""
63
+
39
64
  limit: int | tuple[int, int] | None = Field(default=None)
40
65
  """Sample limit (number of samples or range of samples)."""
41
66
 
@@ -114,6 +139,8 @@ class EvalConfig(BaseModel):
114
139
 
115
140
 
116
141
  class EvalSampleLimit(BaseModel):
142
+ """Limit encountered by sample."""
143
+
117
144
  type: Literal["context", "time", "message", "token", "operator", "custom"]
118
145
  """The type of limit"""
119
146
 
@@ -122,6 +149,8 @@ class EvalSampleLimit(BaseModel):
122
149
 
123
150
 
124
151
  class EvalSample(BaseModel):
152
+ """Sample from evaluation task."""
153
+
125
154
  id: int | str
126
155
  """Unique id for sample."""
127
156
 
@@ -196,7 +225,7 @@ class EvalSample(BaseModel):
196
225
  """Attachments referenced from messages and events.
197
226
 
198
227
  Resolve attachments for a sample (replacing attachment://* references with
199
- attachment content) with the resolve_sample_attachments() function.
228
+ attachment content) by passing `resolve_attachments=True` to log reading functions.
200
229
  """
201
230
 
202
231
  limit: EvalSampleLimit | None = Field(default=None)
@@ -267,6 +296,8 @@ class EvalEvents(BaseModel):
267
296
 
268
297
 
269
298
  class EvalPlanStep(BaseModel):
299
+ """Solver step."""
300
+
270
301
  solver: str
271
302
  """Name of solver."""
272
303
 
@@ -275,6 +306,8 @@ class EvalPlanStep(BaseModel):
275
306
 
276
307
 
277
308
  class EvalPlan(BaseModel):
309
+ """Plan (solvers) used in evaluation."""
310
+
278
311
  name: str = Field(default="plan")
279
312
  """Plan name."""
280
313
 
@@ -289,20 +322,24 @@ class EvalPlan(BaseModel):
289
322
 
290
323
 
291
324
  class EvalMetric(BaseModel):
325
+ """Metric for evaluation score."""
326
+
292
327
  name: str
293
328
  """Metric name."""
294
329
 
295
330
  value: int | float
296
331
  """Metric value."""
297
332
 
298
- options: dict[str, Any] = Field(default_factory=dict)
299
- """Options specified when creating metric."""
333
+ params: dict[str, Any] = Field(default_factory=dict)
334
+ """Params specified when creating metric."""
300
335
 
301
336
  metadata: dict[str, Any] | None = Field(default=None)
302
337
  """Additional metadata associated with metric."""
303
338
 
304
339
 
305
340
  class EvalScore(BaseModel):
341
+ """Score for evaluation task."""
342
+
306
343
  name: str
307
344
  """Score name."""
308
345
 
@@ -323,10 +360,15 @@ class EvalScore(BaseModel):
323
360
 
324
361
 
325
362
  class EvalSampleScore(Score):
363
+ """Score and sample_id scored."""
364
+
326
365
  sample_id: str | int | None = Field(default=None)
366
+ """Sample ID."""
327
367
 
328
368
 
329
369
  class EvalSampleReductions(BaseModel):
370
+ """Score reductions."""
371
+
330
372
  scorer: str
331
373
  """Name of the scorer"""
332
374
 
@@ -338,6 +380,8 @@ class EvalSampleReductions(BaseModel):
338
380
 
339
381
 
340
382
  class EvalResults(BaseModel):
383
+ """Scoring results from evaluation."""
384
+
341
385
  total_samples: int = Field(default=0)
342
386
  """Total samples in eval (dataset samples * epochs)"""
343
387
 
@@ -404,6 +448,8 @@ class EvalResults(BaseModel):
404
448
  if "metrics" in values:
405
449
  metrics = values["metrics"]
406
450
  del values["metrics"]
451
+ else:
452
+ metrics = None
407
453
  # Convert the scorer to the new schema
408
454
  score = values["scorer"]
409
455
  if metrics:
@@ -418,6 +464,8 @@ class EvalResults(BaseModel):
418
464
 
419
465
 
420
466
  class EvalDataset(BaseModel):
467
+ """Dataset used for evaluation."""
468
+
421
469
  name: str | None = Field(default=None)
422
470
  """Dataset name."""
423
471
 
@@ -434,7 +482,33 @@ class EvalDataset(BaseModel):
434
482
  """Was the dataset shuffled after reading."""
435
483
 
436
484
 
485
+ class EvalMetricDefinition(BaseModel):
486
+ name: str
487
+ """Metric name"""
488
+
489
+ options: dict[str, Any] | None = Field(default=None)
490
+
491
+
492
+ class EvalScorer(BaseModel):
493
+ name: str
494
+ """Scorer name"""
495
+
496
+ options: dict[str, Any] | None = Field(default=None)
497
+ """Scorer arguments"""
498
+
499
+ metrics: (
500
+ list[EvalMetricDefinition | dict[str, list[EvalMetricDefinition]]]
501
+ | dict[str, list[EvalMetricDefinition]]
502
+ | None
503
+ ) = Field(default=None)
504
+
505
+ metadata: dict[str, Any] | None = Field(default=None)
506
+ """Scorer metadata"""
507
+
508
+
437
509
  class EvalRevision(BaseModel):
510
+ """Git revision for evaluation."""
511
+
438
512
  type: Literal["git"]
439
513
  """Type of revision (currently only "git")"""
440
514
 
@@ -446,6 +520,8 @@ class EvalRevision(BaseModel):
446
520
 
447
521
 
448
522
  class EvalSpec(BaseModel):
523
+ """Eval target and configuration."""
524
+
449
525
  run_id: str = Field(default_factory=str)
450
526
  """Unique run id"""
451
527
 
@@ -506,6 +582,14 @@ class EvalSpec(BaseModel):
506
582
  metadata: dict[str, Any] | None = Field(default=None)
507
583
  """Additional eval metadata."""
508
584
 
585
+ scorers: list[EvalScorer] | None = Field(default=None)
586
+ """Scorers and args for this eval"""
587
+
588
+ metrics: (
589
+ list[EvalMetricDefinition] | dict[str, list[EvalMetricDefinition]] | None
590
+ ) = Field(default=None)
591
+ """metrics and args for this eval"""
592
+
509
593
  # allow field model_args
510
594
  model_config = ConfigDict(protected_namespaces=())
511
595
 
@@ -549,6 +633,8 @@ def rich_traceback(
549
633
 
550
634
 
551
635
  class EvalStats(BaseModel):
636
+ """Timing and usage statistics."""
637
+
552
638
  started_at: str = Field(default_factory=str)
553
639
  """Evaluation start time."""
554
640
 
@@ -563,6 +649,8 @@ class EvalStats(BaseModel):
563
649
 
564
650
 
565
651
  class EvalLog(BaseModel):
652
+ """Evaluation log."""
653
+
566
654
  # WARNING: The order of these fields is important for the log file format.
567
655
  # Do not change the order of these fields without incrementing the version number,
568
656
  # updating the log file read/write functionality (such as read_eval_log),
@@ -578,13 +666,13 @@ class EvalLog(BaseModel):
578
666
  eval: EvalSpec
579
667
  """Eval identity and configuration."""
580
668
 
581
- plan: EvalPlan = Field(default=EvalPlan())
669
+ plan: EvalPlan = Field(default_factory=EvalPlan)
582
670
  """Eval plan (solvers and config)"""
583
671
 
584
672
  results: EvalResults | None = None
585
673
  """Eval results (scores and metrics)."""
586
674
 
587
- stats: EvalStats = Field(default=EvalStats())
675
+ stats: EvalStats = Field(default_factory=EvalStats)
588
676
  """Eval stats (runtime, model usage)"""
589
677
 
590
678
  error: EvalError | None = Field(default=None)
@@ -5,12 +5,14 @@ from typing import Any, Literal, Type, cast
5
5
  from pydantic import BaseModel, Field, model_validator
6
6
 
7
7
  LoggingLevel = Literal[
8
- "debug", "http", "sandbox", "info", "warning", "error", "critical"
8
+ "debug", "trace", "http", "sandbox", "info", "warning", "error", "critical"
9
9
  ]
10
10
  """Logging level."""
11
11
 
12
12
 
13
13
  class LoggingMessage(BaseModel):
14
+ """Message written to Python log."""
15
+
14
16
  name: str | None = Field(default=None)
15
17
  """Logger name (e.g. 'httpx')"""
16
18
 
@@ -33,7 +35,7 @@ class LoggingMessage(BaseModel):
33
35
  """Logged from line number."""
34
36
 
35
37
  @staticmethod
36
- def from_log_record(record: LogRecord) -> "LoggingMessage":
38
+ def _from_log_record(record: LogRecord) -> "LoggingMessage":
37
39
  """Create a LoggingMesssage from a LogRecord.
38
40
 
39
41
  Args:
@@ -28,6 +28,10 @@ class FileRecorder(Recorder):
28
28
  def is_local(self) -> bool:
29
29
  return self.fs.is_local()
30
30
 
31
+ @override
32
+ def is_writeable(self) -> bool:
33
+ return self.fs.is_writeable(self.log_dir)
34
+
31
35
  @override
32
36
  @classmethod
33
37
  async def read_log_sample(
@@ -9,12 +9,7 @@ from typing_extensions import override
9
9
 
10
10
  from inspect_ai._util.constants import LOG_SCHEMA_VERSION
11
11
  from inspect_ai._util.error import EvalError
12
- from inspect_ai._util.file import (
13
- absolute_file_path,
14
- async_fileystem,
15
- file,
16
- filesystem,
17
- )
12
+ from inspect_ai._util.file import absolute_file_path, async_fileystem, file, filesystem
18
13
  from inspect_ai._util.trace import trace_action
19
14
 
20
15
  from .._log import (
@@ -236,12 +231,13 @@ def _read_header_streaming(log_file: str) -> EvalLog:
236
231
  f.seek(0)
237
232
 
238
233
  # Parse the log file, stopping before parsing samples
234
+ status: Literal["started", "success", "cancelled", "error"] | None = None
239
235
  for k, v in ijson.kvitems(f, ""):
240
236
  if k == "status":
241
237
  assert v in get_args(
242
238
  Literal["started", "success", "cancelled", "error"]
243
239
  )
244
- status: Literal["started", "success", "cancelled", "error"] = v
240
+ status = v
245
241
  if k == "eval":
246
242
  eval = EvalSpec(**v)
247
243
  elif k == "plan":
@@ -257,6 +253,8 @@ def _read_header_streaming(log_file: str) -> EvalLog:
257
253
  error = EvalError(**v)
258
254
  break
259
255
 
256
+ assert status, "Must encounter a 'status'"
257
+
260
258
  return EvalLog(
261
259
  eval=eval,
262
260
  plan=plan,
@@ -21,6 +21,9 @@ class Recorder(abc.ABC):
21
21
  @abc.abstractmethod
22
22
  def default_log_buffer(self) -> int: ...
23
23
 
24
+ @abc.abstractmethod
25
+ def is_writeable(self) -> bool: ...
26
+
24
27
  @abc.abstractmethod
25
28
  async def log_init(self, eval: EvalSpec, location: str | None = None) -> str: ...
26
29
 
@@ -167,7 +167,7 @@ class ToolEvent(BaseEvent):
167
167
  events: list["Event"] = Field(default_factory=list)
168
168
  """Transcript of events for tool."""
169
169
 
170
- def set_result(
170
+ def _set_result(
171
171
  self,
172
172
  result: ToolResult,
173
173
  truncated: tuple[int, int] | None,
@@ -182,11 +182,11 @@ class ToolEvent(BaseEvent):
182
182
 
183
183
  # mechanism for operator to cancel the tool call
184
184
 
185
- def set_task(self, task: asyncio.Task[Any]) -> None:
185
+ def _set_task(self, task: asyncio.Task[Any]) -> None:
186
186
  """Set the tool task (for possible cancellation)"""
187
187
  self._task = task
188
188
 
189
- def cancel(self) -> None:
189
+ def _cancel(self) -> None:
190
190
  """Cancel the tool task."""
191
191
  if self._task:
192
192
  self._cancelled = True
@@ -264,6 +264,9 @@ class InfoEvent(BaseEvent):
264
264
  event: Literal["info"] = Field(default="info")
265
265
  """Event type."""
266
266
 
267
+ source: str | None = Field(default=None)
268
+ """Optional source for info event."""
269
+
267
270
  data: JsonValue
268
271
  """Data provided with event."""
269
272
 
@@ -279,17 +282,24 @@ class ErrorEvent(BaseEvent):
279
282
 
280
283
 
281
284
  class ScoreEvent(BaseEvent):
282
- """Event with sample score."""
285
+ """Event with score.
286
+
287
+ Can be the final score for a `Sample`, or can be an intermediate score
288
+ resulting from a call to `score`.
289
+ """
283
290
 
284
291
  event: Literal["score"] = Field(default="score")
285
292
  """Event type."""
286
293
 
287
294
  score: Score
288
- """Sample score."""
295
+ """Score value."""
289
296
 
290
297
  target: str | list[str] | None = Field(default=None)
291
298
  """"Sample target."""
292
299
 
300
+ intermediate: bool = Field(default=False)
301
+ """Was this an intermediate scoring?"""
302
+
293
303
 
294
304
  class StepEvent(BaseEvent):
295
305
  """Step within current sample or subtask."""
@@ -355,13 +365,14 @@ class Transcript:
355
365
  self.name = name
356
366
  self._events: list[Event] = []
357
367
 
358
- def info(self, data: JsonValue) -> None:
368
+ def info(self, data: JsonValue, *, source: str | None = None) -> None:
359
369
  """Add an `InfoEvent` to the transcript.
360
370
 
361
371
  Args:
362
- data (JsonValue): Data associated with the event.
372
+ data: Data associated with the event.
373
+ source: Optional event source.
363
374
  """
364
- self._event(InfoEvent(data=data))
375
+ self._event(InfoEvent(source=source, data=data))
365
376
 
366
377
  @contextlib.contextmanager
367
378
  def step(self, name: str, type: str | None = None) -> Iterator[None]:
@@ -21,6 +21,7 @@ from ._call_tools import call_tools
21
21
  from ._chat_message import (
22
22
  ChatMessage,
23
23
  ChatMessageAssistant,
24
+ ChatMessageBase,
24
25
  ChatMessageSystem,
25
26
  ChatMessageTool,
26
27
  ChatMessageUser,
@@ -54,6 +55,7 @@ __all__ = [
54
55
  "ContentVideo",
55
56
  "Content",
56
57
  "ChatMessage",
58
+ "ChatMessageBase",
57
59
  "ChatMessageSystem",
58
60
  "ChatMessageUser",
59
61
  "ChatMessageAssistant",
@@ -58,22 +58,23 @@ def _parse_expiry(period: str) -> int:
58
58
  class CachePolicy:
59
59
  """The `CachePolicy` is used to define various criteria that impact how model calls are cached.
60
60
 
61
- Attributes:
62
- expiry(str | None): Default "24h". The expiry time for the cache entry.
63
- This is a string of the format "12h" for 12 hours or "1W" for a week,
64
- etc. This is how long we will keep the cache entry, if we access it
65
- after this point we'll clear it. Setting to `None` will cache
66
- indefinitely.
67
- per_epoch(bool): Default True. By default we cache responses separately
68
- for different epochs. The general use case is that if there are
69
- multiple epochs, we should cache each response separately because
70
- scorers will aggregate across epochs. However, sometimes a response
71
- can be cached regardless of epoch if the call being made isn't under
72
- test as part of the evaluation. If False, this option allows you to
73
- bypass that and cache independently of the epoch.
74
- scopes(dict[str, str]): A dictionary of additional metadata that should
75
- be included in the cache key. This allows for more fine-grained
76
- control over the cache key generation.
61
+ `expiry`: Default "24h". The expiry time for the cache entry.
62
+ This is a string of the format "12h" for 12 hours or "1W" for a week,
63
+ etc. This is how long we will keep the cache entry, if we access it
64
+ after this point we'll clear it. Setting to `None` will cache
65
+ indefinitely.
66
+
67
+ `per_epoch`: Default True. By default we cache responses separately
68
+ for different epochs. The general use case is that if there are
69
+ multiple epochs, we should cache each response separately because
70
+ scorers will aggregate across epochs. However, sometimes a response
71
+ can be cached regardless of epoch if the call being made isn't under
72
+ test as part of the evaluation. If False, this option allows you to
73
+ bypass that and cache independently of the epoch.
74
+
75
+ `scopes`: A dictionary of additional metadata that should
76
+ be included in the cache key. This allows for more fine-grained
77
+ control over the cache key generation.
77
78
  """
78
79
 
79
80
  def __init__(
@@ -82,6 +83,14 @@ class CachePolicy:
82
83
  per_epoch: bool = True,
83
84
  scopes: dict[str, str] = {},
84
85
  ) -> None:
86
+ """Create a CachePolicy.
87
+
88
+ Args:
89
+ expiry: Expiry.
90
+ per_epoch: Per epoch
91
+ scopes: Scopes
92
+
93
+ """
85
94
  self.per_epoch = per_epoch
86
95
  self.scopes = scopes
87
96
 
@@ -236,7 +245,11 @@ def cache_fetch(entry: CacheEntry) -> ModelOutput | None:
236
245
 
237
246
 
238
247
  def cache_clear(model: str = "") -> bool:
239
- """Clear the cache directory."""
248
+ """Clear the cache directory.
249
+
250
+ Args:
251
+ model: Model to clear cache for.
252
+ """
240
253
  try:
241
254
  path = cache_path(model)
242
255
 
@@ -252,6 +265,11 @@ def cache_clear(model: str = "") -> bool:
252
265
 
253
266
 
254
267
  def cache_path(model: str = "") -> Path:
268
+ """Path to cache directory.
269
+
270
+ Args:
271
+ model: Path to cache directory for specific model.
272
+ """
255
273
  env_cache_dir = os.environ.get("INSPECT_CACHE_DIR", None)
256
274
  if env_cache_dir:
257
275
  generate_cache = Path(env_cache_dir) / "generate"
@@ -320,9 +338,9 @@ def cache_size(
320
338
  will be calculated.
321
339
 
322
340
  Args:
323
- subdirs(list[str]): List of folders to filter by, which are generally
341
+ subdirs: List of folders to filter by, which are generally
324
342
  model names. Empty directories will be ignored.
325
- files(list[str]): List of files to filter by explicitly. Note that
343
+ files: List of files to filter by explicitly. Note that
326
344
  return value groups these up by their parent directory
327
345
 
328
346
  Returns:
@@ -344,7 +362,7 @@ def cache_list_expired(filter_by: list[str] = []) -> list[Path]:
344
362
  """Returns a list of all the cached files that have passed their expiry time.
345
363
 
346
364
  Args:
347
- filter_by(list[str]): Default []. List of model names to filter by. If
365
+ filter_by: Default []. List of model names to filter by. If
348
366
  an empty list, this will search the entire cache.
349
367
  """
350
368
  expired_cache_entries = []
@@ -384,7 +402,7 @@ def cache_prune(files: list[Path] = []) -> None:
384
402
  """Delete all expired cache entries.
385
403
 
386
404
  Args:
387
- files(list[Path]): Default []. List of files to prune. If empty, this
405
+ files: List of files to prune. If empty, this
388
406
  will search the entire cache.
389
407
  """
390
408
  if not files:
@@ -133,7 +133,8 @@ async def call_tools(
133
133
  ):
134
134
  content: str | list[Content] = [result]
135
135
  elif isinstance(result, list) and (
136
- isinstance(
136
+ len(result) == 0
137
+ or isinstance(
137
138
  result[0], ContentText | ContentImage | ContentAudio | ContentVideo
138
139
  )
139
140
  ):
@@ -186,7 +187,7 @@ async def call_tools(
186
187
  view=call.view,
187
188
  pending=True,
188
189
  )
189
- event.set_task(task)
190
+ event._set_task(task)
190
191
  transcript()._event(event)
191
192
 
192
193
  # execute the tool call. if the operator cancelled the
@@ -226,7 +227,7 @@ async def call_tools(
226
227
  conversation_tool_mesage(tool_message)
227
228
 
228
229
  # update the event with the results
229
- event.set_result(
230
+ event._set_result(
230
231
  result=result_event.result,
231
232
  truncated=result_event.truncated,
232
233
  error=result_event.error,
@@ -13,8 +13,13 @@ logger = getLogger(__name__)
13
13
 
14
14
 
15
15
  class ChatMessageBase(BaseModel):
16
+ """Base class for chat messages."""
17
+
18
+ role: Literal["system", "user", "assistant", "tool"]
19
+ """Conversation role"""
20
+
16
21
  content: str | list[Content]
17
- """Content (simple string or list of string|image content)"""
22
+ """Content (simple string or list of content objects)"""
18
23
 
19
24
  source: Literal["input", "generate"] | None = Field(default=None)
20
25
  """Source of message."""
@@ -31,9 +36,6 @@ class ChatMessageBase(BaseModel):
31
36
  property returns either the plain str content, or if the
32
37
  content is a list of text and images, the text items
33
38
  concatenated together (separated by newline)
34
-
35
- Returns: Text content of `ChatMessage` If this message does
36
- not have text content then "" is returned.
37
39
  """
38
40
  if isinstance(self.content, str):
39
41
  return self.content
@@ -66,11 +68,15 @@ class ChatMessageBase(BaseModel):
66
68
 
67
69
 
68
70
  class ChatMessageSystem(ChatMessageBase):
71
+ """System chat message."""
72
+
69
73
  role: Literal["system"] = Field(default="system")
70
74
  """Conversation role."""
71
75
 
72
76
 
73
77
  class ChatMessageUser(ChatMessageBase):
78
+ """User chat message."""
79
+
74
80
  role: Literal["user"] = Field(default="user")
75
81
  """Conversation role."""
76
82
 
@@ -79,6 +85,8 @@ class ChatMessageUser(ChatMessageBase):
79
85
 
80
86
 
81
87
  class ChatMessageAssistant(ChatMessageBase):
88
+ """Assistant chat message."""
89
+
82
90
  role: Literal["assistant"] = Field(default="assistant")
83
91
  """Conversation role."""
84
92
 
@@ -112,6 +120,8 @@ class ChatMessageAssistant(ChatMessageBase):
112
120
 
113
121
 
114
122
  class ChatMessageTool(ChatMessageBase):
123
+ """Tool chat message."""
124
+
115
125
  role: Literal["tool"] = Field(default="tool")
116
126
  """Conversation role."""
117
127
 
@@ -80,7 +80,7 @@ class GenerateConfigArgs(TypedDict, total=False):
80
80
 
81
81
 
82
82
  class GenerateConfig(BaseModel):
83
- """Base class for model generation configs."""
83
+ """Model generation options."""
84
84
 
85
85
  max_retries: int | None = Field(default=None)
86
86
  """Maximum number of times to retry request (defaults to 5)."""