inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (518) hide show
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/main.py +1 -1
  8. inspect_ai/_cli/sandbox.py +4 -1
  9. inspect_ai/_cli/score.py +181 -32
  10. inspect_ai/_cli/trace.py +10 -0
  11. inspect_ai/_cli/view.py +4 -2
  12. inspect_ai/_display/core/active.py +2 -3
  13. inspect_ai/_display/core/config.py +7 -1
  14. inspect_ai/_display/textual/widgets/samples.py +4 -3
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/eval.py +104 -101
  17. inspect_ai/_eval/evalset.py +75 -75
  18. inspect_ai/_eval/loader.py +122 -12
  19. inspect_ai/_eval/registry.py +1 -1
  20. inspect_ai/_eval/run.py +14 -0
  21. inspect_ai/_eval/score.py +125 -36
  22. inspect_ai/_eval/task/log.py +105 -4
  23. inspect_ai/_eval/task/results.py +92 -38
  24. inspect_ai/_eval/task/run.py +9 -2
  25. inspect_ai/_eval/task/sandbox.py +35 -2
  26. inspect_ai/_eval/task/task.py +49 -46
  27. inspect_ai/_util/constants.py +1 -1
  28. inspect_ai/_util/content.py +8 -0
  29. inspect_ai/_util/error.py +2 -0
  30. inspect_ai/_util/file.py +15 -1
  31. inspect_ai/_util/hash.py +1 -1
  32. inspect_ai/_util/logger.py +4 -2
  33. inspect_ai/_util/registry.py +7 -1
  34. inspect_ai/_view/view.py +1 -2
  35. inspect_ai/_view/www/.vscode/extensions.json +3 -0
  36. inspect_ai/_view/www/.vscode/settings.json +8 -0
  37. inspect_ai/_view/www/App.css +97 -29
  38. inspect_ai/_view/www/README.md +1 -1
  39. inspect_ai/_view/www/dist/assets/index.css +16663 -14674
  40. inspect_ai/_view/www/dist/assets/index.js +58808 -51348
  41. inspect_ai/_view/www/dist/index.html +1 -1
  42. inspect_ai/_view/www/index.html +2 -2
  43. inspect_ai/_view/www/log-schema.json +87 -73
  44. inspect_ai/_view/www/package.json +22 -4
  45. inspect_ai/_view/www/postcss.config.cjs +8 -9
  46. inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
  47. inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
  48. inspect_ai/_view/www/src/api/api-browser.ts +2 -2
  49. inspect_ai/_view/www/src/api/api-http.ts +3 -5
  50. inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
  51. inspect_ai/_view/www/src/api/client-api.ts +4 -4
  52. inspect_ai/_view/www/src/api/index.ts +4 -4
  53. inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
  54. inspect_ai/_view/www/src/appearance/colors.ts +9 -0
  55. inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
  56. inspect_ai/_view/www/src/appearance/icons.ts +100 -0
  57. inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
  58. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
  59. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
  60. inspect_ai/_view/www/src/components/Card.css +60 -0
  61. inspect_ai/_view/www/src/components/Card.tsx +109 -0
  62. inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
  63. inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
  64. inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
  65. inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
  66. inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
  67. inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
  68. inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
  69. inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
  70. inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
  71. inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
  72. inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
  73. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
  74. inspect_ai/_view/www/src/components/FindBand.css +49 -0
  75. inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
  76. inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
  77. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
  78. inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
  79. inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
  80. inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
  81. inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
  82. inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
  83. inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
  84. inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
  85. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
  86. inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
  87. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
  88. inspect_ai/_view/www/src/components/MessageBand.css +43 -0
  89. inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
  90. inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
  91. inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
  92. inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
  93. inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
  94. inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
  95. inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
  96. inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
  97. inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
  98. inspect_ai/_view/www/src/components/ToolButton.css +3 -0
  99. inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
  100. inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
  101. inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
  102. inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
  103. inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
  104. inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
  105. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
  106. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
  107. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
  108. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
  109. inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
  110. inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
  111. inspect_ai/_view/www/src/metadata/types.ts +18 -0
  112. inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
  113. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
  114. inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
  115. inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
  116. inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
  117. inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
  118. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
  119. inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
  120. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
  121. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
  122. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
  123. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
  124. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
  125. inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
  126. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
  127. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
  128. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
  129. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
  130. inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
  131. inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
  132. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
  133. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
  134. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
  135. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
  136. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
  137. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
  138. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
  139. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
  140. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
  141. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
  142. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
  143. inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
  144. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
  145. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
  146. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
  147. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
  148. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
  149. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
  150. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
  151. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
  152. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
  153. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
  154. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
  155. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
  156. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
  157. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
  158. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
  159. inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
  160. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
  161. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
  162. inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
  163. inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
  164. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
  165. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
  166. inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
  167. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
  168. inspect_ai/_view/www/src/samples/error/error.ts +15 -0
  169. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
  170. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
  171. inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
  172. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
  173. inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
  174. inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
  175. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
  176. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
  177. inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
  178. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
  179. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
  180. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
  181. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
  182. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
  183. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
  184. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
  185. inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
  186. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
  187. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
  188. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
  189. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
  190. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
  191. inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
  192. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
  193. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
  194. inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
  195. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
  196. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
  197. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
  198. inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
  199. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
  200. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
  201. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
  202. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
  203. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
  204. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
  205. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
  206. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
  207. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
  208. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
  209. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
  210. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
  211. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
  212. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
  213. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
  214. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
  215. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
  216. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
  217. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
  218. inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
  219. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
  220. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
  221. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
  222. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
  223. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
  224. inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
  225. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
  226. inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
  227. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
  228. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
  229. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
  230. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
  231. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
  232. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
  233. inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
  234. inspect_ai/_view/www/src/types/log.d.ts +108 -19
  235. inspect_ai/_view/www/src/types/prism.d.ts +11 -0
  236. inspect_ai/_view/www/src/types.ts +71 -0
  237. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
  238. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
  239. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
  240. inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
  241. inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
  242. inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
  243. inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
  244. inspect_ai/_view/www/src/utils/attachments.ts +42 -0
  245. inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
  246. inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
  247. inspect_ai/_view/www/src/utils/debugging.ts +28 -0
  248. inspect_ai/_view/www/src/utils/dom.ts +30 -0
  249. inspect_ai/_view/www/src/utils/format.ts +194 -0
  250. inspect_ai/_view/www/src/utils/git.ts +7 -0
  251. inspect_ai/_view/www/src/utils/html.ts +6 -0
  252. inspect_ai/_view/www/src/utils/http.ts +14 -0
  253. inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
  254. inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
  255. inspect_ai/_view/www/src/utils/queue.ts +51 -0
  256. inspect_ai/_view/www/src/utils/sync.ts +114 -0
  257. inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
  258. inspect_ai/_view/www/src/utils/vscode.ts +13 -0
  259. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
  260. inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
  261. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
  262. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
  263. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
  264. inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
  265. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
  266. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
  267. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
  268. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
  269. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
  270. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
  271. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
  272. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
  273. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
  274. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
  275. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
  276. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
  277. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
  278. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
  279. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
  280. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
  281. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
  282. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
  283. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
  284. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
  285. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
  286. inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
  287. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
  288. inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
  289. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
  290. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
  291. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
  292. inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
  293. inspect_ai/_view/www/src/workspace/types.ts +10 -0
  294. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  295. inspect_ai/_view/www/tsconfig.json +23 -9
  296. inspect_ai/_view/www/vite.config.js +8 -17
  297. inspect_ai/_view/www/yarn.lock +627 -556
  298. inspect_ai/approval/_approval.py +2 -0
  299. inspect_ai/approval/_approver.py +4 -4
  300. inspect_ai/approval/_auto.py +1 -1
  301. inspect_ai/approval/_human/approver.py +3 -0
  302. inspect_ai/approval/_policy.py +5 -0
  303. inspect_ai/approval/_registry.py +2 -2
  304. inspect_ai/dataset/_dataset.py +64 -37
  305. inspect_ai/dataset/_sources/__init__.py +0 -0
  306. inspect_ai/dataset/_sources/csv.py +20 -12
  307. inspect_ai/dataset/_sources/file.py +4 -0
  308. inspect_ai/dataset/_sources/hf.py +39 -29
  309. inspect_ai/dataset/_sources/json.py +17 -9
  310. inspect_ai/log/__init__.py +2 -0
  311. inspect_ai/log/_convert.py +3 -3
  312. inspect_ai/log/_file.py +24 -9
  313. inspect_ai/log/_log.py +101 -13
  314. inspect_ai/log/_message.py +4 -2
  315. inspect_ai/log/_recorders/file.py +4 -0
  316. inspect_ai/log/_recorders/json.py +5 -7
  317. inspect_ai/log/_recorders/recorder.py +3 -0
  318. inspect_ai/log/_transcript.py +19 -8
  319. inspect_ai/model/__init__.py +2 -0
  320. inspect_ai/model/_cache.py +39 -21
  321. inspect_ai/model/_call_tools.py +4 -3
  322. inspect_ai/model/_chat_message.py +14 -4
  323. inspect_ai/model/_generate_config.py +1 -1
  324. inspect_ai/model/_model.py +31 -24
  325. inspect_ai/model/_model_output.py +14 -1
  326. inspect_ai/model/_openai.py +10 -18
  327. inspect_ai/model/_providers/anthropic.py +3 -3
  328. inspect_ai/model/_providers/google.py +9 -5
  329. inspect_ai/model/_providers/openai.py +5 -9
  330. inspect_ai/model/_providers/openai_o1.py +3 -5
  331. inspect_ai/model/_providers/openrouter.py +86 -0
  332. inspect_ai/model/_providers/providers.py +11 -0
  333. inspect_ai/scorer/__init__.py +6 -1
  334. inspect_ai/scorer/_answer.py +7 -7
  335. inspect_ai/scorer/_classification.py +38 -18
  336. inspect_ai/scorer/_common.py +2 -8
  337. inspect_ai/scorer/_match.py +4 -5
  338. inspect_ai/scorer/_metric.py +87 -28
  339. inspect_ai/scorer/_metrics/__init__.py +3 -3
  340. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  341. inspect_ai/scorer/_metrics/mean.py +3 -17
  342. inspect_ai/scorer/_metrics/std.py +111 -30
  343. inspect_ai/scorer/_model.py +12 -12
  344. inspect_ai/scorer/_pattern.py +3 -3
  345. inspect_ai/scorer/_reducer/reducer.py +36 -21
  346. inspect_ai/scorer/_reducer/registry.py +2 -2
  347. inspect_ai/scorer/_reducer/types.py +7 -1
  348. inspect_ai/scorer/_score.py +11 -1
  349. inspect_ai/scorer/_scorer.py +110 -16
  350. inspect_ai/solver/__init__.py +1 -1
  351. inspect_ai/solver/_basic_agent.py +19 -22
  352. inspect_ai/solver/_bridge/__init__.py +0 -3
  353. inspect_ai/solver/_bridge/bridge.py +3 -3
  354. inspect_ai/solver/_chain.py +1 -2
  355. inspect_ai/solver/_critique.py +3 -3
  356. inspect_ai/solver/_fork.py +2 -2
  357. inspect_ai/solver/_human_agent/__init__.py +0 -0
  358. inspect_ai/solver/_human_agent/agent.py +5 -8
  359. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  360. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  361. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  362. inspect_ai/solver/_multiple_choice.py +38 -26
  363. inspect_ai/solver/_prompt.py +7 -7
  364. inspect_ai/solver/_solver.py +53 -52
  365. inspect_ai/solver/_task_state.py +80 -69
  366. inspect_ai/solver/_use_tools.py +9 -9
  367. inspect_ai/tool/__init__.py +4 -1
  368. inspect_ai/tool/_tool.py +43 -14
  369. inspect_ai/tool/_tool_call.py +6 -2
  370. inspect_ai/tool/_tool_choice.py +3 -1
  371. inspect_ai/tool/_tool_def.py +10 -8
  372. inspect_ai/tool/_tool_params.py +24 -0
  373. inspect_ai/tool/_tool_with.py +7 -7
  374. inspect_ai/tool/_tools/__init__.py +0 -0
  375. inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
  376. inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
  377. inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
  378. inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
  379. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  380. inspect_ai/tool/_tools/_execute.py +23 -11
  381. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  382. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  383. inspect_ai/tool/_tools/_web_search.py +7 -5
  384. inspect_ai/tool/beta.py +3 -0
  385. inspect_ai/util/_concurrency.py +3 -3
  386. inspect_ai/util/_panel.py +2 -0
  387. inspect_ai/util/_resource.py +12 -12
  388. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  389. inspect_ai/util/_sandbox/docker/config.py +2 -1
  390. inspect_ai/util/_sandbox/docker/docker.py +42 -86
  391. inspect_ai/util/_sandbox/docker/service.py +100 -0
  392. inspect_ai/util/_sandbox/environment.py +99 -96
  393. inspect_ai/util/_sandbox/self_check.py +124 -16
  394. inspect_ai/util/_subprocess.py +5 -3
  395. inspect_ai/util/_subtask.py +15 -16
  396. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
  397. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
  398. inspect_ai-0.3.64.dist-info/RECORD +625 -0
  399. inspect_ai/_view/www/src/Register.mjs +0 -3
  400. inspect_ai/_view/www/src/Types.mjs +0 -38
  401. inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
  402. inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
  403. inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
  404. inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
  405. inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
  406. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
  407. inspect_ai/_view/www/src/components/Card.mjs +0 -126
  408. inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
  409. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
  410. inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
  411. inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
  412. inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
  413. inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
  414. inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
  415. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
  416. inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
  417. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
  418. inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
  419. inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
  420. inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
  421. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
  422. inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
  423. inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
  424. inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
  425. inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
  426. inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
  427. inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
  428. inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
  429. inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
  430. inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
  431. inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
  432. inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
  433. inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
  434. inspect_ai/_view/www/src/components/Tools.mjs +0 -376
  435. inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
  436. inspect_ai/_view/www/src/components/ansi-output.js +0 -932
  437. inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
  438. inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
  439. inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
  440. inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
  441. inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
  442. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
  443. inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
  444. inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
  445. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
  446. inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
  447. inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
  448. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
  449. inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
  450. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
  451. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
  452. inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
  453. inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
  454. inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
  455. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
  456. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
  457. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
  458. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
  459. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
  460. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
  461. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
  462. inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
  463. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
  464. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
  465. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
  466. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
  467. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
  468. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
  469. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
  470. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
  471. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
  472. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
  473. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
  474. inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
  475. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
  476. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
  477. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
  478. inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
  479. inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
  480. inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
  481. inspect_ai/_view/www/src/utils/Format.mjs +0 -260
  482. inspect_ai/_view/www/src/utils/Git.mjs +0 -12
  483. inspect_ai/_view/www/src/utils/Html.mjs +0 -21
  484. inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
  485. inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
  486. inspect_ai/_view/www/src/utils/http.mjs +0 -18
  487. inspect_ai/_view/www/src/utils/queue.mjs +0 -67
  488. inspect_ai/_view/www/src/utils/sync.mjs +0 -101
  489. inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
  490. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
  491. inspect_ai/tool/beta/__init__.py +0 -5
  492. inspect_ai-0.3.62.dist-info/RECORD +0 -481
  493. /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
  494. /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
  495. /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
  496. /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
  497. /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
  498. /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
  499. /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
  500. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
  501. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
  502. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
  503. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
  504. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
  505. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  506. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
  507. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
  508. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
  509. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
  510. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
  511. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
  512. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
  513. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
  514. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
  515. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
  516. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
  517. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
  518. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
@@ -17,6 +17,8 @@ Possible values:
17
17
 
18
18
 
19
19
  class Approval(BaseModel):
20
+ """Approval details (decision, explanation, etc.)"""
21
+
20
22
  decision: ApprovalDecision
21
23
  """Approval decision."""
22
24
 
@@ -20,10 +20,10 @@ class Approver(Protocol):
20
20
  Approve or reject a tool call.
21
21
 
22
22
  Args:
23
- message (str): Message genreated by the model along with the tool call.
24
- call (ToolCall): The tool call to be approved.
25
- view (ToolCallView): Custom rendering of tool context and call.
26
- state (state | None): The current task state, if available.
23
+ message: Message generated by the model along with the tool call.
24
+ call: The tool call to be approved.
25
+ view: Custom rendering of tool context and call.
26
+ state: The current task state, if available.
27
27
 
28
28
  Returns:
29
29
  Approval: An Approval object containing the decision and explanation.
@@ -11,7 +11,7 @@ def auto_approver(decision: ApprovalDecision = "approve") -> Approver:
11
11
  """Automatically apply a decision to tool calls.
12
12
 
13
13
  Args:
14
- decision (ApprovalDecision): Decision to apply.
14
+ decision: Decision to apply.
15
15
 
16
16
  Returns:
17
17
  Approver: Auto approver.
@@ -14,6 +14,9 @@ def human_approver(
14
14
  ) -> Approver:
15
15
  """Interactive human approver.
16
16
 
17
+ Args:
18
+ choices: Choices to present to human.
19
+
17
20
  Returns:
18
21
  Approver: Interactive human approver.
19
22
  """
@@ -20,8 +20,13 @@ from ._call import call_approver, record_approval
20
20
 
21
21
  @dataclass
22
22
  class ApprovalPolicy:
23
+ """Policy mapping approvers to tools."""
24
+
23
25
  approver: Approver
26
+ """Approver for policy."""
27
+
24
28
  tools: str | list[str]
29
+ """Tools to use this approver for (can be full tool names or globs)."""
25
30
 
26
31
 
27
32
  def policy_approver(policies: str | list[ApprovalPolicy]) -> Approver:
@@ -31,11 +31,11 @@ def approver(*args: Any, name: str | None = None, **attribs: Any) -> Any:
31
31
  Args:
32
32
  *args: Function returning `Approver` targeted by
33
33
  plain approver decorator without attributes (e.g. `@approver`)
34
- name (str | None):
34
+ name:
35
35
  Optional name for approver. If the decorator has no name
36
36
  argument then the name of the function
37
37
  will be used to automatically assign a name.
38
- **attribs: (dict[str,Any]): Additional approver attributes.
38
+ **attribs: Additional approver attributes.
39
39
 
40
40
  Returns:
41
41
  Approver with registry attributes.
@@ -27,6 +27,8 @@ MT = TypeVar("MT", bound=BaseModel)
27
27
 
28
28
 
29
29
  class Sample(BaseModel):
30
+ r"""Sample for an evaluation task."""
31
+
30
32
  def __init__(
31
33
  self,
32
34
  input: str | list[ChatMessage],
@@ -38,22 +40,22 @@ class Sample(BaseModel):
38
40
  files: dict[str, str] | None = None,
39
41
  setup: str | None = None,
40
42
  ) -> None:
41
- r"""Sample to be used in an evaluation task.
43
+ r"""Create a Sample.
42
44
 
43
45
  Args:
44
- input (str | list[ChatMessage]): The input to be submitted to the model.
45
- choices (list[str] | None): Optional. List of available answer choices
46
- (used only for multiple-choice evals).
47
- target (str | list[str]): Optional. Ideal target output. May be a literal value
46
+ input: The input to be submitted to the model.
47
+ choices: Optional. List of available answer choices
48
+ (used only for multiple-choice evals).
49
+ target: Optional. Ideal target output. May be a literal value
48
50
  or narrative text to be used by a model grader.
49
- id (int | str | None): Optional. Unique identifier for sample.
50
- metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample.
51
- sandbox (SandboxEnvironmentType | None): Sandbox environment type
52
- (or optionally a str or tuple with a shorthand spec)
53
- files (dict[str, str] | None): Optional. Files that go along with the sample (copied to
54
- SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL).
55
- setup (str | None): Optional. Setup script to run for sample (run
56
- within default SandboxEnvironment).
51
+ id: Optional. Unique identifier for sample.
52
+ metadata: Optional. Arbitrary metadata associated with the sample.
53
+ sandbox (SandboxEnvironmentType | None): Sandbox environment type (or optionally a str or tuple with a shorthand spec)
54
+ sandbox: Optional. Sandbox specification for this sample.
55
+ files: Optional. Files that go along with the sample (copied to
56
+ SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL).
57
+ setup: Optional. Setup script to run for sample (run
58
+ within default SandboxEnvironment).
57
59
  """
58
60
  super().__init__(
59
61
  input=input,
@@ -156,14 +158,6 @@ class Dataset(Sequence[Sample], abc.ABC):
156
158
  @abc.abstractmethod
157
159
  def __len__(self) -> int: ...
158
160
 
159
- @abc.abstractmethod
160
- def shuffle(self, seed: int | None = None) -> None:
161
- """Shuffle the order of the dataset (in place).
162
-
163
- Args:
164
- seed: (int | None): Random seed for shuffling (optional).
165
- """
166
-
167
161
  @abc.abstractmethod
168
162
  def sort(
169
163
  self,
@@ -177,8 +171,8 @@ class Dataset(Sequence[Sample], abc.ABC):
177
171
  The key function defaults to measuring the length of the sample's input field.
178
172
 
179
173
  Args:
180
- reverse (bool): if true, sort in descending order. Defaults to False.
181
- key (Callable[[Any], Any]): a callable mapping each item to a numeric value (optional, defaults to sample_input_len).
174
+ reverse: If `True`, sort in descending order. Defaults to False.
175
+ key: a callable mapping each item to a numeric value (optional, defaults to sample_input_len).
182
176
  """
183
177
 
184
178
  @abc.abstractmethod
@@ -188,28 +182,33 @@ class Dataset(Sequence[Sample], abc.ABC):
188
182
  """Filter the dataset using a predicate.
189
183
 
190
184
  Args:
191
- predicate (Callable[[Sample], bool]): Filtering function.
192
- name (str | None): Name for filtered dataset (optional).
185
+ predicate: Filtering function.
186
+ name: Name for filtered dataset (optional).
193
187
 
194
188
  Returns:
195
189
  Filtered dataset.
196
190
  """
197
191
 
192
+ @abc.abstractmethod
193
+ def shuffle(self, seed: int | None = None) -> None:
194
+ """Shuffle the order of the dataset (in place).
195
+
196
+ Args:
197
+ seed: Random seed for shuffling (optional).
198
+ """
199
+
200
+ @abc.abstractmethod
201
+ def shuffle_choices(self, seed: int | None = None) -> None:
202
+ """Shuffle the order of the choices with each sample.
203
+
204
+ Args:
205
+ seed: Random seed for shuffling (optional).
206
+ """
207
+
198
208
 
199
209
  @dataclass
200
210
  class FieldSpec:
201
- r"""Specification for mapping data source fields to sample fields.
202
-
203
- Args:
204
- input (str): Name of the field containing the sample input.
205
- target (str): Name of the field containing the sample target.
206
- choices (str): Optional. Name of field containing the list of answer choices.
207
- id (str): Optional. Unique identifier for the sample.
208
- metadata (list[str] | None): List of additional field names that should be read as metadata.
209
- sandbox (str): Optional. Sandbox type along with optional config file
210
- files (str): Optional. Files that go along with the sample.
211
- setup (str): Optional. Setup script to run for sample .
212
- """
211
+ r"""Specification for mapping data source fields to sample fields."""
213
212
 
214
213
  input: str = field(default="input")
215
214
  """Name of the field containing the sample input."""
@@ -315,6 +314,34 @@ class MemoryDataset(Dataset):
315
314
  random.shuffle(self.samples)
316
315
  self._shuffled = True
317
316
 
317
+ @override
318
+ def shuffle_choices(self, seed: int | None = None) -> None:
319
+ rand = random.Random(seed)
320
+ for sample in self.samples:
321
+ if not sample.choices:
322
+ continue
323
+ # The original positions
324
+ positions = list(range(len(sample.choices)))
325
+
326
+ # Shuffle the choices
327
+ rand.shuffle(positions)
328
+ shuffled_choices = [sample.choices[i] for i in positions]
329
+
330
+ # Map of original position / target letter
331
+ position_map = {i: chr(65 + new_i) for new_i, i in enumerate(positions)}
332
+
333
+ # Update to the shuffled choices and target
334
+ sample.choices = shuffled_choices
335
+ sample.target = self._remap_target(sample.target, position_map=position_map)
336
+
337
+ def _remap_target(
338
+ self, target: str | list[str], position_map: dict[int, str]
339
+ ) -> str | list[str]:
340
+ if isinstance(target, list):
341
+ return [position_map[ord(t) - 65] for t in target]
342
+ else:
343
+ return position_map[ord(target) - 65]
344
+
318
345
  @override
319
346
  def sort(
320
347
  self,
File without changes
@@ -23,6 +23,7 @@ def csv_dataset(
23
23
  auto_id: bool = False,
24
24
  shuffle: bool = False,
25
25
  seed: int | None = None,
26
+ shuffle_choices: bool | int | None = None,
26
27
  limit: int | None = None,
27
28
  dialect: str = "unix",
28
29
  encoding: str = "utf-8",
@@ -34,29 +35,30 @@ def csv_dataset(
34
35
  r"""Read dataset from CSV file.
35
36
 
36
37
  Args:
37
- csv_file (str): Path to CSV file. Can be a local filesystem path,
38
+ csv_file: Path to CSV file. Can be a local filesystem path,
38
39
  a path to an S3 bucket (e.g. "s3://my-bucket"), or an HTTPS URL.
39
40
  Use `fs_options` to pass arguments through to the `S3FileSystem` constructor.
40
- sample_fields (FieldSpec | RecordToSample): Method of mapping underlying
41
+ sample_fields: Method of mapping underlying
41
42
  fields in the data source to Sample objects. Pass `None` if the data is already
42
43
  stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a
43
44
  `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
44
45
  handle mapping with a custom function that returns one or more samples.
45
- auto_id (bool): Assign an auto-incrementing ID for each sample.
46
- shuffle (bool): Randomly shuffle the dataset order.
47
- seed: (int | None): Seed used for random shuffle.
48
- limit (int | None): Limit the number of records to read.
49
- dialect (str): CSV dialect ("unix", "excel" or"excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details
50
- encoding (str): Text encoding for file (defaults to "utf-8").
51
- name (str): Optional name for dataset (for logging). If not specified,
46
+ auto_id: Assign an auto-incrementing ID for each sample.
47
+ shuffle: Randomly shuffle the dataset order.
48
+ seed: Seed used for random shuffle.
49
+ shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
50
+ limit: Limit the number of records to read.
51
+ dialect: CSV dialect ("unix", "excel" or "excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details
52
+ encoding: Text encoding for file (defaults to "utf-8").
53
+ name: Optional name for dataset (for logging). If not specified,
52
54
  defaults to the stem of the filename
53
- fs_options (dict[str, Any]): Optional. Additional arguments to pass through
55
+ fs_options: Optional. Additional arguments to pass through
54
56
  to the filesystem provider (e.g. `S3FileSystem`). Use `{"anon": True }`
55
57
  if you are accessing a public S3 bucket with no credentials.
56
- fieldnames (list[str] | None): Optional. A list of fieldnames to use for the CSV.
58
+ fieldnames: Optional. A list of fieldnames to use for the CSV.
57
59
  If None, the values in the first row of the file will be used as the fieldnames.
58
60
  Useful for files without a header.
59
- delimiter (str): Optional. The delimiter to use when parsing the file. Defaults to ",".
61
+ delimiter: Optional. The delimiter to use when parsing the file. Defaults to ",".
60
62
 
61
63
  Returns:
62
64
  Dataset read from CSV file.
@@ -86,6 +88,12 @@ def csv_dataset(
86
88
  if shuffle:
87
89
  dataset.shuffle(seed=seed)
88
90
 
91
+ # shuffle choices, if requested
92
+ if isinstance(shuffle_choices, int):
93
+ dataset.shuffle_choices(seed=shuffle_choices)
94
+ elif shuffle_choices is True:
95
+ dataset.shuffle_choices()
96
+
89
97
  # limit if requested
90
98
  if limit:
91
99
  return dataset[0:limit]
@@ -16,6 +16,7 @@ def file_dataset(
16
16
  auto_id: bool = False,
17
17
  shuffle: bool = False,
18
18
  seed: int | None = None,
19
+ shuffle_choices: bool | int | None = None,
19
20
  limit: int | None = None,
20
21
  dialect: str = "unix",
21
22
  encoding: str = "utf-8",
@@ -40,6 +41,7 @@ def file_dataset(
40
41
  auto_id (bool): Assign an auto-incrementing ID for each sample.
41
42
  shuffle (bool): Randomly shuffle the dataset order.
42
43
  seed: (int | None): Seed used for random shuffle.
44
+ shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
43
45
  limit (int | None): Limit the number of records to read.
44
46
  dialect (str): CSV dialect ("unix" or "excel", defaults to "unix"). Only
45
47
  applies to reading CSV files.
@@ -66,6 +68,7 @@ def file_dataset(
66
68
  auto_id=auto_id,
67
69
  shuffle=shuffle,
68
70
  seed=seed,
71
+ shuffle_choices=shuffle_choices,
69
72
  limit=limit,
70
73
  encoding=encoding,
71
74
  name=name,
@@ -78,6 +81,7 @@ def file_dataset(
78
81
  auto_id=auto_id,
79
82
  shuffle=shuffle,
80
83
  seed=seed,
84
+ shuffle_choices=shuffle_choices,
81
85
  limit=limit,
82
86
  dialect=dialect,
83
87
  encoding=encoding,
@@ -29,6 +29,7 @@ def hf_dataset(
29
29
  auto_id: bool = False,
30
30
  shuffle: bool = False,
31
31
  seed: int | None = None,
32
+ shuffle_choices: bool | int | None = None,
32
33
  limit: int | None = None,
33
34
  trust: bool = False,
34
35
  cached: bool = True,
@@ -40,35 +41,36 @@ def hf_dataset(
40
41
  `datasets` package, including remote datasets on Hugging Face Hub.
41
42
 
42
43
  Args:
43
- path (str): Path or name of the dataset. Depending on path, the dataset
44
- builder that is used comes from a generic dataset script (JSON, CSV,
45
- Parquet, text etc.) or from the dataset script (a python file) inside
46
- the dataset directory.
47
- split (str): Which split of the data to load.
48
- name (str | None): Name of the dataset configuration.
49
- data_dir (str | None): data_dir of the dataset configuration
50
- to read data from.
51
- revision (str | None): Specific revision to load (e.g. "main", a branch
52
- name, or a specific commit SHA). When using `revision` the `cached` option
53
- is ignored and datasets are revalidated on Hugging Face before loading.
54
- sample_fields (FieldSpec | RecordToSample): Method of mapping underlying
55
- fields in the data source to Sample objects. Pass `None` if the data is already
56
- stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a
57
- `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
44
+ path: Path or name of the dataset. Depending on path, the dataset
45
+ builder that is used comes from a generic dataset script (JSON, CSV,
46
+ Parquet, text etc.) or from the dataset script (a python file) inside
47
+ the dataset directory.
48
+ split: Which split of the data to load.
49
+ name: Name of the dataset configuration.
50
+ data_dir: data_dir of the dataset configuration
51
+ to read data from.
52
+ revision: Specific revision to load (e.g. "main", a branch
53
+ name, or a specific commit SHA). When using `revision` the `cached` option
54
+ is ignored and datasets are revalidated on Hugging Face before loading.
55
+ sample_fields: Method of mapping underlying
56
+ fields in the data source to Sample objects. Pass `None` if the data is already
57
+ stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a
58
+ `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
58
59
  handle mapping with a custom function that returns one or more samples.
59
- auto_id (bool): Assign an auto-incrementing ID for each sample.
60
- shuffle (bool): Randomly shuffle the dataset order.
61
- seed: (int | None): Seed used for random shuffle.
62
- limit (int | None): Limit the number of records to read.
63
- trust (bool): Whether or not to allow for datasets defined on the Hub
64
- using a dataset script. This option should only be set to True for
65
- repositories you trust and in which you have read the code, as it
66
- will execute code present on the Hub on your local machine.
67
- cached (bool): By default, datasets are read once from HuggingFace
68
- Hub and then cached for future reads. Pass `cached=False` to force
69
- re-reading the dataset from Hugging Face. Ignored when the `revision`
70
- option is specified.
71
- **kwargs (dict[str, Any]): Additional arguments to pass through to the
60
+ auto_id: Assign an auto-incrementing ID for each sample.
61
+ shuffle: Randomly shuffle the dataset order.
62
+ seed: Seed used for random shuffle.
63
+ shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
64
+ limit: Limit the number of records to read.
65
+ trust: Whether or not to allow for datasets defined on the Hub
66
+ using a dataset script. This option should only be set to True for
67
+ repositories you trust and in which you have read the code, as it
68
+ will execute code present on the Hub on your local machine.
69
+ cached: By default, datasets are read once from HuggingFace
70
+ Hub and then cached for future reads. Pass `cached=False` to force
71
+ re-reading the dataset from Hugging Face. Ignored when the `revision`
72
+ option is specified.
73
+ **kwargs (dict[str, Any]): Additional arguments to pass through to the
72
74
  `load_dataset` function of the `datasets` package.
73
75
 
74
76
  Returns:
@@ -117,8 +119,16 @@ def hf_dataset(
117
119
  dataset = dataset.select(range(limit))
118
120
 
119
121
  # return the dataset
120
- return MemoryDataset(
122
+ memory_dataset = MemoryDataset(
121
123
  samples=data_to_samples(dataset.to_list(), data_to_sample, auto_id),
122
124
  name=Path(path).stem if Path(path).exists() else path,
123
125
  location=path,
124
126
  )
127
+
128
+ # maybe shuffle the choices
129
+ if isinstance(shuffle_choices, int):
130
+ memory_dataset.shuffle_choices(seed=shuffle_choices)
131
+ elif shuffle_choices is True:
132
+ memory_dataset.shuffle_choices()
133
+
134
+ return memory_dataset
@@ -25,6 +25,7 @@ def json_dataset(
25
25
  auto_id: bool = False,
26
26
  shuffle: bool = False,
27
27
  seed: int | None = None,
28
+ shuffle_choices: bool | int | None = None,
28
29
  limit: int | None = None,
29
30
  encoding: str = "utf-8",
30
31
  name: str | None = None,
@@ -38,22 +39,23 @@ def json_dataset(
38
39
  the `sample_fields` argument.
39
40
 
40
41
  Args:
41
- json_file (str): Path to JSON file. Can be a local filesystem path or
42
+ json_file: Path to JSON file. Can be a local filesystem path or
42
43
  a path to an S3 bucket (e.g. "s3://my-bucket"). Use `fs_options`
43
44
  to pass arguments through to the `S3FileSystem` constructor.
44
- sample_fields (FieldSpec | RecordToSample): Method of mapping underlying
45
+ sample_fields: Method of mapping underlying
45
46
  fields in the data source to `Sample` objects. Pass `None` if the data is already
46
47
  stored in `Sample` form (i.e. object with "input" and "target" fields); Pass a
47
48
  `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to
48
49
  handle mapping with a custom function that returns one or more samples.
49
- auto_id (bool): Assign an auto-incrementing ID for each sample.
50
- shuffle (bool): Randomly shuffle the dataset order.
51
- seed: (int | None): Seed used for random shuffle.
52
- limit (int | None): Limit the number of records to read.
53
- encoding (str): Text encoding for file (defaults to "utf-8").
54
- name (str): Optional name for dataset (for logging). If not specified,
50
+ auto_id: Assign an auto-incrementing ID for each sample.
51
+ shuffle: Randomly shuffle the dataset order.
52
+ seed: Seed used for random shuffle.
53
+ shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling.
54
+ limit: Limit the number of records to read.
55
+ encoding: Text encoding for file (defaults to "utf-8").
56
+ name: Optional name for dataset (for logging). If not specified,
55
57
  defaults to the stem of the filename.
56
- fs_options (dict[str, Any]): Optional. Additional arguments to pass through
58
+ fs_options: Optional. Additional arguments to pass through
57
59
  to the filesystem provider (e.g. `S3FileSystem`). Use `{"anon": True }`
58
60
  if you are accessing a public S3 bucket with no credentials.
59
61
 
@@ -86,6 +88,12 @@ def json_dataset(
86
88
  if shuffle:
87
89
  dataset.shuffle(seed=seed)
88
90
 
91
+ # shuffle choices, if requested
92
+ if isinstance(shuffle_choices, int):
93
+ dataset.shuffle_choices(seed=shuffle_choices)
94
+ elif shuffle_choices is True:
95
+ dataset.shuffle_choices()
96
+
89
97
  # limit if requested
90
98
  if limit:
91
99
  return dataset[0:limit]
@@ -22,6 +22,7 @@ from ._log import (
22
22
  EvalResults,
23
23
  EvalRevision,
24
24
  EvalSample,
25
+ EvalSampleLimit,
25
26
  EvalSampleReductions,
26
27
  EvalSampleScore,
27
28
  EvalScore,
@@ -61,6 +62,7 @@ __all__ = [
61
62
  "EvalResults",
62
63
  "EvalRevision",
63
64
  "EvalSample",
65
+ "EvalSampleLimit",
64
66
  "EvalSampleScore",
65
67
  "EvalSampleReductions",
66
68
  "EvalScore",
@@ -20,12 +20,12 @@ def convert_eval_logs(
20
20
 
21
21
  Args:
22
22
  path (str): Path to source log file(s). Should be either a single
23
- log file or a directory containing log files.
23
+ log file or a directory containing log files.
24
24
  to (Literal["eval", "json"]): Format to convert to. If a file is
25
- already in the target format it will just be copied to the output dir.
25
+ already in the target format it will just be copied to the output dir.
26
26
  output_dir (str): Output directory to write converted log file(s) to.
27
27
  overwrite (bool): Overwrite existing log files (defaults to `False`,
28
- raising an error if the output file path already exists).
28
+ raising an error if the output file path already exists).
29
29
  """
30
30
  from inspect_ai._display import display
31
31
 
inspect_ai/log/_file.py CHANGED
@@ -3,6 +3,7 @@ import re
3
3
  from logging import getLogger
4
4
  from typing import Any, Callable, Generator, Literal, cast
5
5
 
6
+ from pydantic import BaseModel
6
7
  from pydantic_core import to_json
7
8
 
8
9
  from inspect_ai._util._async import run_coroutine
@@ -22,7 +23,21 @@ from ._recorders import recorder_type_for_format, recorder_type_for_location
22
23
  logger = getLogger(__name__)
23
24
 
24
25
 
25
- class EvalLogInfo(FileInfo):
26
+ class EvalLogInfo(BaseModel):
27
+ """File info and task identifiers for eval log."""
28
+
29
+ name: str
30
+ """Name of file."""
31
+
32
+ type: str
33
+ """Type of file (file or directory)"""
34
+
35
+ size: int
36
+ """File size in bytes."""
37
+
38
+ mtime: float | None
39
+ """File modification time (None if the file is a directory on S3)."""
40
+
26
41
  task: str
27
42
  """Task name."""
28
43
 
@@ -231,7 +246,7 @@ def write_log_dir_manifest(
231
246
 
232
247
 
233
248
  def read_eval_log(
234
- log_file: str | FileInfo,
249
+ log_file: str | EvalLogInfo,
235
250
  header_only: bool = False,
236
251
  resolve_attachments: bool = False,
237
252
  format: Literal["eval", "json", "auto"] = "auto",
@@ -241,7 +256,7 @@ def read_eval_log(
241
256
  Args:
242
257
  log_file (str | EvalLogInfo): Log file to read.
243
258
  header_only (bool): Read only the header (i.e. exclude
244
- the "samples" and "logging" fields). Defaults to False.
259
+ the "samples" and "logging" fields). Defaults to False.
245
260
  resolve_attachments (bool): Resolve attachments (e.g. images)
246
261
  to their full content.
247
262
  format (Literal["eval", "json", "auto"]): Read from format
@@ -256,7 +271,7 @@ def read_eval_log(
256
271
 
257
272
 
258
273
  async def read_eval_log_async(
259
- log_file: str | FileInfo,
274
+ log_file: str | EvalLogInfo,
260
275
  header_only: bool = False,
261
276
  resolve_attachments: bool = False,
262
277
  format: Literal["eval", "json", "auto"] = "auto",
@@ -304,13 +319,13 @@ async def read_eval_log_async(
304
319
 
305
320
 
306
321
  def read_eval_log_headers(
307
- log_files: list[str] | list[FileInfo] | list[EvalLogInfo],
322
+ log_files: list[str] | list[EvalLogInfo],
308
323
  ) -> list[EvalLog]:
309
324
  return run_coroutine(read_eval_log_headers_async(log_files))
310
325
 
311
326
 
312
327
  async def read_eval_log_headers_async(
313
- log_files: list[str] | list[FileInfo] | list[EvalLogInfo],
328
+ log_files: list[str] | list[EvalLogInfo],
314
329
  ) -> list[EvalLog]:
315
330
  return [
316
331
  await read_eval_log_async(log_file, header_only=True) for log_file in log_files
@@ -318,7 +333,7 @@ async def read_eval_log_headers_async(
318
333
 
319
334
 
320
335
  def read_eval_log_sample(
321
- log_file: str | FileInfo,
336
+ log_file: str | EvalLogInfo,
322
337
  id: int | str,
323
338
  epoch: int = 1,
324
339
  resolve_attachments: bool = False,
@@ -347,7 +362,7 @@ def read_eval_log_sample(
347
362
 
348
363
 
349
364
  async def read_eval_log_sample_async(
350
- log_file: str | FileInfo,
365
+ log_file: str | EvalLogInfo,
351
366
  id: int | str,
352
367
  epoch: int = 1,
353
368
  resolve_attachments: bool = False,
@@ -386,7 +401,7 @@ async def read_eval_log_sample_async(
386
401
 
387
402
 
388
403
  def read_eval_log_samples(
389
- log_file: str | FileInfo,
404
+ log_file: str | EvalLogInfo,
390
405
  all_samples_required: bool = True,
391
406
  resolve_attachments: bool = False,
392
407
  format: Literal["eval", "json", "auto"] = "auto",