inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (518)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/main.py +1 -1
  8. inspect_ai/_cli/sandbox.py +4 -1
  9. inspect_ai/_cli/score.py +181 -32
  10. inspect_ai/_cli/trace.py +10 -0
  11. inspect_ai/_cli/view.py +4 -2
  12. inspect_ai/_display/core/active.py +2 -3
  13. inspect_ai/_display/core/config.py +7 -1
  14. inspect_ai/_display/textual/widgets/samples.py +4 -3
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/eval.py +104 -101
  17. inspect_ai/_eval/evalset.py +75 -75
  18. inspect_ai/_eval/loader.py +122 -12
  19. inspect_ai/_eval/registry.py +1 -1
  20. inspect_ai/_eval/run.py +14 -0
  21. inspect_ai/_eval/score.py +125 -36
  22. inspect_ai/_eval/task/log.py +105 -4
  23. inspect_ai/_eval/task/results.py +92 -38
  24. inspect_ai/_eval/task/run.py +9 -2
  25. inspect_ai/_eval/task/sandbox.py +35 -2
  26. inspect_ai/_eval/task/task.py +49 -46
  27. inspect_ai/_util/constants.py +1 -1
  28. inspect_ai/_util/content.py +8 -0
  29. inspect_ai/_util/error.py +2 -0
  30. inspect_ai/_util/file.py +15 -1
  31. inspect_ai/_util/hash.py +1 -1
  32. inspect_ai/_util/logger.py +4 -2
  33. inspect_ai/_util/registry.py +7 -1
  34. inspect_ai/_view/view.py +1 -2
  35. inspect_ai/_view/www/.vscode/extensions.json +3 -0
  36. inspect_ai/_view/www/.vscode/settings.json +8 -0
  37. inspect_ai/_view/www/App.css +97 -29
  38. inspect_ai/_view/www/README.md +1 -1
  39. inspect_ai/_view/www/dist/assets/index.css +16663 -14674
  40. inspect_ai/_view/www/dist/assets/index.js +58808 -51348
  41. inspect_ai/_view/www/dist/index.html +1 -1
  42. inspect_ai/_view/www/index.html +2 -2
  43. inspect_ai/_view/www/log-schema.json +87 -73
  44. inspect_ai/_view/www/package.json +22 -4
  45. inspect_ai/_view/www/postcss.config.cjs +8 -9
  46. inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
  47. inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
  48. inspect_ai/_view/www/src/api/api-browser.ts +2 -2
  49. inspect_ai/_view/www/src/api/api-http.ts +3 -5
  50. inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
  51. inspect_ai/_view/www/src/api/client-api.ts +4 -4
  52. inspect_ai/_view/www/src/api/index.ts +4 -4
  53. inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
  54. inspect_ai/_view/www/src/appearance/colors.ts +9 -0
  55. inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
  56. inspect_ai/_view/www/src/appearance/icons.ts +100 -0
  57. inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
  58. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
  59. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
  60. inspect_ai/_view/www/src/components/Card.css +60 -0
  61. inspect_ai/_view/www/src/components/Card.tsx +109 -0
  62. inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
  63. inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
  64. inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
  65. inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
  66. inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
  67. inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
  68. inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
  69. inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
  70. inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
  71. inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
  72. inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
  73. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
  74. inspect_ai/_view/www/src/components/FindBand.css +49 -0
  75. inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
  76. inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
  77. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
  78. inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
  79. inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
  80. inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
  81. inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
  82. inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
  83. inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
  84. inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
  85. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
  86. inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
  87. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
  88. inspect_ai/_view/www/src/components/MessageBand.css +43 -0
  89. inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
  90. inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
  91. inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
  92. inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
  93. inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
  94. inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
  95. inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
  96. inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
  97. inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
  98. inspect_ai/_view/www/src/components/ToolButton.css +3 -0
  99. inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
  100. inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
  101. inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
  102. inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
  103. inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
  104. inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
  105. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
  106. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
  107. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
  108. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
  109. inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
  110. inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
  111. inspect_ai/_view/www/src/metadata/types.ts +18 -0
  112. inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
  113. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
  114. inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
  115. inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
  116. inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
  117. inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
  118. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
  119. inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
  120. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
  121. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
  122. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
  123. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
  124. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
  125. inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
  126. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
  127. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
  128. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
  129. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
  130. inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
  131. inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
  132. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
  133. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
  134. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
  135. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
  136. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
  137. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
  138. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
  139. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
  140. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
  141. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
  142. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
  143. inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
  144. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
  145. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
  146. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
  147. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
  148. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
  149. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
  150. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
  151. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
  152. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
  153. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
  154. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
  155. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
  156. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
  157. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
  158. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
  159. inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
  160. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
  161. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
  162. inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
  163. inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
  164. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
  165. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
  166. inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
  167. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
  168. inspect_ai/_view/www/src/samples/error/error.ts +15 -0
  169. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
  170. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
  171. inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
  172. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
  173. inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
  174. inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
  175. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
  176. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
  177. inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
  178. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
  179. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
  180. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
  181. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
  182. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
  183. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
  184. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
  185. inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
  186. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
  187. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
  188. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
  189. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
  190. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
  191. inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
  192. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
  193. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
  194. inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
  195. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
  196. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
  197. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
  198. inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
  199. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
  200. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
  201. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
  202. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
  203. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
  204. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
  205. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
  206. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
  207. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
  208. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
  209. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
  210. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
  211. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
  212. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
  213. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
  214. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
  215. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
  216. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
  217. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
  218. inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
  219. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
  220. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
  221. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
  222. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
  223. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
  224. inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
  225. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
  226. inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
  227. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
  228. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
  229. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
  230. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
  231. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
  232. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
  233. inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
  234. inspect_ai/_view/www/src/types/log.d.ts +108 -19
  235. inspect_ai/_view/www/src/types/prism.d.ts +11 -0
  236. inspect_ai/_view/www/src/types.ts +71 -0
  237. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
  238. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
  239. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
  240. inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
  241. inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
  242. inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
  243. inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
  244. inspect_ai/_view/www/src/utils/attachments.ts +42 -0
  245. inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
  246. inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
  247. inspect_ai/_view/www/src/utils/debugging.ts +28 -0
  248. inspect_ai/_view/www/src/utils/dom.ts +30 -0
  249. inspect_ai/_view/www/src/utils/format.ts +194 -0
  250. inspect_ai/_view/www/src/utils/git.ts +7 -0
  251. inspect_ai/_view/www/src/utils/html.ts +6 -0
  252. inspect_ai/_view/www/src/utils/http.ts +14 -0
  253. inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
  254. inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
  255. inspect_ai/_view/www/src/utils/queue.ts +51 -0
  256. inspect_ai/_view/www/src/utils/sync.ts +114 -0
  257. inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
  258. inspect_ai/_view/www/src/utils/vscode.ts +13 -0
  259. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
  260. inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
  261. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
  262. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
  263. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
  264. inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
  265. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
  266. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
  267. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
  268. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
  269. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
  270. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
  271. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
  272. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
  273. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
  274. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
  275. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
  276. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
  277. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
  278. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
  279. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
  280. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
  281. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
  282. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
  283. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
  284. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
  285. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
  286. inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
  287. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
  288. inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
  289. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
  290. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
  291. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
  292. inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
  293. inspect_ai/_view/www/src/workspace/types.ts +10 -0
  294. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  295. inspect_ai/_view/www/tsconfig.json +23 -9
  296. inspect_ai/_view/www/vite.config.js +8 -17
  297. inspect_ai/_view/www/yarn.lock +627 -556
  298. inspect_ai/approval/_approval.py +2 -0
  299. inspect_ai/approval/_approver.py +4 -4
  300. inspect_ai/approval/_auto.py +1 -1
  301. inspect_ai/approval/_human/approver.py +3 -0
  302. inspect_ai/approval/_policy.py +5 -0
  303. inspect_ai/approval/_registry.py +2 -2
  304. inspect_ai/dataset/_dataset.py +64 -37
  305. inspect_ai/dataset/_sources/__init__.py +0 -0
  306. inspect_ai/dataset/_sources/csv.py +20 -12
  307. inspect_ai/dataset/_sources/file.py +4 -0
  308. inspect_ai/dataset/_sources/hf.py +39 -29
  309. inspect_ai/dataset/_sources/json.py +17 -9
  310. inspect_ai/log/__init__.py +2 -0
  311. inspect_ai/log/_convert.py +3 -3
  312. inspect_ai/log/_file.py +24 -9
  313. inspect_ai/log/_log.py +101 -13
  314. inspect_ai/log/_message.py +4 -2
  315. inspect_ai/log/_recorders/file.py +4 -0
  316. inspect_ai/log/_recorders/json.py +5 -7
  317. inspect_ai/log/_recorders/recorder.py +3 -0
  318. inspect_ai/log/_transcript.py +19 -8
  319. inspect_ai/model/__init__.py +2 -0
  320. inspect_ai/model/_cache.py +39 -21
  321. inspect_ai/model/_call_tools.py +4 -3
  322. inspect_ai/model/_chat_message.py +14 -4
  323. inspect_ai/model/_generate_config.py +1 -1
  324. inspect_ai/model/_model.py +31 -24
  325. inspect_ai/model/_model_output.py +14 -1
  326. inspect_ai/model/_openai.py +10 -18
  327. inspect_ai/model/_providers/anthropic.py +3 -3
  328. inspect_ai/model/_providers/google.py +9 -5
  329. inspect_ai/model/_providers/openai.py +5 -9
  330. inspect_ai/model/_providers/openai_o1.py +3 -5
  331. inspect_ai/model/_providers/openrouter.py +86 -0
  332. inspect_ai/model/_providers/providers.py +11 -0
  333. inspect_ai/scorer/__init__.py +6 -1
  334. inspect_ai/scorer/_answer.py +7 -7
  335. inspect_ai/scorer/_classification.py +38 -18
  336. inspect_ai/scorer/_common.py +2 -8
  337. inspect_ai/scorer/_match.py +4 -5
  338. inspect_ai/scorer/_metric.py +87 -28
  339. inspect_ai/scorer/_metrics/__init__.py +3 -3
  340. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  341. inspect_ai/scorer/_metrics/mean.py +3 -17
  342. inspect_ai/scorer/_metrics/std.py +111 -30
  343. inspect_ai/scorer/_model.py +12 -12
  344. inspect_ai/scorer/_pattern.py +3 -3
  345. inspect_ai/scorer/_reducer/reducer.py +36 -21
  346. inspect_ai/scorer/_reducer/registry.py +2 -2
  347. inspect_ai/scorer/_reducer/types.py +7 -1
  348. inspect_ai/scorer/_score.py +11 -1
  349. inspect_ai/scorer/_scorer.py +110 -16
  350. inspect_ai/solver/__init__.py +1 -1
  351. inspect_ai/solver/_basic_agent.py +19 -22
  352. inspect_ai/solver/_bridge/__init__.py +0 -3
  353. inspect_ai/solver/_bridge/bridge.py +3 -3
  354. inspect_ai/solver/_chain.py +1 -2
  355. inspect_ai/solver/_critique.py +3 -3
  356. inspect_ai/solver/_fork.py +2 -2
  357. inspect_ai/solver/_human_agent/__init__.py +0 -0
  358. inspect_ai/solver/_human_agent/agent.py +5 -8
  359. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  360. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  361. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  362. inspect_ai/solver/_multiple_choice.py +38 -26
  363. inspect_ai/solver/_prompt.py +7 -7
  364. inspect_ai/solver/_solver.py +53 -52
  365. inspect_ai/solver/_task_state.py +80 -69
  366. inspect_ai/solver/_use_tools.py +9 -9
  367. inspect_ai/tool/__init__.py +4 -1
  368. inspect_ai/tool/_tool.py +43 -14
  369. inspect_ai/tool/_tool_call.py +6 -2
  370. inspect_ai/tool/_tool_choice.py +3 -1
  371. inspect_ai/tool/_tool_def.py +10 -8
  372. inspect_ai/tool/_tool_params.py +24 -0
  373. inspect_ai/tool/_tool_with.py +7 -7
  374. inspect_ai/tool/_tools/__init__.py +0 -0
  375. inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
  376. inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
  377. inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
  378. inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
  379. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  380. inspect_ai/tool/_tools/_execute.py +23 -11
  381. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  382. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  383. inspect_ai/tool/_tools/_web_search.py +7 -5
  384. inspect_ai/tool/beta.py +3 -0
  385. inspect_ai/util/_concurrency.py +3 -3
  386. inspect_ai/util/_panel.py +2 -0
  387. inspect_ai/util/_resource.py +12 -12
  388. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  389. inspect_ai/util/_sandbox/docker/config.py +2 -1
  390. inspect_ai/util/_sandbox/docker/docker.py +42 -86
  391. inspect_ai/util/_sandbox/docker/service.py +100 -0
  392. inspect_ai/util/_sandbox/environment.py +99 -96
  393. inspect_ai/util/_sandbox/self_check.py +124 -16
  394. inspect_ai/util/_subprocess.py +5 -3
  395. inspect_ai/util/_subtask.py +15 -16
  396. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
  397. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
  398. inspect_ai-0.3.64.dist-info/RECORD +625 -0
  399. inspect_ai/_view/www/src/Register.mjs +0 -3
  400. inspect_ai/_view/www/src/Types.mjs +0 -38
  401. inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
  402. inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
  403. inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
  404. inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
  405. inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
  406. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
  407. inspect_ai/_view/www/src/components/Card.mjs +0 -126
  408. inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
  409. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
  410. inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
  411. inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
  412. inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
  413. inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
  414. inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
  415. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
  416. inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
  417. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
  418. inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
  419. inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
  420. inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
  421. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
  422. inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
  423. inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
  424. inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
  425. inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
  426. inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
  427. inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
  428. inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
  429. inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
  430. inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
  431. inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
  432. inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
  433. inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
  434. inspect_ai/_view/www/src/components/Tools.mjs +0 -376
  435. inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
  436. inspect_ai/_view/www/src/components/ansi-output.js +0 -932
  437. inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
  438. inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
  439. inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
  440. inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
  441. inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
  442. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
  443. inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
  444. inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
  445. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
  446. inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
  447. inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
  448. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
  449. inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
  450. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
  451. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
  452. inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
  453. inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
  454. inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
  455. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
  456. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
  457. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
  458. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
  459. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
  460. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
  461. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
  462. inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
  463. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
  464. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
  465. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
  466. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
  467. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
  468. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
  469. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
  470. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
  471. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
  472. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
  473. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
  474. inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
  475. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
  476. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
  477. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
  478. inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
  479. inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
  480. inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
  481. inspect_ai/_view/www/src/utils/Format.mjs +0 -260
  482. inspect_ai/_view/www/src/utils/Git.mjs +0 -12
  483. inspect_ai/_view/www/src/utils/Html.mjs +0 -21
  484. inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
  485. inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
  486. inspect_ai/_view/www/src/utils/http.mjs +0 -18
  487. inspect_ai/_view/www/src/utils/queue.mjs +0 -67
  488. inspect_ai/_view/www/src/utils/sync.mjs +0 -101
  489. inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
  490. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
  491. inspect_ai/tool/beta/__init__.py +0 -5
  492. inspect_ai-0.3.62.dist-info/RECORD +0 -481
  493. /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
  494. /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
  495. /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
  496. /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
  497. /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
  498. /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
  499. /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
  500. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
  501. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
  502. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
  503. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
  504. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
  505. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  506. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
  507. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
  508. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
  509. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
  510. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
  511. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
  512. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
  513. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
  514. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
  515. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
  516. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
  517. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
  518. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
@@ -29,36 +29,32 @@ from ._task_state import TaskState, set_sample_state
29
29
 
30
30
  @runtime_checkable
31
31
  class Generate(Protocol):
32
- """Generate using the model and add the assistant message to the task state.
33
-
34
- Args:
35
- state (TaskState): Beginning task state.
36
-
37
- tool_calls (Literal["loop", "single", "none"]): Resolve tool calls:
38
- - `"loop"` resolves tools calls and then invokes `generate()`,
39
- proceeding in a loop which terminates when there are no more
40
- tool calls, or `message_limit` or `token_limit` is exceeded.
41
- This is the default behavior.
42
- - `"single"` resolves at most a single set of tool calls and then returns.
43
- - `"none"` does not resolve tool calls at all (in this
44
- case you will need to invoke `call_tools()` directly).
45
-
46
- cache: (bool | CachePolicy):
47
- Caching behaviour for generate responses (defaults to no caching).
48
-
49
- **kwargs: Optional generation config arguments.
50
-
51
- Returns:
52
- Updated TaskState.
53
- """
54
-
55
32
  async def __call__(
56
33
  self,
57
34
  state: TaskState,
58
35
  tool_calls: Literal["loop", "single", "none"] = "loop",
59
36
  cache: bool | CachePolicy = False,
60
37
  **kwargs: Unpack[GenerateConfigArgs],
61
- ) -> TaskState: ...
38
+ ) -> TaskState:
39
+ """Generate using the model and add the assistant message to the task state.
40
+
41
+ Args:
42
+ state: Beginning task state.
43
+ tool_calls:
44
+ - `"loop"` resolves tool calls and then invokes `generate()`,
45
+ proceeding in a loop which terminates when there are no more
46
+ tool calls, or `message_limit` or `token_limit` is exceeded.
47
+ This is the default behavior.
48
+ - `"single"` resolves at most a single set of tool calls and then returns.
49
+ - `"none"` does not resolve tool calls at all (in this
50
+ case you will need to invoke `call_tools()` directly).
51
+ cache: Caching behaviour for generate responses (defaults to no caching).
52
+ **kwargs: Optional generation config arguments.
53
+
54
+ Returns:
55
+ Updated TaskState.
56
+ """
57
+ ...
62
58
 
63
59
 
64
60
  @dataclass(frozen=True)
@@ -74,28 +70,37 @@ class SolverSpec:
74
70
 
75
71
  @runtime_checkable
76
72
  class Solver(Protocol):
77
- r"""Contribute to solving an evaluation task.
73
+ async def __call__(
74
+ self,
75
+ state: TaskState,
76
+ generate: Generate,
77
+ ) -> TaskState:
78
+ r"""Contribute to solving an evaluation task.
78
79
 
79
- Contribute to the solution of a task by transforming a TaskState
80
- (e.g. prompt enhancement, elicitation, etc.). Solvers return a
81
- TaskState (which could simply be a modified version of the one
82
- they were passed) and optionally may call the generate() function
83
- to generate output (and a new TaskState with that output).
80
+ Transform a `TaskState`, returning the new state. Solvers may
81
+ optionally call the `generate()` function to create a new
82
+ state resulting from model generation. Solvers may also do
83
+ prompt engineering or other types of elicitation.
84
84
 
85
+ Args:
86
+ state: State for tasks being evaluated.
87
+ generate: Function for generating outputs.
85
88
 
86
- Args:
87
- state (TaskState): States for tasks being evaluated.
88
- generate (Generate): Function for generating outputs.
89
+ Returns:
90
+ Updated TaskState.
89
91
 
90
- Returns:
91
- Updated TaskState.
92
- """
92
+ Examples:
93
+ ```python
94
+ @solver
95
+ def prompt_cot(template: str) -> Solver:
96
+ def solve(state: TaskState, generate: Generate) -> TaskState:
97
+ # insert chain of thought prompt
98
+ return state
93
99
 
94
- async def __call__(
95
- self,
96
- state: TaskState,
97
- generate: Generate,
98
- ) -> TaskState: ...
100
+ return solve
101
+ ```
102
+ """
103
+ ...
99
104
 
100
105
 
101
106
  P = ParamSpec("P")
@@ -144,7 +149,7 @@ def solver(
144
149
  r"""Decorator for registering solvers.
145
150
 
146
151
  Args:
147
- name: (str | Callable[P, Solver]):
152
+ name:
148
153
  Optional name for solver. If the decorator has no name
149
154
  argument then the name of the underlying Callable[P, Solver]
150
155
  object will be used to automatically assign a name.
@@ -153,19 +158,15 @@ def solver(
153
158
  Solver with registry attributes.
154
159
 
155
160
  Examples:
156
- @solver
157
- def prompt_cot(state: TaskState, generate: Generate) -> None:
158
- ...
159
-
160
- @solver(name = "prompt_cot")
161
- def cot(state: TaskState, generate: Generate) -> None:
162
- ...
163
-
161
+ ```python
164
162
  @solver
165
163
  def prompt_cot(template: str) -> Solver:
166
- def solve(state: TaskState, generate: Generate) -> None:
167
- ...
164
+ def solve(state: TaskState, generate: Generate) -> TaskState:
165
+ # insert chain of thought prompt
166
+ return state
167
+
168
168
  return solve
169
+ ```
169
170
  """
170
171
 
171
172
  # create_solver_wrapper:
@@ -31,17 +31,20 @@ class Choice:
31
31
  """
32
32
  A `Choice` represents a single choice in a multiple choice question.
33
33
 
34
- It is only relevant for the `multiple_choice` solver and corresponding `choice` scorer.
34
+ It is only relevant for the `multiple_choice` solver and corresponding
35
+ `choice` scorer.
35
36
  """
36
37
 
37
38
  value: str
38
39
  """The original value of the choice from the `Sample`."""
39
40
 
40
41
  correct: bool | None
41
- """Did the model think this choice satisfies the question? `None` indicates this has not been set yet"""
42
+ """Did the model think this choice satisfies the question? `None`
43
+ indicates this has not been set yet"""
42
44
 
43
45
  original_position: int
44
- """Choices may be re-ordered during processing, this represents the original position in the sample's list of choices"""
46
+ """Choices may be re-ordered during processing, this represents the
47
+ original position in the sample's list of choices"""
45
48
 
46
49
 
47
50
  class Choices(Sequence[Choice]):
@@ -127,10 +130,10 @@ class TaskState:
127
130
  """
128
131
  The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.
129
132
 
130
- It's a mutable object that is updated by each solver during a sample's
131
- evaluation. It allows us to maintain things like the message history between
132
- the running `Task` and the model, the tools available to the model, the
133
- final output of the model and whether or not it's completed yet.
133
+ The `TaskState` is passed to and returned from each solver during a sample's
134
+ evaluation. It allows us to manipulate the message history, the tools
135
+ available to the model, the final output of the model, and whether the task
136
+ is completed or has hit a limit.
134
137
  """
135
138
 
136
139
  def __init__(
@@ -149,73 +152,39 @@ class TaskState:
149
152
  metadata: dict[str, Any] = {},
150
153
  ) -> None:
151
154
  self._model = model
152
- """Model name used for this task."""
153
-
154
- self.sample_id = sample_id
155
- """Unique id for sample."""
156
-
157
- self.epoch = epoch
158
- """Epoch number for sample."""
159
-
155
+ self._sample_id = sample_id
156
+ self._epoch = epoch
160
157
  self._input = input
161
- """
162
- The original input from the `Sample` for this `TaskState`.
163
-
164
- Should be treated as immutable and not changed during the run, so that
165
- it can be referenced or checked wherever needed. Access through `input`
166
- or `input_text` only
167
- """
168
-
169
- self.target = target
170
- """The scoring target for this `Sample`."""
171
-
172
- self.metadata = metadata
173
- """Metadata from the `Sample` for this `TaskState`"""
174
-
158
+ self._target = target
159
+ self._metadata = metadata
175
160
  self._messages: list[ChatMessage] = ChatMessageList(messages, self)
176
- """
177
- Chat conversation history for sample.
178
-
179
- This will generally get appended to every time a `generate` call is made
180
- to the model. Useful for both debug and for solvers/scorers to assess
181
- model performance or choose the next step.
182
- """
183
-
184
161
  self._tools: list[Tool] = []
185
- """Tools available to the model."""
186
-
187
- self.tool_choice: ToolChoice | None = None
188
- """Tool choice directive."""
189
-
190
- self.output = output if output else ModelOutput(model=str(model), choices=[])
191
- """
192
- The 'final' model output once we've completed all solving.
193
-
194
- For simple evals this may just be the last `message` from the
195
- conversation history, but more complex solvers may generate this in
196
- different ways depending on what solvers are used..
197
- """
198
-
162
+ self._output = output if output else ModelOutput(model=str(model))
199
163
  self._message_limit = message_limit
200
164
  self._token_limit = token_limit
201
165
  self._completed = completed
202
-
203
- """Store for shared data"""
204
- self.store = Store()
166
+ self._store = Store()
205
167
 
206
168
  if choices:
207
169
  self.choices = Choices(choices)
208
170
  else:
209
171
  self.choices = Choices([])
210
172
 
211
- self.scores: dict[str, Score] | None = None
212
- """Scores yielded by running task."""
213
-
214
173
  @property
215
174
  def model(self) -> ModelName:
216
175
  """Name of model being evaluated."""
217
176
  return self._model
218
177
 
178
+ @property
179
+ def sample_id(self) -> int | str:
180
+ """Unique id for sample."""
181
+ return self._sample_id
182
+
183
+ @property
184
+ def epoch(self) -> int:
185
+ """Epoch number for sample."""
186
+ return self._epoch
187
+
219
188
  @property
220
189
  def input(self) -> str | list[ChatMessage]:
221
190
  """Input from the `Sample`, should be considered immutable."""
@@ -253,9 +222,6 @@ class TaskState:
253
222
  engineering solvers). This property enables easy read and
254
223
  write access to the user chat prompt. Raises an
255
224
  exception if there is no user prompt
256
-
257
- Returns:
258
- First user `ChatMessage` in the task state.
259
225
  """
260
226
  prompt = next((m for m in self.messages if m.role == "user"), None)
261
227
  if prompt:
@@ -263,16 +229,63 @@ class TaskState:
263
229
  else:
264
230
  raise ValueError("user_prompt requested from TaskState but none available")
265
231
 
232
+ @property
233
+ def metadata(self) -> dict[str, Any]:
234
+ """Metadata from the `Sample` for this `TaskState`"""
235
+ return self._metadata
236
+
237
+ @metadata.setter
238
+ def metadata(self, metadata: dict[str, Any]) -> None:
239
+ self._metadata = metadata
240
+
266
241
  @property
267
242
  def messages(self) -> list[ChatMessage]:
268
- """Messages in chat history"""
243
+ """
244
+ Chat conversation history for sample.
245
+
246
+ This will generally get appended to every time a `generate` call is made
247
+ to the model. Useful for both debug and for solvers/scorers to assess
248
+ model performance or choose the next step.
249
+ """
269
250
  return self._messages
270
251
 
271
252
  @messages.setter
272
253
  def messages(self, messages: list[ChatMessage]) -> None:
273
- """Set messages in chat history."""
274
254
  self._messages = ChatMessageList(messages, self)
275
255
 
256
+ @property
257
+ def output(self) -> ModelOutput:
258
+ """
259
+ The 'final' model output once we've completed all solving.
260
+
261
+ For simple evals this may just be the last `message` from the
262
+ conversation history, but more complex solvers may set this directly.
263
+ """
264
+ return self._output
265
+
266
+ @output.setter
267
+ def output(self, output: ModelOutput) -> None:
268
+ self._output = output
269
+
270
+ @property
271
+ def store(self) -> Store:
272
+ """Store for shared data"""
273
+ return self._store
274
+
275
+ @property
276
+ def tools(self) -> list[Tool]:
277
+ """Tools available to the model."""
278
+ return self._tools
279
+
280
+ @tools.setter
281
+ def tools(self, tools: list[Tool | ToolDef]) -> None:
282
+ self._tools.clear()
283
+ for tool in tools:
284
+ self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool())
285
+
286
+ tool_choice: ToolChoice | None = None
287
+ """Tool choice directive."""
288
+
276
289
  @property
277
290
  def max_messages(self) -> int | None:
278
291
  """Deprecated (use message_limit)."""
@@ -351,14 +364,12 @@ class TaskState:
351
364
  self._completed = completed
352
365
 
353
366
  @property
354
- def tools(self) -> list[Tool]:
355
- return self._tools
367
+ def target(self) -> Target:
368
+ """The scoring target for this `Sample`."""
369
+ return self._target
356
370
 
357
- @tools.setter
358
- def tools(self, tools: list[Tool | ToolDef]) -> None:
359
- self._tools.clear()
360
- for tool in tools:
361
- self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool())
371
+ scores: dict[str, Score] | None = None
372
+ """Scores yielded by running task."""
362
373
 
363
374
  def metadata_as(self, metadata_cls: Type[MT]) -> MT:
364
375
  """Pydantic model interface to metadata.
@@ -15,15 +15,15 @@ def use_tools(
15
15
  Inject tools into the task state to be used in generate().
16
16
 
17
17
  Args:
18
- *tools (Tool | list[Tool]): One or more tools or lists of tools
19
- to make available to the model. If no tools are passed, then
20
- no change to the currently available set of `tools` is made.
21
- tool_choice (ToolChoice | None): Directive indicating which
22
- tools the model should use. If `None` is passed, then no
23
- change to `tool_choice` is made.
24
- append (bool): If `True`, then the passed-in tools are appended
25
- to the existing tools; otherwise any existing tools are
26
- replaced (the default)
18
+ *tools: One or more tools or lists of tools
19
+ to make available to the model. If no tools are passed, then
20
+ no change to the currently available set of `tools` is made.
21
+ tool_choice: Directive indicating which
22
+ tools the model should use. If `None` is passed, then no
23
+ change to `tool_choice` is made.
24
+ append: If `True`, then the passed-in tools are appended
25
+ to the existing tools; otherwise any existing tools are
26
+ replaced (the default)
27
27
 
28
28
  Returns:
29
29
  A solver that injects the tools and tool_choice into the task state.
@@ -19,14 +19,16 @@ from ._tool_call import (
19
19
  from ._tool_choice import ToolChoice, ToolFunction
20
20
  from ._tool_def import ToolDef
21
21
  from ._tool_info import ToolInfo
22
- from ._tool_params import ToolParam, ToolParams
22
+ from ._tool_params import JSONType, ToolParam, ToolParams
23
23
  from ._tool_with import tool_with
24
+ from ._tools._computer import computer
24
25
  from ._tools._execute import bash, python
25
26
  from ._tools._web_browser import web_browser
26
27
  from ._tools._web_search import web_search
27
28
 
28
29
  __all__ = [
29
30
  "bash",
31
+ "computer",
30
32
  "python",
31
33
  "web_browser",
32
34
  "web_search",
@@ -52,6 +54,7 @@ __all__ = [
52
54
  "ToolInfo",
53
55
  "ToolParam",
54
56
  "ToolParams",
57
+ "JSONType",
55
58
  ]
56
59
 
57
60
  _UTIL_MODULE_VERSION = "0.3.19"
inspect_ai/tool/_tool.py CHANGED
@@ -40,10 +40,25 @@ ToolResult = (
40
40
  | ContentVideo
41
41
  | list[ContentText | ContentImage | ContentAudio | ContentVideo]
42
42
  )
43
+ """Valid types for results from tool calls."""
43
44
 
44
45
 
45
46
  class ToolError(Exception):
47
+ """Exception thrown from tool call.
48
+
49
+ If you throw a `ToolError` from within a tool call,
50
+ the error will be reported to the model for further
51
+ processing (rather than ending the sample). If you want
52
+ to raise a fatal error from a tool call use an appropriate
53
+ standard exception type (e.g. `RuntimeError`, `ValueError`, etc.)
54
+ """
55
+
46
56
  def __init__(self, message: str) -> None:
57
+ """Create a ToolError.
58
+
59
+ Args:
60
+ message: Error message to report to the model.
61
+ """
47
62
  super().__init__(message)
48
63
  self.message = message
49
64
 
@@ -68,11 +83,21 @@ class Tool(Protocol):
68
83
  r"""Additional tool that an agent can use to solve a task.
69
84
 
70
85
  Args:
71
- *args (Any): Arguments for the tool.
72
- **kwargs (Any): Keyword arguments for the tool.
86
+ *args: Arguments for the tool.
87
+ **kwargs: Keyword arguments for the tool.
73
88
 
74
89
  Returns:
75
90
  Result of tool call.
91
+
92
+ Examples:
93
+ ```python
94
+ @tool
95
+ def add() -> Tool:
96
+ async def execute(x: int, y: int) -> int:
97
+ return x + y
98
+
99
+ return execute
100
+ ```
76
101
  """
77
102
  ...
78
103
 
@@ -130,25 +155,29 @@ def tool(
130
155
  r"""Decorator for registering tools.
131
156
 
132
157
  Args:
133
- func (ToolType | None): Tool function
134
- name (str | None):
135
- Optional name for tool. If the decorator has no name
158
+ func: Tool function
159
+ name: Optional name for tool. If the decorator has no name
136
160
  argument then the name of the tool creation function
137
161
  will be used as the name of the tool.
138
- viewer (ToolCallViewer | None): Provide a custom view
139
- of tool call and context.
140
- model_input (ToolCallModelInput | None): Provide a custom
141
- function for playing back tool results as model input.
142
- parallel (bool):
143
- Does this tool support parallel execution?
144
- (defaults to True).
145
- prompt (str):
146
- Deprecated (provide all descriptive information about
162
+ viewer: Provide a custom view of tool call and context.
163
+ model_input: Provide a custom function for playing back tool results as model input.
164
+ parallel: Does this tool support parallel execution? (defaults to `True`).
165
+ prompt: Deprecated (provide all descriptive information about
147
166
  the tool within the tool function's doc comment)
148
167
 
149
168
 
150
169
  Returns:
151
170
  Tool with registry attributes.
171
+
172
+ Examples:
173
+ ```python
174
+ @tool
175
+ def add() -> Tool:
176
+ async def execute(x: int, y: int) -> int:
177
+ return x + y
178
+
179
+ return execute
180
+ ```
152
181
  """
153
182
  if prompt:
154
183
  from inspect_ai._util.logger import warn_once
@@ -13,10 +13,10 @@ class ToolCallContent(BaseModel):
13
13
  """Optional (plain text) title for tool call content."""
14
14
 
15
15
  format: Literal["text", "markdown"]
16
- """Format."""
16
+ """Format (text or markdown)."""
17
17
 
18
18
  content: str
19
- """Content."""
19
+ """Text or markdown content."""
20
20
 
21
21
 
22
22
  class ToolCallView(BaseModel):
@@ -56,6 +56,8 @@ class ToolCall:
56
56
 
57
57
  @dataclass
58
58
  class ToolCallError:
59
+ """Error raised by a tool call."""
60
+
59
61
  type: Literal[
60
62
  "parsing",
61
63
  "timeout",
@@ -67,8 +69,10 @@ class ToolCallError:
67
69
  "approval",
68
70
  "unknown",
69
71
  ]
72
+ """Error type."""
70
73
 
71
74
  message: str
75
+ """Error message."""
72
76
 
73
77
 
74
78
  ToolCallViewer = Callable[[ToolCall], ToolCallView]
@@ -4,8 +4,10 @@ from typing import Literal, Union
4
4
 
5
5
  @dataclass
6
6
  class ToolFunction:
7
+ """Indicate that a specific tool function should be called."""
8
+
7
9
  name: str
8
- """The name of the function to call."""
10
+ """The name of the tool function to call."""
9
11
 
10
12
 
11
13
  ToolChoice = Union[Literal["auto", "any", "none"], ToolFunction]
@@ -25,6 +25,8 @@ from ._tool_params import ToolParams
25
25
 
26
26
 
27
27
  class ToolDef:
28
+ """Tool definition."""
29
+
28
30
  def __init__(
29
31
  self,
30
32
  tool: Callable[..., Any],
@@ -35,19 +37,19 @@ class ToolDef:
35
37
  viewer: ToolCallViewer | None = None,
36
38
  model_input: ToolCallModelInput | None = None,
37
39
  ) -> None:
38
- """Tool definition.
40
+ """Create a tool definition.
39
41
 
40
42
  Args:
41
- tool (Callable[..., Any]): Callable to execute tool.
42
- name (str | None): Name of tool. Discovered automatically if not specified.
43
- description (str | None): Description of tool. Discovered automatically
43
+ tool: Callable to execute tool.
44
+ name: Name of tool. Discovered automatically if not specified.
45
+ description: Description of tool. Discovered automatically
44
46
  by parsing doc comments if not specified.
45
- parameters (dict[str,str] | ToolParams | None): Tool parameter descriptions and types.
47
+ parameters: Tool parameter descriptions and types.
46
48
  Discovered automatically by parsing doc comments if not specified.
47
- parallel (bool | None): Does the tool support parallel execution
49
+ parallel: Does the tool support parallel execution
48
50
  (defaults to True if not specified)
49
- viewer (ToolCallViewer | None): Optional tool call viewer implementation.
50
- model_input (ToolCallModelInput | None): Optional function that determines how
51
+ viewer: Optional tool call viewer implementation.
52
+ model_input: Optional function that determines how
51
53
  tool call results are played back as model input.
52
54
 
53
55
  Returns:
@@ -14,20 +14,44 @@ class ToolParam(BaseModel):
14
14
  """Description of tool parameter in JSON Schema format."""
15
15
 
16
16
  type: JSONType | None = Field(default=None)
17
+ """JSON type of tool parameter."""
18
+
17
19
  description: str | None = Field(default=None)
20
+ """Parameter description."""
21
+
18
22
  default: Any = Field(default=None)
23
+ """Default value for parameter."""
24
+
19
25
  enum: list[Any] | None = Field(default=None)
26
+ """Valid values for enum parameters."""
27
+
20
28
  items: Optional["ToolParam"] = Field(default=None)
29
+ """Valid type for array parameters."""
30
+
21
31
  properties: dict[str, "ToolParam"] | None = Field(default=None)
32
+ """Valid fields for object parameters."""
33
+
22
34
  additionalProperties: Optional["ToolParam"] | bool | None = Field(default=None)
35
+ """Are additional properties allowed?"""
36
+
23
37
  anyOf: list["ToolParam"] | None = Field(default=None)
38
+ """Valid types for union parameters."""
39
+
24
40
  required: list[str] | None = Field(default=None)
41
+ """Required fields for object parameters."""
25
42
 
26
43
 
27
44
  class ToolParams(BaseModel):
28
45
  """Description of tool parameters object in JSON Schema format."""
29
46
 
30
47
  type: Literal["object"] = Field(default="object")
48
+ """Params type (always 'object')"""
49
+
31
50
  properties: dict[str, ToolParam] = Field(default_factory=dict)
51
+ """Tool function parameters."""
52
+
32
53
  required: list[str] = Field(default_factory=list)
54
+ """List of required fields."""
55
+
33
56
  additionalProperties: bool = Field(default=False)
57
+ """Are additional object properties allowed? (always `False`)"""
@@ -25,14 +25,14 @@ def tool_with(
25
25
  """Tool with modifications to name and descriptions.
26
26
 
27
27
  Args:
28
- tool (Tool): Tool instance to copy and add descriptions to.
29
- name (str | None): Tool name (optional).
30
- description (str | None): Tool description (optional).
31
- parameters (dict[str,str] | None): Parameter descriptions (optional)
32
- parallel (bool | None): Does the tool support parallel execution
28
+ tool: Tool instance to copy and add descriptions to.
29
+ name: Tool name (optional).
30
+ description: Tool description (optional).
31
+ parameters: Parameter descriptions (optional)
32
+ parallel: Does the tool support parallel execution
33
33
  (defaults to True if not specified)
34
- viewer (ToolCallViewer | None): Optional tool call viewer implementation.
35
- model_input (ToolCallModelInput | None): Optional function that determines how
34
+ viewer: Optional tool call viewer implementation.
35
+ model_input: Optional function that determines how
36
36
  tool call results are played back as model input.
37
37
 
38
38
  Returns:
File without changes