inspect-ai 0.3.62__py3-none-any.whl → 0.3.64__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (518)
  1. inspect_ai/_cli/cache.py +8 -7
  2. inspect_ai/_cli/common.py +0 -12
  3. inspect_ai/_cli/eval.py +32 -4
  4. inspect_ai/_cli/info.py +1 -0
  5. inspect_ai/_cli/list.py +1 -1
  6. inspect_ai/_cli/log.py +2 -0
  7. inspect_ai/_cli/main.py +1 -1
  8. inspect_ai/_cli/sandbox.py +4 -1
  9. inspect_ai/_cli/score.py +181 -32
  10. inspect_ai/_cli/trace.py +10 -0
  11. inspect_ai/_cli/view.py +4 -2
  12. inspect_ai/_display/core/active.py +2 -3
  13. inspect_ai/_display/core/config.py +7 -1
  14. inspect_ai/_display/textual/widgets/samples.py +4 -3
  15. inspect_ai/_display/textual/widgets/sandbox.py +6 -0
  16. inspect_ai/_eval/eval.py +104 -101
  17. inspect_ai/_eval/evalset.py +75 -75
  18. inspect_ai/_eval/loader.py +122 -12
  19. inspect_ai/_eval/registry.py +1 -1
  20. inspect_ai/_eval/run.py +14 -0
  21. inspect_ai/_eval/score.py +125 -36
  22. inspect_ai/_eval/task/log.py +105 -4
  23. inspect_ai/_eval/task/results.py +92 -38
  24. inspect_ai/_eval/task/run.py +9 -2
  25. inspect_ai/_eval/task/sandbox.py +35 -2
  26. inspect_ai/_eval/task/task.py +49 -46
  27. inspect_ai/_util/constants.py +1 -1
  28. inspect_ai/_util/content.py +8 -0
  29. inspect_ai/_util/error.py +2 -0
  30. inspect_ai/_util/file.py +15 -1
  31. inspect_ai/_util/hash.py +1 -1
  32. inspect_ai/_util/logger.py +4 -2
  33. inspect_ai/_util/registry.py +7 -1
  34. inspect_ai/_view/view.py +1 -2
  35. inspect_ai/_view/www/.vscode/extensions.json +3 -0
  36. inspect_ai/_view/www/.vscode/settings.json +8 -0
  37. inspect_ai/_view/www/App.css +97 -29
  38. inspect_ai/_view/www/README.md +1 -1
  39. inspect_ai/_view/www/dist/assets/index.css +16663 -14674
  40. inspect_ai/_view/www/dist/assets/index.js +58808 -51348
  41. inspect_ai/_view/www/dist/index.html +1 -1
  42. inspect_ai/_view/www/index.html +2 -2
  43. inspect_ai/_view/www/log-schema.json +87 -73
  44. inspect_ai/_view/www/package.json +22 -4
  45. inspect_ai/_view/www/postcss.config.cjs +8 -9
  46. inspect_ai/_view/www/src/{App.mjs → App.tsx} +356 -365
  47. inspect_ai/_view/www/src/AppErrorBoundary.tsx +47 -0
  48. inspect_ai/_view/www/src/api/api-browser.ts +2 -2
  49. inspect_ai/_view/www/src/api/api-http.ts +3 -5
  50. inspect_ai/_view/www/src/api/api-vscode.ts +6 -6
  51. inspect_ai/_view/www/src/api/client-api.ts +4 -4
  52. inspect_ai/_view/www/src/api/index.ts +4 -4
  53. inspect_ai/_view/www/src/api/{Types.ts → types.ts} +25 -9
  54. inspect_ai/_view/www/src/appearance/colors.ts +9 -0
  55. inspect_ai/_view/www/src/appearance/fonts.ts +39 -0
  56. inspect_ai/_view/www/src/appearance/icons.ts +100 -0
  57. inspect_ai/_view/www/src/appearance/{Styles.mjs → styles.ts} +2 -32
  58. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +198 -0
  59. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +86 -0
  60. inspect_ai/_view/www/src/components/Card.css +60 -0
  61. inspect_ai/_view/www/src/components/Card.tsx +109 -0
  62. inspect_ai/_view/www/src/components/CopyButton.module.css +11 -0
  63. inspect_ai/_view/www/src/components/CopyButton.tsx +58 -0
  64. inspect_ai/_view/www/src/components/DownloadButton.css +4 -0
  65. inspect_ai/_view/www/src/components/DownloadButton.tsx +25 -0
  66. inspect_ai/_view/www/src/components/DownloadPanel.css +10 -0
  67. inspect_ai/_view/www/src/components/DownloadPanel.tsx +30 -0
  68. inspect_ai/_view/www/src/components/EmptyPanel.css +12 -0
  69. inspect_ai/_view/www/src/components/EmptyPanel.tsx +15 -0
  70. inspect_ai/_view/www/src/components/ErrorPanel.css +37 -0
  71. inspect_ai/_view/www/src/components/ErrorPanel.tsx +39 -0
  72. inspect_ai/_view/www/src/components/ExpandablePanel.css +40 -0
  73. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +115 -0
  74. inspect_ai/_view/www/src/components/FindBand.css +49 -0
  75. inspect_ai/_view/www/src/components/FindBand.tsx +130 -0
  76. inspect_ai/_view/www/src/components/HumanBaselineView.css +41 -0
  77. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +162 -0
  78. inspect_ai/_view/www/src/components/JsonPanel.css +20 -0
  79. inspect_ai/_view/www/src/components/JsonPanel.tsx +82 -0
  80. inspect_ai/_view/www/src/components/LabeledValue.css +20 -0
  81. inspect_ai/_view/www/src/components/LabeledValue.tsx +41 -0
  82. inspect_ai/_view/www/src/components/LargeModal.module.css +54 -0
  83. inspect_ai/_view/www/src/components/LargeModal.tsx +189 -0
  84. inspect_ai/_view/www/src/components/LightboxCarousel.css +95 -0
  85. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +132 -0
  86. inspect_ai/_view/www/src/components/MarkdownDiv.css +3 -0
  87. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +133 -0
  88. inspect_ai/_view/www/src/components/MessageBand.css +43 -0
  89. inspect_ai/_view/www/src/components/MessageBand.tsx +39 -0
  90. inspect_ai/_view/www/src/components/MorePopOver.css +0 -0
  91. inspect_ai/_view/www/src/components/MorePopOver.tsx +67 -0
  92. inspect_ai/_view/www/src/components/NavPills.module.css +18 -0
  93. inspect_ai/_view/www/src/components/NavPills.tsx +101 -0
  94. inspect_ai/_view/www/src/components/ProgressBar.module.css +37 -0
  95. inspect_ai/_view/www/src/components/ProgressBar.tsx +22 -0
  96. inspect_ai/_view/www/src/components/TabSet.module.css +40 -0
  97. inspect_ai/_view/www/src/components/TabSet.tsx +215 -0
  98. inspect_ai/_view/www/src/components/ToolButton.css +3 -0
  99. inspect_ai/_view/www/src/components/ToolButton.tsx +27 -0
  100. inspect_ai/_view/www/src/components/VirtualList.module.css +19 -0
  101. inspect_ai/_view/www/src/components/VirtualList.tsx +292 -0
  102. inspect_ai/_view/www/src/{index.js → index.tsx} +45 -19
  103. inspect_ai/_view/www/src/{log → logfile}/remoteLogFile.ts +3 -8
  104. inspect_ai/_view/www/src/{utils/remoteZipFile.mjs → logfile/remoteZipFile.ts} +86 -80
  105. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +83 -0
  106. inspect_ai/_view/www/src/metadata/MetaDataView.module.css +35 -0
  107. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +95 -0
  108. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +15 -0
  109. inspect_ai/_view/www/src/metadata/RenderedContent.module.css +12 -0
  110. inspect_ai/_view/www/src/{components/RenderedContent/RenderedContent.mjs → metadata/RenderedContent.tsx} +92 -73
  111. inspect_ai/_view/www/src/metadata/types.ts +18 -0
  112. inspect_ai/_view/www/src/plan/DatasetDetailView.module.css +3 -0
  113. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +37 -0
  114. inspect_ai/_view/www/src/plan/DetailStep.module.css +9 -0
  115. inspect_ai/_view/www/src/plan/DetailStep.tsx +31 -0
  116. inspect_ai/_view/www/src/plan/PlanCard.tsx +28 -0
  117. inspect_ai/_view/www/src/plan/PlanDetailView.module.css +48 -0
  118. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +324 -0
  119. inspect_ai/_view/www/src/plan/ScorerDetailView.module.css +3 -0
  120. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +30 -0
  121. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +15 -0
  122. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +32 -0
  123. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +8 -0
  124. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +53 -0
  125. inspect_ai/_view/www/src/samples/SampleDialog.tsx +122 -0
  126. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +29 -0
  127. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +331 -0
  128. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +24 -0
  129. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +177 -0
  130. inspect_ai/_view/www/src/samples/SamplesTools.tsx +52 -0
  131. inspect_ai/_view/www/src/samples/chat/ChatMessage.module.css +29 -0
  132. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +76 -0
  133. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +60 -0
  134. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.module.css +9 -0
  135. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +57 -0
  136. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +47 -0
  137. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.module.css +4 -0
  138. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +58 -0
  139. inspect_ai/_view/www/src/samples/chat/MessageContent.module.css +4 -0
  140. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +157 -0
  141. inspect_ai/_view/www/src/samples/chat/MessageContents.module.css +3 -0
  142. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +133 -0
  143. inspect_ai/_view/www/src/samples/chat/messages.ts +112 -0
  144. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +147 -0
  145. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +14 -0
  146. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +76 -0
  147. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +19 -0
  148. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +60 -0
  149. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.module.css +4 -0
  150. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +18 -0
  151. inspect_ai/_view/www/src/samples/chat/tools/tool.ts +92 -0
  152. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +365 -0
  153. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +22 -0
  154. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +26 -0
  155. inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +18 -0
  156. inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +27 -0
  157. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +18 -0
  158. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +71 -0
  159. inspect_ai/_view/www/src/samples/descriptor/score/OtherScoreDescriptor.tsx +20 -0
  160. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.module.css +28 -0
  161. inspect_ai/_view/www/src/samples/descriptor/score/PassFailScoreDescriptor.tsx +81 -0
  162. inspect_ai/_view/www/src/samples/descriptor/score/ScoreDescriptor.tsx +99 -0
  163. inspect_ai/_view/www/src/samples/descriptor/types.ts +55 -0
  164. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +19 -0
  165. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +22 -0
  166. inspect_ai/_view/www/src/samples/error/SampleErrorView.module.css +17 -0
  167. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +31 -0
  168. inspect_ai/_view/www/src/samples/error/error.ts +15 -0
  169. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +9 -0
  170. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +14 -0
  171. inspect_ai/_view/www/src/samples/list/SampleHeader.module.css +13 -0
  172. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +36 -0
  173. inspect_ai/_view/www/src/samples/list/SampleList.module.css +11 -0
  174. inspect_ai/_view/www/src/samples/list/SampleList.tsx +247 -0
  175. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +33 -0
  176. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +98 -0
  177. inspect_ai/_view/www/src/samples/list/SampleSeparator.module.css +6 -0
  178. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +24 -0
  179. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.module.css +9 -0
  180. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +51 -0
  181. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.module.css +16 -0
  182. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +175 -0
  183. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.module.css +9 -0
  184. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +186 -0
  185. inspect_ai/_view/www/src/samples/{tools/filters.mjs → sample-tools/filters.ts} +86 -81
  186. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.module.css +16 -0
  187. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +288 -0
  188. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/completions.ts +346 -0
  189. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/language.ts +19 -0
  190. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/tokenize.ts +97 -0
  191. inspect_ai/_view/www/src/samples/{SampleLimit.mjs → sampleLimit.ts} +3 -6
  192. inspect_ai/_view/www/src/samples/scores/SampleScoreView.module.css +53 -0
  193. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +168 -0
  194. inspect_ai/_view/www/src/samples/scores/SampleScores.module.css +5 -0
  195. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +37 -0
  196. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +66 -0
  197. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +51 -0
  198. inspect_ai/_view/www/src/samples/transcript/InfoEventView.module.css +3 -0
  199. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +54 -0
  200. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +48 -0
  201. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +6 -0
  202. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +36 -0
  203. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +43 -0
  204. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +223 -0
  205. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.module.css +23 -0
  206. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +112 -0
  207. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +75 -0
  208. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +22 -0
  209. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.module.css +15 -0
  210. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +100 -0
  211. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +171 -0
  212. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.module.css +19 -0
  213. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +133 -0
  214. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +10 -0
  215. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +92 -0
  216. inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +49 -0
  217. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +449 -0
  218. inspect_ai/_view/www/src/samples/transcript/event/EventNav.module.css +5 -0
  219. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +43 -0
  220. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.module.css +3 -0
  221. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +39 -0
  222. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.module.css +25 -0
  223. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +191 -0
  224. inspect_ai/_view/www/src/samples/transcript/event/EventRow.module.css +13 -0
  225. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +32 -0
  226. inspect_ai/_view/www/src/samples/transcript/event/EventSection.module.css +8 -0
  227. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +29 -0
  228. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +67 -0
  229. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +285 -0
  230. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenders.module.css +10 -0
  231. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.module.css +9 -0
  232. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +346 -0
  233. inspect_ai/_view/www/src/samples/transcript/types.ts +58 -0
  234. inspect_ai/_view/www/src/types/log.d.ts +108 -19
  235. inspect_ai/_view/www/src/types/prism.d.ts +11 -0
  236. inspect_ai/_view/www/src/types.ts +71 -0
  237. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +28 -0
  238. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +24 -0
  239. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +97 -0
  240. inspect_ai/_view/www/src/usage/TokenTable.module.css +17 -0
  241. inspect_ai/_view/www/src/usage/TokenTable.tsx +91 -0
  242. inspect_ai/_view/www/src/usage/UsageCard.module.css +15 -0
  243. inspect_ai/_view/www/src/usage/UsageCard.tsx +67 -0
  244. inspect_ai/_view/www/src/utils/attachments.ts +42 -0
  245. inspect_ai/_view/www/src/utils/{Base64.mjs → base64.ts} +1 -6
  246. inspect_ai/_view/www/src/{components/Browser.mjs → utils/browser.ts} +0 -1
  247. inspect_ai/_view/www/src/utils/debugging.ts +28 -0
  248. inspect_ai/_view/www/src/utils/dom.ts +30 -0
  249. inspect_ai/_view/www/src/utils/format.ts +194 -0
  250. inspect_ai/_view/www/src/utils/git.ts +7 -0
  251. inspect_ai/_view/www/src/utils/html.ts +6 -0
  252. inspect_ai/_view/www/src/utils/http.ts +14 -0
  253. inspect_ai/_view/www/src/utils/{Path.mjs → path.ts} +2 -9
  254. inspect_ai/_view/www/src/utils/{Print.mjs → print.ts} +34 -26
  255. inspect_ai/_view/www/src/utils/queue.ts +51 -0
  256. inspect_ai/_view/www/src/utils/sync.ts +114 -0
  257. inspect_ai/_view/www/src/utils/{Type.mjs → type.ts} +3 -6
  258. inspect_ai/_view/www/src/utils/vscode.ts +13 -0
  259. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +324 -0
  260. inspect_ai/_view/www/src/workspace/WorkSpaceView.module.css +33 -0
  261. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +158 -0
  262. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.module.css +3 -0
  263. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +28 -0
  264. inspect_ai/_view/www/src/workspace/navbar/Navbar.module.css +54 -0
  265. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +68 -0
  266. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +52 -0
  267. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +114 -0
  268. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +90 -0
  269. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +180 -0
  270. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.module.css +28 -0
  271. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +226 -0
  272. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.module.css +14 -0
  273. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +61 -0
  274. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.module.css +15 -0
  275. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +71 -0
  276. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.module.css +5 -0
  277. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +56 -0
  278. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +68 -0
  279. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +85 -0
  280. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.module.css +29 -0
  281. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +95 -0
  282. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.module.css +23 -0
  283. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +43 -0
  284. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.module.css +35 -0
  285. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +63 -0
  286. inspect_ai/_view/www/src/workspace/tabs/InfoTab.module.css +0 -0
  287. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +70 -0
  288. inspect_ai/_view/www/src/workspace/tabs/JsonTab.module.css +5 -0
  289. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +46 -0
  290. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +204 -0
  291. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +195 -0
  292. inspect_ai/_view/www/src/workspace/tabs/types.ts +19 -0
  293. inspect_ai/_view/www/src/workspace/types.ts +10 -0
  294. inspect_ai/_view/www/src/workspace/utils.ts +34 -0
  295. inspect_ai/_view/www/tsconfig.json +23 -9
  296. inspect_ai/_view/www/vite.config.js +8 -17
  297. inspect_ai/_view/www/yarn.lock +627 -556
  298. inspect_ai/approval/_approval.py +2 -0
  299. inspect_ai/approval/_approver.py +4 -4
  300. inspect_ai/approval/_auto.py +1 -1
  301. inspect_ai/approval/_human/approver.py +3 -0
  302. inspect_ai/approval/_policy.py +5 -0
  303. inspect_ai/approval/_registry.py +2 -2
  304. inspect_ai/dataset/_dataset.py +64 -37
  305. inspect_ai/dataset/_sources/__init__.py +0 -0
  306. inspect_ai/dataset/_sources/csv.py +20 -12
  307. inspect_ai/dataset/_sources/file.py +4 -0
  308. inspect_ai/dataset/_sources/hf.py +39 -29
  309. inspect_ai/dataset/_sources/json.py +17 -9
  310. inspect_ai/log/__init__.py +2 -0
  311. inspect_ai/log/_convert.py +3 -3
  312. inspect_ai/log/_file.py +24 -9
  313. inspect_ai/log/_log.py +101 -13
  314. inspect_ai/log/_message.py +4 -2
  315. inspect_ai/log/_recorders/file.py +4 -0
  316. inspect_ai/log/_recorders/json.py +5 -7
  317. inspect_ai/log/_recorders/recorder.py +3 -0
  318. inspect_ai/log/_transcript.py +19 -8
  319. inspect_ai/model/__init__.py +2 -0
  320. inspect_ai/model/_cache.py +39 -21
  321. inspect_ai/model/_call_tools.py +4 -3
  322. inspect_ai/model/_chat_message.py +14 -4
  323. inspect_ai/model/_generate_config.py +1 -1
  324. inspect_ai/model/_model.py +31 -24
  325. inspect_ai/model/_model_output.py +14 -1
  326. inspect_ai/model/_openai.py +10 -18
  327. inspect_ai/model/_providers/anthropic.py +3 -3
  328. inspect_ai/model/_providers/google.py +9 -5
  329. inspect_ai/model/_providers/openai.py +5 -9
  330. inspect_ai/model/_providers/openai_o1.py +3 -5
  331. inspect_ai/model/_providers/openrouter.py +86 -0
  332. inspect_ai/model/_providers/providers.py +11 -0
  333. inspect_ai/scorer/__init__.py +6 -1
  334. inspect_ai/scorer/_answer.py +7 -7
  335. inspect_ai/scorer/_classification.py +38 -18
  336. inspect_ai/scorer/_common.py +2 -8
  337. inspect_ai/scorer/_match.py +4 -5
  338. inspect_ai/scorer/_metric.py +87 -28
  339. inspect_ai/scorer/_metrics/__init__.py +3 -3
  340. inspect_ai/scorer/_metrics/accuracy.py +8 -10
  341. inspect_ai/scorer/_metrics/mean.py +3 -17
  342. inspect_ai/scorer/_metrics/std.py +111 -30
  343. inspect_ai/scorer/_model.py +12 -12
  344. inspect_ai/scorer/_pattern.py +3 -3
  345. inspect_ai/scorer/_reducer/reducer.py +36 -21
  346. inspect_ai/scorer/_reducer/registry.py +2 -2
  347. inspect_ai/scorer/_reducer/types.py +7 -1
  348. inspect_ai/scorer/_score.py +11 -1
  349. inspect_ai/scorer/_scorer.py +110 -16
  350. inspect_ai/solver/__init__.py +1 -1
  351. inspect_ai/solver/_basic_agent.py +19 -22
  352. inspect_ai/solver/_bridge/__init__.py +0 -3
  353. inspect_ai/solver/_bridge/bridge.py +3 -3
  354. inspect_ai/solver/_chain.py +1 -2
  355. inspect_ai/solver/_critique.py +3 -3
  356. inspect_ai/solver/_fork.py +2 -2
  357. inspect_ai/solver/_human_agent/__init__.py +0 -0
  358. inspect_ai/solver/_human_agent/agent.py +5 -8
  359. inspect_ai/solver/_human_agent/commands/clock.py +14 -10
  360. inspect_ai/solver/_human_agent/commands/note.py +1 -1
  361. inspect_ai/solver/_human_agent/commands/score.py +0 -11
  362. inspect_ai/solver/_multiple_choice.py +38 -26
  363. inspect_ai/solver/_prompt.py +7 -7
  364. inspect_ai/solver/_solver.py +53 -52
  365. inspect_ai/solver/_task_state.py +80 -69
  366. inspect_ai/solver/_use_tools.py +9 -9
  367. inspect_ai/tool/__init__.py +4 -1
  368. inspect_ai/tool/_tool.py +43 -14
  369. inspect_ai/tool/_tool_call.py +6 -2
  370. inspect_ai/tool/_tool_choice.py +3 -1
  371. inspect_ai/tool/_tool_def.py +10 -8
  372. inspect_ai/tool/_tool_params.py +24 -0
  373. inspect_ai/tool/_tool_with.py +7 -7
  374. inspect_ai/tool/_tools/__init__.py +0 -0
  375. inspect_ai/tool/{beta → _tools}/_computer/_common.py +2 -2
  376. inspect_ai/tool/{beta → _tools}/_computer/_computer.py +13 -5
  377. inspect_ai/tool/_tools/_computer/_resources/tool/__init__.py +0 -0
  378. inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_x11_client.py +1 -1
  379. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  380. inspect_ai/tool/_tools/_execute.py +23 -11
  381. inspect_ai/tool/_tools/_web_browser/_resources/README.md +2 -2
  382. inspect_ai/tool/_tools/_web_browser/_web_browser.py +5 -3
  383. inspect_ai/tool/_tools/_web_search.py +7 -5
  384. inspect_ai/tool/beta.py +3 -0
  385. inspect_ai/util/_concurrency.py +3 -3
  386. inspect_ai/util/_panel.py +2 -0
  387. inspect_ai/util/_resource.py +12 -12
  388. inspect_ai/util/_sandbox/docker/compose.py +23 -20
  389. inspect_ai/util/_sandbox/docker/config.py +2 -1
  390. inspect_ai/util/_sandbox/docker/docker.py +42 -86
  391. inspect_ai/util/_sandbox/docker/service.py +100 -0
  392. inspect_ai/util/_sandbox/environment.py +99 -96
  393. inspect_ai/util/_sandbox/self_check.py +124 -16
  394. inspect_ai/util/_subprocess.py +5 -3
  395. inspect_ai/util/_subtask.py +15 -16
  396. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/LICENSE +1 -1
  397. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/METADATA +11 -6
  398. inspect_ai-0.3.64.dist-info/RECORD +625 -0
  399. inspect_ai/_view/www/src/Register.mjs +0 -3
  400. inspect_ai/_view/www/src/Types.mjs +0 -38
  401. inspect_ai/_view/www/src/appearance/Colors.mjs +0 -27
  402. inspect_ai/_view/www/src/appearance/Fonts.mjs +0 -66
  403. inspect_ai/_view/www/src/appearance/Icons.mjs +0 -240
  404. inspect_ai/_view/www/src/components/AnsiDisplay.mjs +0 -184
  405. inspect_ai/_view/www/src/components/AppErrorBoundary.mjs +0 -34
  406. inspect_ai/_view/www/src/components/AsciiCinemaPlayer.mjs +0 -74
  407. inspect_ai/_view/www/src/components/Card.mjs +0 -126
  408. inspect_ai/_view/www/src/components/ChatView.mjs +0 -441
  409. inspect_ai/_view/www/src/components/CopyButton.mjs +0 -48
  410. inspect_ai/_view/www/src/components/Dialog.mjs +0 -61
  411. inspect_ai/_view/www/src/components/DownloadButton.mjs +0 -15
  412. inspect_ai/_view/www/src/components/DownloadPanel.mjs +0 -29
  413. inspect_ai/_view/www/src/components/EmptyPanel.mjs +0 -23
  414. inspect_ai/_view/www/src/components/ErrorPanel.mjs +0 -66
  415. inspect_ai/_view/www/src/components/ExpandablePanel.mjs +0 -136
  416. inspect_ai/_view/www/src/components/FindBand.mjs +0 -157
  417. inspect_ai/_view/www/src/components/HumanBaselineView.mjs +0 -168
  418. inspect_ai/_view/www/src/components/JsonPanel.mjs +0 -61
  419. inspect_ai/_view/www/src/components/LabeledValue.mjs +0 -32
  420. inspect_ai/_view/www/src/components/LargeModal.mjs +0 -190
  421. inspect_ai/_view/www/src/components/LightboxCarousel.mjs +0 -217
  422. inspect_ai/_view/www/src/components/MarkdownDiv.mjs +0 -118
  423. inspect_ai/_view/www/src/components/MessageBand.mjs +0 -48
  424. inspect_ai/_view/www/src/components/MessageContent.mjs +0 -111
  425. inspect_ai/_view/www/src/components/MetaDataGrid.mjs +0 -92
  426. inspect_ai/_view/www/src/components/MetaDataView.mjs +0 -109
  427. inspect_ai/_view/www/src/components/MorePopOver.mjs +0 -50
  428. inspect_ai/_view/www/src/components/NavPills.mjs +0 -63
  429. inspect_ai/_view/www/src/components/ProgressBar.mjs +0 -51
  430. inspect_ai/_view/www/src/components/RenderedContent/ChatMessageRenderer.mjs +0 -54
  431. inspect_ai/_view/www/src/components/RenderedContent/Types.mjs +0 -19
  432. inspect_ai/_view/www/src/components/TabSet.mjs +0 -184
  433. inspect_ai/_view/www/src/components/ToolButton.mjs +0 -16
  434. inspect_ai/_view/www/src/components/Tools.mjs +0 -376
  435. inspect_ai/_view/www/src/components/VirtualList.mjs +0 -280
  436. inspect_ai/_view/www/src/components/ansi-output.js +0 -932
  437. inspect_ai/_view/www/src/json/JsonTab.mjs +0 -48
  438. inspect_ai/_view/www/src/log-reader/Log-Reader.mjs +0 -25
  439. inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs +0 -13
  440. inspect_ai/_view/www/src/log-reader/Open-AI-Log-Reader.mjs +0 -263
  441. inspect_ai/_view/www/src/navbar/Navbar.mjs +0 -418
  442. inspect_ai/_view/www/src/navbar/SecondaryBar.mjs +0 -175
  443. inspect_ai/_view/www/src/plan/PlanCard.mjs +0 -418
  444. inspect_ai/_view/www/src/samples/SampleDialog.mjs +0 -123
  445. inspect_ai/_view/www/src/samples/SampleDisplay.mjs +0 -516
  446. inspect_ai/_view/www/src/samples/SampleError.mjs +0 -99
  447. inspect_ai/_view/www/src/samples/SampleList.mjs +0 -427
  448. inspect_ai/_view/www/src/samples/SampleScoreView.mjs +0 -172
  449. inspect_ai/_view/www/src/samples/SampleScores.mjs +0 -34
  450. inspect_ai/_view/www/src/samples/SampleTranscript.mjs +0 -20
  451. inspect_ai/_view/www/src/samples/SamplesDescriptor.mjs +0 -771
  452. inspect_ai/_view/www/src/samples/SamplesTab.mjs +0 -399
  453. inspect_ai/_view/www/src/samples/SamplesTools.mjs +0 -64
  454. inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs +0 -38
  455. inspect_ai/_view/www/src/samples/tools/SampleFilter.mjs +0 -756
  456. inspect_ai/_view/www/src/samples/tools/SelectScorer.mjs +0 -141
  457. inspect_ai/_view/www/src/samples/tools/SortFilter.mjs +0 -151
  458. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.mjs +0 -71
  459. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.mjs +0 -44
  460. inspect_ai/_view/www/src/samples/transcript/EventPanel.mjs +0 -271
  461. inspect_ai/_view/www/src/samples/transcript/EventRow.mjs +0 -46
  462. inspect_ai/_view/www/src/samples/transcript/EventSection.mjs +0 -33
  463. inspect_ai/_view/www/src/samples/transcript/InfoEventView.mjs +0 -59
  464. inspect_ai/_view/www/src/samples/transcript/InputEventView.mjs +0 -44
  465. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.mjs +0 -32
  466. inspect_ai/_view/www/src/samples/transcript/ModelEventView.mjs +0 -216
  467. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.mjs +0 -107
  468. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.mjs +0 -74
  469. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.mjs +0 -100
  470. inspect_ai/_view/www/src/samples/transcript/StepEventView.mjs +0 -187
  471. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.mjs +0 -133
  472. inspect_ai/_view/www/src/samples/transcript/ToolEventView.mjs +0 -88
  473. inspect_ai/_view/www/src/samples/transcript/TranscriptView.mjs +0 -459
  474. inspect_ai/_view/www/src/samples/transcript/Types.mjs +0 -44
  475. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.mjs +0 -53
  476. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.mjs +0 -254
  477. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.mjs +0 -313
  478. inspect_ai/_view/www/src/sidebar/Sidebar.mjs +0 -418
  479. inspect_ai/_view/www/src/usage/ModelTokenTable.mjs +0 -72
  480. inspect_ai/_view/www/src/usage/UsageCard.mjs +0 -159
  481. inspect_ai/_view/www/src/utils/Format.mjs +0 -260
  482. inspect_ai/_view/www/src/utils/Git.mjs +0 -12
  483. inspect_ai/_view/www/src/utils/Html.mjs +0 -21
  484. inspect_ai/_view/www/src/utils/attachments.mjs +0 -31
  485. inspect_ai/_view/www/src/utils/debugging.mjs +0 -23
  486. inspect_ai/_view/www/src/utils/http.mjs +0 -18
  487. inspect_ai/_view/www/src/utils/queue.mjs +0 -67
  488. inspect_ai/_view/www/src/utils/sync.mjs +0 -101
  489. inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs +0 -17
  490. inspect_ai/_view/www/src/workspace/WorkSpace.mjs +0 -516
  491. inspect_ai/tool/beta/__init__.py +0 -5
  492. inspect_ai-0.3.62.dist-info/RECORD +0 -481
  493. /inspect_ai/{tool/beta/_computer/_resources/tool → _eval}/__init__.py +0 -0
  494. /inspect_ai/{tool/beta/_computer/_resources/tool/requirements.txt → _util/__init__.py} +0 -0
  495. /inspect_ai/_view/www/src/{constants.mjs → constants.ts} +0 -0
  496. /inspect_ai/tool/{beta → _tools}/_computer/__init__.py +0 -0
  497. /inspect_ai/tool/{beta → _tools}/_computer/_computer_split.py +0 -0
  498. /inspect_ai/tool/{beta → _tools}/_computer/_resources/Dockerfile +0 -0
  499. /inspect_ai/tool/{beta → _tools}/_computer/_resources/README.md +0 -0
  500. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/entrypoint.sh +0 -0
  501. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/novnc_startup.sh +0 -0
  502. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -0
  503. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xfce_startup.sh +0 -0
  504. /inspect_ai/tool/{beta → _tools}/_computer/_resources/entrypoint/xvfb_startup.sh +0 -0
  505. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  506. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -0
  507. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -0
  508. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -0
  509. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -0
  510. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -0
  511. /inspect_ai/tool/{beta → _tools}/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -0
  512. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_logger.py +0 -0
  513. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_run.py +0 -0
  514. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/_tool_result.py +0 -0
  515. /inspect_ai/tool/{beta → _tools}/_computer/_resources/tool/computer_tool.py +0 -0
  516. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/WHEEL +0 -0
  517. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/entry_points.txt +0 -0
  518. {inspect_ai-0.3.62.dist-info → inspect_ai-0.3.64.dist-info}/top_level.txt +0 -0
@@ -149,7 +149,11 @@ class ModelAPI(abc.ABC):
149
149
  return "default"
150
150
 
151
151
  def is_rate_limit(self, ex: BaseException) -> bool:
152
- """Is this exception a rate limit error."""
152
+ """Is this exception a rate limit error.
153
+
154
+ Args:
155
+ ex: Exception to check for rate limit.
156
+ """
153
157
  return False
154
158
 
155
159
  def collapse_user_messages(self) -> bool:
@@ -176,12 +180,18 @@ class ModelAPI(abc.ABC):
176
180
  class Model:
177
181
  """Model interface."""
178
182
 
183
+ api: ModelAPI
184
+ """Model API."""
185
+
186
+ config: GenerateConfig
187
+ """Generation config."""
188
+
179
189
  def __init__(self, api: ModelAPI, config: GenerateConfig) -> None:
180
190
  """Create a model.
181
191
 
182
192
  Args:
183
- api (ModelAPI): Model API provider.
184
- config (GenerateConfig): Model configuration.
193
+ api: Model API provider.
194
+ config: Model configuration.
185
195
  """
186
196
  self.api = api
187
197
  self.config = config
@@ -212,16 +222,12 @@ class Model:
212
222
  """Generate output from the model.
213
223
 
214
224
  Args:
215
- input (str | list[ChatMessage]): Chat message
216
- input (if a `str` is passed it is converted
225
+ input: Chat message input (if a `str` is passed it is converted
217
226
  to a `ChatMessageUser`).
218
- tools (list[Tool] | list[ToolDef] | list[ToolInfo]): Tools available for the
219
- model to call.
220
- tool_choice (ToolChoice): Directives to the model
221
- as to which tools to prefer.
222
- cache (bool | CachePolicy): Caching behavior for
223
- generate responses (defaults to no caching).
224
- config (GenerateConfig): Model configuration.
227
+ tools: Tools available for the model to call.
228
+ tool_choice: Directives to the model as to which tools to prefer.
229
+ config: Model configuration.
230
+ cache: Caching behavior for generate responses (defaults to no caching).
225
231
 
226
232
  Returns:
227
233
  ModelOutput
@@ -517,7 +523,8 @@ class Model:
517
523
  ) -> None:
518
524
  # trace
519
525
  if isinstance(result, ModelOutput):
520
- conversation_assistant_message(input, result.choices[0].message)
526
+ if result.choices:
527
+ conversation_assistant_message(input, result.choices[0].message)
521
528
  event.output = result
522
529
  else:
523
530
  conversation_assistant_error(result)
@@ -550,7 +557,7 @@ class ModelName:
550
557
  """Create a ModelName.
551
558
 
552
559
  Args:
553
- model: (str | Model): Model to create name for.
560
+ model: Model to create name for.
554
561
  """
555
562
  if isinstance(model, str):
556
563
  (api, name) = self._parse_model(model)
@@ -596,16 +603,16 @@ def get_model(
596
603
  """Get an instance of a model.
597
604
 
598
605
  Args:
599
- model (str | Model | None): Model specification.
600
- If `Model` is passed it is returned unmodified,
601
- if `None` is passed then the model currently being
602
- evaluated is returned (or if there is no evaluation
603
- then the model referred to by `INSPECT_EVAL_MODEL`).
604
- config (GenerateConfig): Configuration for model.
605
- base_url (str | None): Optional. Alternate base URL for model.
606
- api_key (str | None): Optional. API key for model.
607
- **model_args (dict[str,Any]): Additional args to
608
- pass to model constructor.
606
+ model: Model specification.
607
+ If `Model` is passed it is returned unmodified,
608
+ if `None` is passed then the model currently being
609
+ evaluated is returned (or if there is no evaluation
610
+ then the model referred to by `INSPECT_EVAL_MODEL`).
611
+ config: Configuration for model.
612
+ base_url: Optional. Alternate base URL for model.
613
+ api_key: Optional. API key for model.
614
+ **model_args: Additional args to
615
+ pass to model constructor.
609
616
 
610
617
  Returns:
611
618
  Model instance.
@@ -9,6 +9,8 @@ from ._chat_message import ChatMessageAssistant
9
9
 
10
10
 
11
11
  class ModelUsage(BaseModel):
12
+ """Token usage for completion."""
13
+
12
14
  input_tokens: int = Field(default=0)
13
15
  """Total input tokens used."""
14
16
 
@@ -73,6 +75,8 @@ class Logprobs(BaseModel):
73
75
 
74
76
 
75
77
  class ChatCompletionChoice(BaseModel):
78
+ """Choice generated for completion."""
79
+
76
80
  message: ChatMessageAssistant
77
81
  """Assistant message."""
78
82
 
@@ -96,6 +100,8 @@ class ChatCompletionChoice(BaseModel):
96
100
 
97
101
 
98
102
  class ModelOutput(BaseModel):
103
+ """Output from model generation."""
104
+
99
105
  model: str = Field(default_factory=str)
100
106
  """Model used for generation."""
101
107
 
@@ -155,7 +161,14 @@ class ModelOutput(BaseModel):
155
161
  stop_reason: StopReason = "stop",
156
162
  error: str | None = None,
157
163
  ) -> "ModelOutput":
158
- """Convenient method to create ModelOutput from simple text content."""
164
+ """Create ModelOutput from simple text content.
165
+
166
+ Args:
167
+ model: Model name.
168
+ content: Text content from generation.
169
+ stop_reason: Stop reason for generation.
170
+ error: Error message.
171
+ """
159
172
  return ModelOutput(
160
173
  model=model,
161
174
  choices=[
@@ -1,4 +1,5 @@
1
1
  import json
2
+ import re
2
3
  from typing import Literal
3
4
 
4
5
  from openai.types.chat import (
@@ -44,29 +45,13 @@ from ._model_output import ModelUsage, StopReason, as_stop_reason
44
45
 
45
46
 
46
47
  def is_o_series(name: str) -> bool:
47
- return is_o1(name) or is_o3(name)
48
-
49
-
50
- def is_o1(name: str) -> bool:
51
- return name.startswith("o1")
52
-
53
-
54
- def is_o3(name: str) -> bool:
55
- return name.startswith("o3")
56
-
57
-
58
- def is_o1_full(name: str) -> bool:
59
- return is_o1(name) and not is_o1_mini(name) and not is_o1_preview(name)
48
+ return bool(re.match(r"^o\d+", name))
60
49
 
61
50
 
62
51
  def is_o1_mini(name: str) -> bool:
63
52
  return name.startswith("o1-mini")
64
53
 
65
54
 
66
- def is_o3_mini(name: str) -> bool:
67
- return name.startswith("o3-mini")
68
-
69
-
70
55
  def is_o1_preview(name: str) -> bool:
71
56
  return name.startswith("o1-preview")
72
57
 
@@ -132,10 +117,17 @@ async def openai_chat_message(
132
117
  message: ChatMessage, model: str
133
118
  ) -> ChatCompletionMessageParam:
134
119
  if message.role == "system":
135
- if is_o1(model):
120
+ # o1-mini does not support developer or system messages
121
+ # (see Dec 17, 2024 changelog: https://platform.openai.com/docs/changelog)
122
+ if is_o1_mini(model):
123
+ return ChatCompletionUserMessageParam(role="user", content=message.text)
124
+ # other o-series models use 'developer' rather than 'system' messages
125
+ # https://platform.openai.com/docs/guides/reasoning#advice-on-prompting
126
+ elif is_o_series(model):
136
127
  return ChatCompletionDeveloperMessageParam(
137
128
  role="developer", content=message.text
138
129
  )
130
+ # gpt models use standard 'system' messages
139
131
  else:
140
132
  return ChatCompletionSystemMessageParam(
141
133
  role=message.role, content=message.text
@@ -216,6 +216,9 @@ class AnthropicAPI(ModelAPI):
216
216
  # return output and call
217
217
  return output, model_call()
218
218
 
219
+ except BadRequestError as ex:
220
+ return self.handle_bad_request(ex), model_call()
221
+
219
222
  except APIStatusError as ex:
220
223
  if ex.status_code == 413:
221
224
  return ModelOutput.from_content(
@@ -227,9 +230,6 @@ class AnthropicAPI(ModelAPI):
227
230
  else:
228
231
  raise ex
229
232
 
230
- except BadRequestError as ex:
231
- return self.handle_bad_request(ex), model_call()
232
-
233
233
  def completion_params(self, config: GenerateConfig) -> dict[str, Any]:
234
234
  params = dict(model=self.model_name, max_tokens=cast(int, config.max_tokens))
235
235
  if config.temperature is not None:
@@ -5,7 +5,7 @@ import json
5
5
  from copy import copy
6
6
  from io import BytesIO
7
7
  from logging import getLogger
8
- from typing import Any, cast
8
+ from typing import Any, MutableSequence, cast
9
9
 
10
10
  import proto # type: ignore
11
11
  from google.ai.generativelanguage import (
@@ -553,11 +553,15 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoi
553
553
 
554
554
 
555
555
  def completion_choices_from_candidates(
556
- candidates: list[Candidate],
556
+ candidates: MutableSequence[Candidate],
557
557
  ) -> list[ChatCompletionChoice]:
558
- candidates = copy(candidates)
559
- candidates.sort(key=lambda c: c.index)
560
- return [completion_choice_from_candidate(candidate) for candidate in candidates]
558
+ if candidates:
559
+ candidates_list = sorted(candidates, key=lambda c: c.index)
560
+ return [
561
+ completion_choice_from_candidate(candidate) for candidate in candidates_list
562
+ ]
563
+ else:
564
+ return []
561
565
 
562
566
 
563
567
  # google doesn't export FinishReason (it's in a sub-namespace with a beta
@@ -36,10 +36,8 @@ from .._model_output import (
36
36
  )
37
37
  from .._openai import (
38
38
  is_gpt,
39
- is_o1_full,
40
39
  is_o1_mini,
41
40
  is_o1_preview,
42
- is_o3,
43
41
  is_o_series,
44
42
  openai_chat_messages,
45
43
  openai_chat_tool_choice,
@@ -145,15 +143,9 @@ class OpenAIAPI(ModelAPI):
145
143
  def is_o_series(self) -> bool:
146
144
  return is_o_series(self.model_name)
147
145
 
148
- def is_o1_full(self) -> bool:
149
- return is_o1_full(self.model_name)
150
-
151
146
  def is_o1_mini(self) -> bool:
152
147
  return is_o1_mini(self.model_name)
153
148
 
154
- def is_o3(self) -> bool:
155
- return is_o3(self.model_name)
156
-
157
149
  def is_o1_preview(self) -> bool:
158
150
  return is_o1_preview(self.model_name)
159
151
 
@@ -303,7 +295,11 @@ class OpenAIAPI(ModelAPI):
303
295
  params["top_logprobs"] = config.top_logprobs
304
296
  if tools and config.parallel_tool_calls is not None and not self.is_o_series():
305
297
  params["parallel_tool_calls"] = config.parallel_tool_calls
306
- if config.reasoning_effort is not None and not self.is_gpt():
298
+ if (
299
+ config.reasoning_effort is not None
300
+ and not self.is_gpt()
301
+ and not self.is_o1_mini()
302
+ ):
307
303
  params["reasoning_effort"] = config.reasoning_effort
308
304
 
309
305
  return params
@@ -27,11 +27,7 @@ from inspect_ai.tool import ToolCall, ToolInfo
27
27
  from .._call_tools import parse_tool_call, tool_parse_error_message
28
28
  from .._model_call import ModelCall
29
29
  from .._model_output import ModelUsage, StopReason, as_stop_reason
30
- from .._providers.util import (
31
- ChatAPIHandler,
32
- ChatAPIMessage,
33
- chat_api_input,
34
- )
30
+ from .._providers.util import ChatAPIHandler, ChatAPIMessage, chat_api_input
35
31
 
36
32
  logger = getLogger(__name__)
37
33
 
@@ -85,6 +81,8 @@ def handle_bad_request(model: str, ex: BadRequestError) -> ModelOutput | Excepti
85
81
  stop_reason: StopReason | None = "model_length"
86
82
  elif ex.code == "invalid_prompt":
87
83
  stop_reason = "content_filter"
84
+ else:
85
+ stop_reason = None
88
86
 
89
87
  if stop_reason:
90
88
  return ModelOutput.from_content(
@@ -0,0 +1,86 @@
1
+ import os
2
+ from typing import Any
3
+
4
+ from typing_extensions import override
5
+
6
+ from inspect_ai._util.error import PrerequisiteError
7
+ from inspect_ai.model._providers.util import model_base_url
8
+ from inspect_ai.model._providers.util.util import environment_prerequisite_error
9
+
10
+ from .._generate_config import GenerateConfig
11
+ from .openai import OpenAIAPI
12
+
13
+ OPENROUTER_API_KEY = "OPENROUTER_API_KEY"
14
+
15
+
16
+ class OpenRouterAPI(OpenAIAPI):
17
+ def __init__(
18
+ self,
19
+ model_name: str,
20
+ base_url: str | None = None,
21
+ api_key: str | None = None,
22
+ config: GenerateConfig = GenerateConfig(),
23
+ **model_args: Any,
24
+ ) -> None:
25
+ # api_key
26
+ if not api_key:
27
+ api_key = os.environ.get(OPENROUTER_API_KEY, None)
28
+ if not api_key:
29
+ raise environment_prerequisite_error("OpenRouter", OPENROUTER_API_KEY)
30
+
31
+ # base_url
32
+ base_url = model_base_url(base_url, "OPENROUTER_BASE_URL")
33
+ base_url = base_url if base_url else "https://openrouter.ai/api/v1"
34
+
35
+ # collect known model args that we forward to generate
36
+ def collect_model_arg(name: str) -> Any | None:
37
+ nonlocal model_args
38
+ value = model_args.get(name, None)
39
+ if value is not None:
40
+ model_args.pop(name)
41
+ return value
42
+
43
+ # models arg
44
+ self.models = collect_model_arg("models")
45
+ if self.models is not None:
46
+ if not isinstance(self.models, list):
47
+ raise PrerequisiteError("models must be a list of strings")
48
+
49
+ # provider arg
50
+ self.provider = collect_model_arg("provider")
51
+ if self.provider is not None:
52
+ if not isinstance(self.provider, dict):
53
+ raise PrerequisiteError("provider must be a dict")
54
+
55
+ # transforms arg
56
+ self.transforms = collect_model_arg("transforms")
57
+ if self.transforms is not None:
58
+ if not isinstance(self.transforms, list):
59
+ raise PrerequisiteError("transforms must be a list of strings")
60
+
61
+ # call super
62
+ super().__init__(
63
+ model_name=model_name,
64
+ base_url=base_url,
65
+ api_key=api_key,
66
+ config=config,
67
+ **model_args,
68
+ )
69
+
70
+ @override
71
+ def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, Any]:
72
+ # default params
73
+ params = super().completion_params(config, tools)
74
+
75
+ # pass args if specified
76
+ EXTRA_BODY = "extra_body"
77
+ if self.models or self.provider or self.transforms:
78
+ params[EXTRA_BODY] = params.get(EXTRA_BODY, {})
79
+ if self.models:
80
+ params[EXTRA_BODY]["models"] = self.models
81
+ if self.provider:
82
+ params[EXTRA_BODY]["provider"] = self.provider
83
+ if self.transforms:
84
+ params[EXTRA_BODY]["transforms"] = self.transforms
85
+
86
+ return params
@@ -198,6 +198,17 @@ def ollama() -> type[ModelAPI]:
198
198
  return OllamaAPI
199
199
 
200
200
 
201
+ @modelapi(name="openrouter")
202
+ def openrouter() -> type[ModelAPI]:
203
+ # validate
204
+ validate_openai_client("OpenRouter API")
205
+
206
+ # in the clear
207
+ from .openrouter import OpenRouterAPI
208
+
209
+ return OpenRouterAPI
210
+
211
+
201
212
  @modelapi(name="llama-cpp-python")
202
213
  def llama_cpp_python() -> type[ModelAPI]:
203
214
  # validate
@@ -10,6 +10,8 @@ from ._metric import (
10
10
  NOANSWER,
11
11
  PARTIAL,
12
12
  Metric,
13
+ MetricProtocol,
14
+ SampleScore,
13
15
  Score,
14
16
  Value,
15
17
  ValueToFloat,
@@ -18,7 +20,7 @@ from ._metric import (
18
20
  )
19
21
  from ._metrics.accuracy import accuracy
20
22
  from ._metrics.mean import mean
21
- from ._metrics.std import bootstrap_stderr, std, stderr
23
+ from ._metrics.std import bootstrap_stderr, std, stderr, var
22
24
  from ._model import model_graded_fact, model_graded_qa
23
25
  from ._multi import multi_scorer
24
26
  from ._pattern import pattern
@@ -56,9 +58,12 @@ __all__ = [
56
58
  "std",
57
59
  "stderr",
58
60
  "mean",
61
+ "var",
59
62
  "Metric",
63
+ "MetricProtocol",
60
64
  "metric",
61
65
  "Score",
66
+ "SampleScore",
62
67
  "score",
63
68
  "Value",
64
69
  "ValueToFloat",
@@ -8,7 +8,7 @@ from inspect_ai._util.pattern import (
8
8
  )
9
9
 
10
10
  from ._metrics import accuracy, stderr
11
- from ._pattern import pattern
11
+ from ._pattern import pattern as make_pattern
12
12
  from ._scorer import Scorer, scorer
13
13
 
14
14
 
@@ -33,7 +33,7 @@ class AnswerPattern(str, Enum):
33
33
 
34
34
 
35
35
  @scorer(metrics=[accuracy(), stderr()])
36
- def answer(type: Literal["letter", "word", "line"]) -> Scorer:
36
+ def answer(pattern: Literal["letter", "word", "line"]) -> Scorer:
37
37
  """Scorer for model output that preceded answers with ANSWER:.
38
38
 
39
39
  Some solvers including multiple_choice solicit answers from
@@ -43,7 +43,7 @@ def answer(type: Literal["letter", "word", "line"]) -> Scorer:
43
43
  Note that you must specify a `pattern` for the answer scorer.
44
44
 
45
45
  Args:
46
- type: (Literal["letter", "word", "line"]): Type of answer
46
+ pattern: Type of answer
47
47
  to extract. "letter" is used with multiple choice and
48
48
  extracts a single letter; "word" will extract the next
49
49
  word (often used for yes/no answers); "line" will take
@@ -53,10 +53,10 @@ def answer(type: Literal["letter", "word", "line"]) -> Scorer:
53
53
  with a separate line at the end.
54
54
 
55
55
  """
56
- match type:
56
+ match pattern:
57
57
  case "letter":
58
- return pattern(AnswerPattern.LETTER)
58
+ return make_pattern(AnswerPattern.LETTER)
59
59
  case "word":
60
- return pattern(AnswerPattern.WORD)
60
+ return make_pattern(AnswerPattern.WORD)
61
61
  case "line":
62
- return pattern(AnswerPattern.LINE)
62
+ return make_pattern(AnswerPattern.LINE)
@@ -12,11 +12,15 @@ from ._target import Target
12
12
 
13
13
  @scorer(metrics=[mean(), stderr()])
14
14
  def f1(
15
- answer_fn: Callable[[str], str] | None = None,
15
+ answer_fn: Callable[[str], str] | None = None, stop_words: list[str] | None = None
16
16
  ) -> Scorer:
17
17
  """Scorer which produces an F1 score
18
18
 
19
19
  Computes the `F1` score for the answer (which balances recall and precision by taking the harmonic mean between recall and precision).
20
+
21
+ Args:
22
+ answer_fn: Custom function to extract the answer from the completion (defaults to using the completion).
23
+ stop_words: Stop words to remove from the answer and target during tokenization.
20
24
  """
21
25
 
22
26
  async def score(state: TaskState, target: Target) -> Score:
@@ -26,7 +30,7 @@ def f1(
26
30
  )
27
31
  targets = target.target
28
32
 
29
- f1_score = max_f1_score(answer, targets)
33
+ f1_score = max_f1_score(answer, targets, stop_words=stop_words)
30
34
  return Score(
31
35
  value=f1_score,
32
36
  answer=answer,
@@ -53,12 +57,14 @@ def exact() -> Scorer:
53
57
  return score
54
58
 
55
59
 
56
- def max_f1_score(answer: str, targets: List[str]) -> float:
60
+ def max_f1_score(
61
+ answer: str, targets: List[str], stop_words: list[str] | None = None
62
+ ) -> float:
57
63
  # Find the maximum F1 score for this answer
58
64
  max_f1 = 0.0
59
65
  for target in targets:
60
66
  if target[0].strip():
61
- f1_score = compute_f1(answer, target)
67
+ f1_score = compute_f1(answer, target, stop_words)
62
68
  max_f1 = max(max_f1, f1_score)
63
69
  return round(max_f1, 2)
64
70
 
@@ -75,18 +81,16 @@ def max_exact_score(answer: str, targets: List[str]) -> float:
75
81
  return max_exact
76
82
 
77
83
 
78
- def compute_f1(answer: str, target: str) -> float:
84
+ def compute_f1(answer: str, target: str, stop_words: list[str] | None = None) -> float:
79
85
  """Takes a predicted answer and a gold answer (both strings) and returns the SQuAD F1 metric for the prediction."""
80
- answer_words = _to_words(answer)
81
- target_words = _to_words(target)
86
+ answer_words = _to_words(answer, stop_words)
87
+ target_words = _to_words(target, stop_words)
82
88
 
83
89
  return _f1(answer_words=answer_words, target_words=target_words)
84
90
 
85
91
 
86
- def _to_words(
87
- answer: str,
88
- ) -> set[str]:
89
- normalized = _normalize(answer)
92
+ def _to_words(answer: str, stop_words: list[str] | None = None) -> set[str]:
93
+ normalized = _normalize(answer, stop_words)
90
94
  token_bag = set(normalized.split())
91
95
  return token_bag
92
96
 
@@ -147,16 +151,32 @@ def _tokenize(text: str) -> List[str]:
147
151
  return re.split(" |-", text)
148
152
 
149
153
 
150
- def _normalize(answer: str) -> str:
154
+ def _normalize(text: str, stop_words: list[str] | None = None) -> str:
151
155
  """Normalize text to remove extraneous characters and words."""
152
156
  tokens = []
153
- tokenized_answer = _tokenize(answer)
157
+ tokenized_answer = _tokenize(text)
158
+
159
+ # Process stop words, if present
160
+ if stop_words is not None:
161
+ folded_stop_words = [_normalize_token(word) for word in stop_words]
162
+ else:
163
+ folded_stop_words = []
164
+
165
+ # Now process the text
154
166
  for token in tokenized_answer:
155
- token = _remove_punc(token.casefold())
156
- token = _normalize_number(token)
157
- token = _remove_articles(token)
158
- token = _normalize_whitespace(token)
159
- tokens.append(token)
167
+ token = _normalize_token(token)
168
+ if folded_stop_words is None or token not in folded_stop_words:
169
+ tokens.append(token)
170
+
171
+ # re-join the tokens into a normalized string
160
172
  tokens = [token for token in tokens if token.strip()]
161
173
  normalized = " ".join(tokens).strip()
162
174
  return normalized
175
+
176
+
177
+ def _normalize_token(token: str) -> str:
178
+ token = _remove_punc(token.casefold())
179
+ token = _normalize_number(token)
180
+ token = _remove_articles(token)
181
+ token = _normalize_whitespace(token)
182
+ return token
@@ -25,19 +25,13 @@ def str_match_scorer(match: Callable[[str, str], tuple[str, bool]]) -> Scorer:
25
25
  for value in target:
26
26
  answer, matched = match(state.output.completion, value)
27
27
  if matched:
28
- explanation = (
29
- state.output.completion
30
- if state.output.completion != answer
31
- else None
32
- )
33
28
  return Score(
34
29
  value=CORRECT, answer=answer, explanation=state.output.completion
35
30
  )
36
31
 
37
- explanation = (
38
- state.output.completion if state.output.completion != answer else None
32
+ return Score(
33
+ value=INCORRECT, answer=answer, explanation=state.output.completion
39
34
  )
40
- return Score(value=INCORRECT, answer=answer, explanation=explanation)
41
35
 
42
36
  return score
43
37
 
@@ -15,12 +15,11 @@ def match(
15
15
  """Scorer which matches text or a number.
16
16
 
17
17
  Args:
18
- location (Literal["begin", "end", "any", "exact"]):
19
- Location to match at. "any" matches anywhere in the
18
+ location: Location to match at. "any" matches anywhere in the
20
19
  output; "exact" requires the output be exactly
21
20
  equal to the target (modulo whitespace, etc.)
22
- ignore_case (bool): Do case insensitive comparison.
23
- numeric (bool): Is this a numeric match? (in this
21
+ ignore_case: Do case insensitive comparison.
22
+ numeric: Is this a numeric match? (in this
24
23
  case different punctuation removal rules are
25
24
  used and numbers are normalized before comparison).
26
25
  """
@@ -42,7 +41,7 @@ def includes(ignore_case: bool = True) -> Scorer:
42
41
  """Check whether the specified text is included in the model output.
43
42
 
44
43
  Args:
45
- ignore_case (bool): Use a case insensitive comparison.
44
+ ignore_case: Use a case insensitive comparison.
46
45
 
47
46
  """
48
47