inspect-ai 0.3.81__py3-none-any.whl → 0.3.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (297)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/eval.py +35 -2
  3. inspect_ai/_cli/util.py +44 -1
  4. inspect_ai/_display/core/config.py +1 -1
  5. inspect_ai/_display/core/display.py +13 -4
  6. inspect_ai/_display/core/results.py +1 -1
  7. inspect_ai/_display/textual/app.py +14 -3
  8. inspect_ai/_display/textual/display.py +4 -0
  9. inspect_ai/_display/textual/widgets/samples.py +9 -3
  10. inspect_ai/_display/textual/widgets/task_detail.py +8 -8
  11. inspect_ai/_display/textual/widgets/tasks.py +17 -1
  12. inspect_ai/_display/textual/widgets/vscode.py +44 -0
  13. inspect_ai/_eval/eval.py +74 -25
  14. inspect_ai/_eval/evalset.py +22 -18
  15. inspect_ai/_eval/loader.py +34 -11
  16. inspect_ai/_eval/run.py +13 -15
  17. inspect_ai/_eval/score.py +13 -3
  18. inspect_ai/_eval/task/generate.py +8 -9
  19. inspect_ai/_eval/task/log.py +55 -6
  20. inspect_ai/_eval/task/run.py +51 -10
  21. inspect_ai/_eval/task/task.py +23 -9
  22. inspect_ai/_util/constants.py +2 -0
  23. inspect_ai/_util/file.py +30 -1
  24. inspect_ai/_util/json.py +37 -1
  25. inspect_ai/_util/registry.py +1 -0
  26. inspect_ai/_util/vscode.py +37 -0
  27. inspect_ai/_view/server.py +113 -1
  28. inspect_ai/_view/www/App.css +7 -1
  29. inspect_ai/_view/www/dist/assets/index.css +813 -415
  30. inspect_ai/_view/www/dist/assets/index.js +54475 -32003
  31. inspect_ai/_view/www/eslint.config.mjs +1 -1
  32. inspect_ai/_view/www/log-schema.json +137 -31
  33. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  34. inspect_ai/_view/www/package.json +11 -2
  35. inspect_ai/_view/www/src/App.tsx +161 -853
  36. inspect_ai/_view/www/src/api/api-browser.ts +176 -5
  37. inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
  38. inspect_ai/_view/www/src/api/client-api.ts +66 -10
  39. inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
  40. inspect_ai/_view/www/src/api/types.ts +107 -2
  41. inspect_ai/_view/www/src/appearance/icons.ts +2 -0
  42. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
  43. inspect_ai/_view/www/src/components/Card.tsx +6 -4
  44. inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
  45. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
  46. inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
  47. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
  48. inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
  49. inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
  50. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
  51. inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
  52. inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
  53. inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
  54. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
  55. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +116 -26
  56. inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
  57. inspect_ai/_view/www/src/components/Modal.module.css +38 -0
  58. inspect_ai/_view/www/src/components/Modal.tsx +77 -0
  59. inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
  60. inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
  61. inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
  62. inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
  63. inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
  64. inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
  65. inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
  66. inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
  67. inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
  68. inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
  69. inspect_ai/_view/www/src/index.tsx +26 -94
  70. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
  71. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
  72. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
  73. inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
  74. inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
  75. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
  76. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
  77. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
  78. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +74 -28
  79. inspect_ai/_view/www/src/samples/SampleDialog.tsx +58 -22
  80. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
  81. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +135 -104
  82. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +10 -0
  83. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +83 -36
  84. inspect_ai/_view/www/src/samples/SamplesTools.tsx +35 -30
  85. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
  86. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
  87. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
  88. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +6 -1
  89. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +5 -0
  90. inspect_ai/_view/www/src/samples/chat/messages.ts +36 -0
  91. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
  92. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +11 -1
  93. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
  94. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +34 -20
  95. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
  96. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
  97. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
  98. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -10
  99. inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
  100. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +22 -3
  101. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +27 -2
  102. inspect_ai/_view/www/src/samples/list/SampleList.tsx +122 -85
  103. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
  104. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +28 -15
  105. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
  106. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
  107. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
  108. inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
  109. inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
  110. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +12 -27
  111. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
  112. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
  113. inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
  114. inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
  115. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
  116. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
  117. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
  118. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
  119. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
  120. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
  121. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
  122. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
  123. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
  124. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
  125. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
  126. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
  127. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +33 -17
  128. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
  129. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
  130. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
  131. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
  132. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +81 -60
  133. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
  134. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
  135. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +29 -1
  136. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
  137. inspect_ai/_view/www/src/scoring/utils.ts +87 -0
  138. inspect_ai/_view/www/src/state/appSlice.ts +244 -0
  139. inspect_ai/_view/www/src/state/hooks.ts +399 -0
  140. inspect_ai/_view/www/src/state/logPolling.ts +200 -0
  141. inspect_ai/_view/www/src/state/logSlice.ts +224 -0
  142. inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
  143. inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
  144. inspect_ai/_view/www/src/state/samplePolling.ts +314 -0
  145. inspect_ai/_view/www/src/state/sampleSlice.ts +140 -0
  146. inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
  147. inspect_ai/_view/www/src/state/scrolling.ts +206 -0
  148. inspect_ai/_view/www/src/state/store.ts +168 -0
  149. inspect_ai/_view/www/src/state/store_filter.ts +84 -0
  150. inspect_ai/_view/www/src/state/utils.ts +23 -0
  151. inspect_ai/_view/www/src/storage/index.ts +26 -0
  152. inspect_ai/_view/www/src/types/log.d.ts +36 -26
  153. inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
  154. inspect_ai/_view/www/src/types.ts +94 -32
  155. inspect_ai/_view/www/src/utils/attachments.ts +58 -23
  156. inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
  157. inspect_ai/_view/www/src/utils/logger.ts +52 -0
  158. inspect_ai/_view/www/src/utils/polling.ts +100 -0
  159. inspect_ai/_view/www/src/utils/react.ts +30 -0
  160. inspect_ai/_view/www/src/utils/vscode.ts +1 -1
  161. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +184 -217
  162. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
  163. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
  164. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
  165. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
  166. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -1
  167. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +159 -103
  168. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
  169. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
  170. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
  171. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
  172. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +12 -14
  173. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
  174. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
  175. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
  176. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
  177. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
  178. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
  179. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
  180. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
  181. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +128 -115
  182. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
  183. inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
  184. inspect_ai/_view/www/src/workspace/types.ts +4 -3
  185. inspect_ai/_view/www/src/workspace/utils.ts +4 -4
  186. inspect_ai/_view/www/vite.config.js +6 -0
  187. inspect_ai/_view/www/yarn.lock +464 -355
  188. inspect_ai/agent/__init__.py +36 -0
  189. inspect_ai/agent/_agent.py +268 -0
  190. inspect_ai/agent/_as_solver.py +72 -0
  191. inspect_ai/agent/_as_tool.py +122 -0
  192. inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
  193. inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
  194. inspect_ai/agent/_filter.py +46 -0
  195. inspect_ai/agent/_handoff.py +93 -0
  196. inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
  197. inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
  198. inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
  199. inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
  200. inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
  201. inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
  202. inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
  203. inspect_ai/agent/_react.py +241 -0
  204. inspect_ai/agent/_run.py +36 -0
  205. inspect_ai/agent/_types.py +81 -0
  206. inspect_ai/log/_condense.py +26 -0
  207. inspect_ai/log/_log.py +17 -5
  208. inspect_ai/log/_recorders/buffer/__init__.py +14 -0
  209. inspect_ai/log/_recorders/buffer/buffer.py +30 -0
  210. inspect_ai/log/_recorders/buffer/database.py +685 -0
  211. inspect_ai/log/_recorders/buffer/filestore.py +259 -0
  212. inspect_ai/log/_recorders/buffer/types.py +84 -0
  213. inspect_ai/log/_recorders/eval.py +2 -11
  214. inspect_ai/log/_recorders/types.py +30 -0
  215. inspect_ai/log/_transcript.py +32 -2
  216. inspect_ai/model/__init__.py +7 -1
  217. inspect_ai/model/_call_tools.py +257 -52
  218. inspect_ai/model/_chat_message.py +7 -4
  219. inspect_ai/model/_conversation.py +13 -62
  220. inspect_ai/model/_display.py +85 -0
  221. inspect_ai/model/_generate_config.py +2 -2
  222. inspect_ai/model/_model.py +114 -14
  223. inspect_ai/model/_model_output.py +14 -9
  224. inspect_ai/model/_openai.py +16 -4
  225. inspect_ai/model/_openai_computer_use.py +162 -0
  226. inspect_ai/model/_openai_responses.py +319 -165
  227. inspect_ai/model/_providers/anthropic.py +20 -21
  228. inspect_ai/model/_providers/azureai.py +24 -13
  229. inspect_ai/model/_providers/bedrock.py +1 -7
  230. inspect_ai/model/_providers/cloudflare.py +3 -3
  231. inspect_ai/model/_providers/goodfire.py +2 -6
  232. inspect_ai/model/_providers/google.py +11 -10
  233. inspect_ai/model/_providers/groq.py +6 -3
  234. inspect_ai/model/_providers/hf.py +7 -3
  235. inspect_ai/model/_providers/mistral.py +7 -10
  236. inspect_ai/model/_providers/openai.py +47 -17
  237. inspect_ai/model/_providers/openai_o1.py +11 -4
  238. inspect_ai/model/_providers/openai_responses.py +12 -14
  239. inspect_ai/model/_providers/providers.py +2 -2
  240. inspect_ai/model/_providers/together.py +12 -2
  241. inspect_ai/model/_providers/util/chatapi.py +7 -2
  242. inspect_ai/model/_providers/util/hf_handler.py +4 -2
  243. inspect_ai/model/_providers/util/llama31.py +4 -2
  244. inspect_ai/model/_providers/vertex.py +11 -9
  245. inspect_ai/model/_providers/vllm.py +4 -4
  246. inspect_ai/scorer/__init__.py +2 -0
  247. inspect_ai/scorer/_metrics/__init__.py +2 -0
  248. inspect_ai/scorer/_metrics/grouped.py +84 -0
  249. inspect_ai/scorer/_score.py +26 -6
  250. inspect_ai/solver/__init__.py +2 -2
  251. inspect_ai/solver/_basic_agent.py +22 -9
  252. inspect_ai/solver/_bridge.py +31 -0
  253. inspect_ai/solver/_chain.py +20 -12
  254. inspect_ai/solver/_fork.py +5 -1
  255. inspect_ai/solver/_human_agent.py +52 -0
  256. inspect_ai/solver/_prompt.py +3 -1
  257. inspect_ai/solver/_run.py +59 -0
  258. inspect_ai/solver/_solver.py +14 -4
  259. inspect_ai/solver/_task_state.py +5 -3
  260. inspect_ai/tool/_tool_call.py +15 -8
  261. inspect_ai/tool/_tool_def.py +17 -12
  262. inspect_ai/tool/_tool_support_helpers.py +4 -4
  263. inspect_ai/tool/_tool_with.py +14 -11
  264. inspect_ai/tool/_tools/_bash_session.py +11 -2
  265. inspect_ai/tool/_tools/_computer/_common.py +18 -2
  266. inspect_ai/tool/_tools/_computer/_computer.py +18 -2
  267. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
  268. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
  269. inspect_ai/tool/_tools/_think.py +1 -1
  270. inspect_ai/tool/_tools/_web_browser/_web_browser.py +103 -62
  271. inspect_ai/util/__init__.py +2 -0
  272. inspect_ai/util/_anyio.py +27 -0
  273. inspect_ai/util/_sandbox/__init__.py +2 -1
  274. inspect_ai/util/_sandbox/context.py +32 -7
  275. inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
  276. inspect_ai/util/_sandbox/docker/compose.py +2 -2
  277. inspect_ai/util/_sandbox/docker/docker.py +12 -1
  278. inspect_ai/util/_store_model.py +30 -7
  279. inspect_ai/util/_subprocess.py +13 -3
  280. inspect_ai/util/_subtask.py +1 -0
  281. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
  282. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +295 -229
  283. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -169
  284. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
  285. /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
  286. /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
  287. /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
  288. /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
  289. /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
  290. /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
  291. /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
  292. /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
  293. /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
  294. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
  295. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
  296. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
  297. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
@@ -4,6 +4,7 @@ from typing import Any, Iterator, Literal, cast
4
4
 
5
5
  from shortuuid import uuid
6
6
 
7
+ from inspect_ai._display.core.display import TaskDisplayMetric
7
8
  from inspect_ai._eval.task.util import slice_dataset
8
9
  from inspect_ai._util.constants import PKG_NAME
9
10
  from inspect_ai._util.datetime import iso_now
@@ -34,6 +35,9 @@ from inspect_ai.log._log import (
34
35
  eval_config_defaults,
35
36
  )
36
37
  from inspect_ai.log._recorders import Recorder
38
+ from inspect_ai.log._recorders.buffer import SampleBufferDatabase
39
+ from inspect_ai.log._recorders.types import SampleEvent, SampleSummary
40
+ from inspect_ai.log._transcript import Event
37
41
  from inspect_ai.model import (
38
42
  GenerateConfig,
39
43
  Model,
@@ -53,6 +57,7 @@ class TaskLogger:
53
57
  task_name: str,
54
58
  task_version: int,
55
59
  task_file: str | None,
60
+ task_registry_name: str | None,
56
61
  task_id: str | None,
57
62
  run_id: str,
58
63
  solver: SolverSpec | None,
@@ -127,6 +132,7 @@ class TaskLogger:
127
132
  task_id=task_id if task_id else uuid(),
128
133
  task_version=task_version,
129
134
  task_file=task_file,
135
+ task_registry_name=task_registry_name,
130
136
  task_attribs=task_attribs,
131
137
  task_args=task_args,
132
138
  solver=solver.solver if solver else None,
@@ -159,10 +165,15 @@ class TaskLogger:
159
165
 
160
166
  # size of flush buffer (how many samples we buffer before hitting storage)
161
167
  self.flush_buffer = eval_config.log_buffer or recorder.default_log_buffer()
162
- self.flush_pending = 0
168
+ self.flush_pending: list[tuple[str | int, int]] = []
163
169
 
164
170
  async def init(self) -> None:
165
171
  self._location = await self.recorder.log_init(self.eval)
172
+ self._buffer_db = SampleBufferDatabase(
173
+ location=self._location,
174
+ log_images=self.eval.config.log_images is not False,
175
+ log_shared=self.eval.config.log_shared,
176
+ )
166
177
 
167
178
  @property
168
179
  def location(self) -> str:
@@ -174,22 +185,53 @@ class TaskLogger:
174
185
 
175
186
  async def log_start(self, plan: EvalPlan) -> None:
176
187
  await self.recorder.log_start(self.eval, plan)
188
+ await self.recorder.flush(self.eval)
189
+
190
+ async def start_sample(self, sample: SampleSummary) -> None:
191
+ self._buffer_db.start_sample(sample)
192
+
193
+ def log_sample_event(self, id: str | int, epoch: int, event: Event) -> None:
194
+ # log the sample event
195
+ self._buffer_db.log_events([SampleEvent(id=id, epoch=epoch, event=event)])
177
196
 
178
- async def log_sample(self, sample: EvalSample, *, flush: bool) -> None:
197
+ async def complete_sample(self, sample: EvalSample, *, flush: bool) -> None:
179
198
  # log the sample
180
199
  await self.recorder.log_sample(self.eval, sample)
181
200
 
201
+ # mark complete
202
+ self._buffer_db.complete_sample(
203
+ SampleSummary(
204
+ id=sample.id,
205
+ epoch=sample.epoch,
206
+ input=sample.input,
207
+ target=sample.target,
208
+ completed=True,
209
+ scores=sample.scores,
210
+ error=sample.error.message if sample.error is not None else None,
211
+ limit=f"{sample.limit.type}" if sample.limit is not None else None,
212
+ )
213
+ )
214
+
182
215
  # flush if requested
183
216
  if flush:
184
- self.flush_pending += 1
185
- if self.flush_pending >= self.flush_buffer:
217
+ self.flush_pending.append((sample.id, sample.epoch))
218
+ if len(self.flush_pending) >= self.flush_buffer:
219
+ # flush to disk
186
220
  await self.recorder.flush(self.eval)
187
- self.flush_pending = 0
221
+
222
+ # notify the event db it can remove these
223
+ self._buffer_db.remove_samples(self.flush_pending)
224
+
225
+ # Clear
226
+ self.flush_pending.clear()
188
227
 
189
228
  # track sucessful samples logged
190
229
  if sample.error is None:
191
230
  self._samples_completed += 1
192
231
 
232
+ def update_metrics(self, metrics: list[TaskDisplayMetric]) -> None:
233
+ self._buffer_db.update_metrics(metrics)
234
+
193
235
  async def log_finish(
194
236
  self,
195
237
  status: Literal["success", "cancelled", "error"],
@@ -198,10 +240,17 @@ class TaskLogger:
198
240
  reductions: list[EvalSampleReductions] | None = None,
199
241
  error: EvalError | None = None,
200
242
  ) -> EvalLog:
201
- return await self.recorder.log_finish(
243
+ # finish and get log
244
+ log = await self.recorder.log_finish(
202
245
  self.eval, status, stats, results, reductions, error
203
246
  )
204
247
 
248
+ # cleanup the events db
249
+ self._buffer_db.cleanup()
250
+
251
+ # return log
252
+ return log
253
+
205
254
 
206
255
  async def log_start(
207
256
  logger: TaskLogger,
@@ -19,7 +19,7 @@ from inspect_ai._display import (
19
19
  TaskSuccess,
20
20
  display,
21
21
  )
22
- from inspect_ai._display.core.display import TaskDisplay, TaskDisplayMetric
22
+ from inspect_ai._display.core.display import TaskDisplayMetric
23
23
  from inspect_ai._util._async import tg_collect
24
24
  from inspect_ai._util.constants import (
25
25
  DEFAULT_EPOCHS,
@@ -29,6 +29,7 @@ from inspect_ai._util.constants import (
29
29
  from inspect_ai._util.datetime import iso_now
30
30
  from inspect_ai._util.error import exception_message
31
31
  from inspect_ai._util.hooks import send_telemetry
32
+ from inspect_ai._util.json import to_json_str_safe
32
33
  from inspect_ai._util.registry import (
33
34
  is_registry_object,
34
35
  registry_log_name,
@@ -51,13 +52,17 @@ from inspect_ai.log import (
51
52
  from inspect_ai.log._condense import condense_sample
52
53
  from inspect_ai.log._file import eval_log_json_str
53
54
  from inspect_ai.log._log import EvalSampleLimit, EvalSampleReductions, eval_error
54
- from inspect_ai.log._samples import active_sample
55
+ from inspect_ai.log._recorders.types import SampleSummary
56
+ from inspect_ai.log._samples import (
57
+ active_sample,
58
+ )
55
59
  from inspect_ai.log._transcript import (
56
60
  ErrorEvent,
57
61
  SampleInitEvent,
58
62
  SampleLimitEvent,
59
63
  ScoreEvent,
60
64
  StepEvent,
65
+ Transcript,
61
66
  transcript,
62
67
  )
63
68
  from inspect_ai.model import (
@@ -264,8 +269,13 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
264
269
 
265
270
  # track when samples complete and update progress as we go
266
271
  progress_results: list[dict[str, SampleScore]] = []
272
+
273
+ def update_metrics(metrics: list[TaskDisplayMetric]) -> None:
274
+ td.update_metrics(metrics)
275
+ logger.update_metrics(metrics)
276
+
267
277
  update_metrics_display = update_metrics_display_fn(
268
- td,
278
+ update_metrics,
269
279
  display_metrics=profile.eval_config.score_display is not False,
270
280
  )
271
281
 
@@ -423,7 +433,7 @@ async def task_run(options: TaskRunOptions) -> EvalLog:
423
433
 
424
434
 
425
435
  def update_metrics_display_fn(
426
- td: TaskDisplay,
436
+ update_fn: Callable[[list[TaskDisplayMetric]], None],
427
437
  initial_interval: float = 0,
428
438
  min_interval: float = 0.9,
429
439
  display_metrics: bool = True,
@@ -463,7 +473,7 @@ def update_metrics_display_fn(
463
473
  )
464
474
 
465
475
  # Name, reducer, value
466
- task_metrics = []
476
+ task_metrics: list[TaskDisplayMetric] = []
467
477
  if len(results.scores) > 0:
468
478
  for score in results.scores:
469
479
  for key, metric in score.metrics.items():
@@ -475,7 +485,7 @@ def update_metrics_display_fn(
475
485
  reducer=score.reducer,
476
486
  )
477
487
  )
478
- td.update_metrics(task_metrics)
488
+ update_fn(task_metrics)
479
489
 
480
490
  # determine how long to wait before recomputing metrics
481
491
  time_end = time.perf_counter()
@@ -516,7 +526,7 @@ async def task_run_sample(
516
526
 
517
527
  # log if requested
518
528
  if logger:
519
- await logger.log_sample(previous_sample, flush=False)
529
+ await logger.complete_sample(previous_sample, flush=False)
520
530
 
521
531
  # return score
522
532
  sample_scores = (
@@ -539,10 +549,19 @@ async def task_run_sample(
539
549
  semaphore if semaphore else contextlib.nullcontext()
540
550
  )
541
551
 
552
+ # validate that we have sample_id (mostly for the typechecker)
553
+ sample_id = sample.id
554
+ if sample_id is None:
555
+ raise ValueError("sample must have id to run")
556
+
542
557
  # initialise subtask and scoring context
543
558
  init_sample_model_usage()
544
559
  set_sample_state(state)
545
- sample_transcript = init_subtask(SAMPLE_SUBTASK, state.store)
560
+ sample_transcript: Transcript = init_subtask(SAMPLE_SUBTASK, state.store)
561
+ if logger:
562
+ sample_transcript._subscribe(
563
+ lambda event: logger.log_sample_event(sample_id, state.epoch, event)
564
+ )
546
565
  if scorers:
547
566
  init_scoring_context(scorers, Target(sample.target))
548
567
 
@@ -626,6 +645,28 @@ async def task_run_sample(
626
645
  # mark started
627
646
  active.started = datetime.now().timestamp()
628
647
 
648
+ if logger is not None:
649
+ await logger.start_sample(
650
+ SampleSummary(
651
+ id=sample_id,
652
+ epoch=state.epoch,
653
+ input=sample.input,
654
+ target=sample.target,
655
+ )
656
+ )
657
+
658
+ # sample init event (remove file bodies as they have content or absolute paths)
659
+ event_sample = sample.model_copy(
660
+ update=dict(files={k: "" for k in sample.files.keys()})
661
+ if sample.files
662
+ else None
663
+ )
664
+ transcript()._event(
665
+ SampleInitEvent(
666
+ sample=event_sample, state=state_jsonable(state)
667
+ )
668
+ )
669
+
629
670
  # set progress for plan then run it
630
671
  state = await plan(state, generate)
631
672
 
@@ -824,7 +865,7 @@ async def log_sample(
824
865
  id = sample.id
825
866
  if id is None:
826
867
  raise ValueError(
827
- f"Samples without IDs cannot be logged: {sample.model_dump_json()}"
868
+ f"Samples without IDs cannot be logged: {to_json_str_safe(sample)}"
828
869
  )
829
870
 
830
871
  # construct sample for logging
@@ -866,7 +907,7 @@ async def log_sample(
866
907
  limit=limit,
867
908
  )
868
909
 
869
- await logger.log_sample(condense_sample(eval_sample, log_images), flush=True)
910
+ await logger.complete_sample(condense_sample(eval_sample, log_images), flush=True)
870
911
 
871
912
 
872
913
  async def resolve_dataset(
@@ -1,4 +1,3 @@
1
- from copy import deepcopy
2
1
  from dataclasses import dataclass
3
2
  from logging import getLogger
4
3
  from typing import Any, Awaitable, Callable, Sequence, cast
@@ -9,6 +8,8 @@ from typing_extensions import TypedDict, Unpack
9
8
  from inspect_ai._util.logger import warn_once
10
9
  from inspect_ai._util.notgiven import NOT_GIVEN, NotGiven
11
10
  from inspect_ai._util.registry import is_registry_object, registry_info
11
+ from inspect_ai.agent._agent import Agent, is_agent
12
+ from inspect_ai.agent._as_solver import as_solver
12
13
  from inspect_ai.approval._policy import ApprovalPolicy, approval_policies_from_config
13
14
  from inspect_ai.dataset import Dataset, MemoryDataset, Sample
14
15
  from inspect_ai.log import EvalLog
@@ -47,7 +48,7 @@ class Task:
47
48
  self,
48
49
  dataset: Dataset | Sequence[Sample] | None = None,
49
50
  setup: Solver | list[Solver] | None = None,
50
- solver: Solver | list[Solver] = generate(),
51
+ solver: Solver | Agent | list[Solver] = generate(),
51
52
  cleanup: Callable[[TaskState], Awaitable[None]] | None = None,
52
53
  scorer: Scorer | list[Scorer] | None = None,
53
54
  metrics: list[Metric] | dict[str, list[Metric]] | None = None,
@@ -158,6 +159,13 @@ class Task:
158
159
  else:
159
160
  return "task"
160
161
 
162
+ @property
163
+ def registry_name(self) -> str | None:
164
+ if is_registry_object(self):
165
+ return registry_info(self).name
166
+ else:
167
+ return None
168
+
161
169
  @property
162
170
  def attribs(self) -> dict[str, Any]:
163
171
  if is_registry_object(self):
@@ -191,8 +199,12 @@ def task_with(
191
199
  ) -> Task:
192
200
  """Task adapted with alternate values for one or more options.
193
201
 
202
+ This function modifies the passed task in place and returns it.
203
+ If you want to create multiple variations of a single task using
204
+ `task_with()` you should create the underlying task multiple times.
205
+
194
206
  Args:
195
- task: Task to adapt (it is deep copied prior to mutating options)
207
+ task: Task to adapt
196
208
  dataset: Dataset to evaluate
197
209
  setup: Setup step (always run even when the main `solver` is replaced).
198
210
  solver: Solver or list of solvers. Defaults to generate(), a normal call to the model.
@@ -227,11 +239,8 @@ def task_with(
227
239
  metadata: Additional metadata to associate with the task.
228
240
 
229
241
  Returns:
230
- Task: Task adapted with alternate options.
242
+ Task: Passed `task` with modifications.
231
243
  """
232
- # deep copy the task
233
- task = deepcopy(task)
234
-
235
244
  if not isinstance(dataset, NotGiven):
236
245
  task.dataset = resolve_dataset(dataset)
237
246
  if not isinstance(setup, NotGiven):
@@ -340,8 +349,13 @@ def resolve_dataset(dataset: Dataset | Sequence[Sample] | None) -> Dataset:
340
349
  return dataset if isinstance(dataset, Dataset) else MemoryDataset(list(dataset))
341
350
 
342
351
 
343
- def resolve_solver(solver: Solver | list[Solver]) -> Solver:
344
- return chain(solver) if isinstance(solver, list) else solver
352
def resolve_solver(solver: Solver | Agent | list[Solver]) -> Solver:
    """Normalize the accepted solver inputs to a single Solver.

    Args:
        solver: A solver, an agent, or a list of solvers.

    Returns:
        A list is combined with `chain()`, an agent is converted with
        `as_solver()`, and anything else is returned as-is.
    """
    # a list of solvers becomes a single chained solver
    if isinstance(solver, list):
        return chain(solver)

    # agents are adapted to the solver interface
    if is_agent(solver):
        return as_solver(solver)

    # otherwise the value is treated as a Solver already
    return cast(Solver, solver)
345
359
 
346
360
 
347
361
  def resolve_model(model: str | Model | None) -> Model | None:
@@ -25,8 +25,10 @@ ALL_LOG_LEVELS = [
25
25
  ]
26
26
  DEFAULT_LOG_LEVEL = "warning"
27
27
  DEFAULT_LOG_LEVEL_TRANSCRIPT = "info"
28
+ DEFAULT_LOG_SHARED = 10
28
29
  ALL_LOG_FORMATS = ["eval", "json"]
29
30
  DEFAULT_LOG_FORMAT: Literal["eval", "json"] = "eval"
31
+ JSON_LOG_FORMAT = "json"
30
32
  EVAL_LOG_FORMAT = "eval"
31
33
  DEFAULT_DISPLAY = "full"
32
34
  LOG_SCHEMA_VERSION = 2
inspect_ai/_util/file.py CHANGED
@@ -13,7 +13,7 @@ from urllib.parse import urlparse
13
13
  import fsspec # type: ignore # type: ignore
14
14
  from fsspec.core import split_protocol # type: ignore # type: ignore
15
15
  from fsspec.implementations.local import make_path_posix # type: ignore
16
- from pydantic import BaseModel
16
+ from pydantic import BaseModel, Field
17
17
  from s3fs import S3FileSystem # type: ignore
18
18
  from shortuuid import uuid
19
19
 
@@ -158,6 +158,9 @@ class FileInfo(BaseModel):
158
158
  mtime: float | None
159
159
  """File modification time (None if the file is a directory on S3)."""
160
160
 
161
+ etag: str | None = Field(default=None)
162
+ """Etag (provided by some remote filesystems)"""
163
+
161
164
 
162
165
  class FileSystem:
163
166
  def __init__(self, fs: Any) -> None:
@@ -178,6 +181,9 @@ class FileSystem:
178
181
  ) -> None:
179
182
  self.fs.rm(path, recursive=recursive, maxdepth=maxdepth)
180
183
 
184
def mv(self, lpath: str, rpath: str) -> None:
    """Move `lpath` to `rpath` by delegating to the wrapped filesystem's `mv`."""
    wrapped_fs = self.fs
    wrapped_fs.mv(lpath, rpath)
186
+
181
187
  def mkdir(self, path: str, exist_ok: bool = False) -> None:
182
188
  if self.is_s3():
183
189
  # try to avoid calling create_bucket on s3 filesystems (as that requires distinct
@@ -199,6 +205,9 @@ class FileSystem:
199
205
  def info(self, path: str, **kwargs: dict[str, Any]) -> FileInfo:
200
206
  return self._file_info(self.fs.info(path, **kwargs))
201
207
 
208
def path_as_uri(self, path: str) -> str:
    """Return `path` as given back by the wrapped filesystem's `unstrip_protocol`, coerced to `str`."""
    with_protocol = self.fs.unstrip_protocol(path)
    return str(with_protocol)
210
+
202
211
  def ls(
203
212
  self, path: str, recursive: bool = False, **kwargs: dict[str, Any]
204
213
  ) -> list[FileInfo]:
@@ -267,11 +276,18 @@ class FileSystem:
267
276
  else:
268
277
  file["mtime"] = None
269
278
 
279
+ # S3 filesystems provided an ETag
280
+ if "ETag" in file.keys():
281
+ etag: str | None = file["ETag"].strip('"')
282
+ else:
283
+ etag = None
284
+
270
285
  return FileInfo(
271
286
  name=file["name"],
272
287
  type=file["type"],
273
288
  size=file["size"],
274
289
  mtime=file["mtime"],
290
+ etag=etag,
275
291
  )
276
292
 
277
293
 
@@ -306,6 +322,19 @@ def absolute_file_path(file: str) -> str:
306
322
  return file
307
323
 
308
324
 
325
def to_uri(path_or_uri: str) -> str:
    """Convert a local file path to a `file://` URI, leaving existing URIs untouched.

    Args:
        path_or_uri: A URI (e.g. `s3://bucket/key`) or a local file path
            (relative or absolute).

    Returns:
        `path_or_uri` unchanged when it already carries a URI scheme,
        otherwise the absolute form of the path rendered as a `file://` URI.
    """
    scheme = urlparse(path_or_uri).scheme

    # urlparse reports the drive letter of a Windows path (e.g. "C:\\logs")
    # as a one-character scheme; only longer schemes denote a real URI.
    if len(scheme) > 1:
        return path_or_uri

    # It's a file path: make it absolute and render it as a file:// URI
    return Path(path_or_uri).absolute().as_uri()
336
+
337
+
309
338
  def default_fs_options(file: str) -> dict[str, Any]:
310
339
  scheme = urlparse(file).scheme
311
340
  if (
inspect_ai/_util/json.py CHANGED
@@ -6,7 +6,10 @@ from typing import (
6
6
 
7
7
  import jsonpatch
8
8
  from pydantic import BaseModel, Field, JsonValue
9
- from pydantic_core import to_jsonable_python
9
+ from pydantic_core import to_json, to_jsonable_python
10
+
11
+ JSONType = Literal["string", "integer", "number", "boolean", "array", "object", "null"]
12
+ """Valid types within JSON schema."""
10
13
 
11
14
 
12
15
  def jsonable_python(x: Any) -> Any:
@@ -23,6 +26,39 @@ def jsonable_dict(x: Any) -> dict[str, JsonValue]:
23
26
  )
24
27
 
25
28
 
29
def to_json_safe(x: Any) -> bytes:
    """Serialize `x` to JSON bytes with `to_json`.

    Output is indented by 2, `None` values are excluded, and the fallback
    handler passed to `to_json` always returns `None`.
    """

    def _none_fallback(_x: Any) -> None:
        return None

    return to_json(value=x, indent=2, exclude_none=True, fallback=_none_fallback)
31
+
32
+
33
def to_json_str_safe(x: Any) -> str:
    """Return the output of `to_json_safe(x)` decoded to `str`."""
    encoded = to_json_safe(x)
    return encoded.decode()
35
+
36
+
37
def python_type_to_json_type(python_type: str | None) -> JSONType:
    """Map a Python type name to the corresponding JSON schema type name.

    Args:
        python_type: Python type name (`"str"`, `"int"`, `"float"`,
            `"bool"`, `"list"`, `"dict"`, `"None"`), or `None` when the
            type is unknown.

    Returns:
        The JSON schema type name; an unknown (`None`) type maps to `"string"`.

    Raises:
        ValueError: If `python_type` is any other value.
    """
    known_types: tuple[tuple[str, JSONType], ...] = (
        ("str", "string"),
        ("int", "integer"),
        ("float", "number"),
        ("bool", "boolean"),
        ("list", "array"),
        ("dict", "object"),
        ("None", "null"),
    )
    for type_name, json_type in known_types:
        if python_type == type_name:
            return json_type

    # treat 'unknown' as string as anything can be converted to string
    if python_type is None:
        return "string"

    raise ValueError(
        f"Unsupported type: {python_type} for Python to JSON conversion."
    )
60
+
61
+
26
62
  class JsonChange(BaseModel):
27
63
  """Describes a change to data using JSON Patch format."""
28
64
 
@@ -21,6 +21,7 @@ RegistryType = Literal[
21
21
  "scorer",
22
22
  "metric",
23
23
  "tool",
24
+ "agent",
24
25
  "sandboxenv",
25
26
  "score_reducer",
26
27
  "approver",
@@ -1,13 +1,19 @@
1
1
  import os
2
+ from logging import getLogger
2
3
  from pathlib import Path
3
4
  from typing import Any
4
5
 
5
6
  from pydantic import BaseModel, Field
6
7
  from pydantic_core import to_json
8
+ from semver import Version
7
9
  from shortuuid import uuid
8
10
 
9
11
  from .appdirs import inspect_data_dir
10
12
 
13
+ logger = getLogger(__name__)
14
+
15
+ EXTENSION_COMMAND_VERSIONS = {"inspect.openLogViewer": Version(0, 3, 61)}
16
+
11
17
 
12
18
  class VSCodeCommand(BaseModel):
13
19
  command: str
@@ -34,6 +40,25 @@ def can_execute_vscode_commands() -> bool:
34
40
  return vs_code_commands_dir() is not None
35
41
 
36
42
 
43
def can_execute_vscode_command(command: str) -> bool:
    """Report whether the given VS Code command can be executed.

    Args:
        command: Command identifier, looked up in `EXTENSION_COMMAND_VERSIONS`.

    Returns:
        False when VS Code commands cannot be executed at all. Otherwise
        True when the command has no version requirement, or the result of
        `has_vscode_version()` for its required version.
    """
    if not can_execute_vscode_commands():
        return False

    # commands without an entry carry no version requirement
    minimum_version = EXTENSION_COMMAND_VERSIONS.get(command)
    return minimum_version is None or has_vscode_version(minimum_version)
52
+
53
+
54
def has_vscode_version(required_version: Version) -> bool:
    """Report whether the VS Code extension is at least `required_version`.

    Args:
        required_version: Minimum extension version needed.

    Returns:
        False when the extension version is unavailable (see
        `vscode_extension_version()`), otherwise whether the current
        version is greater than or equal to `required_version`.
    """
    current_version = vscode_extension_version()
    if current_version is None:
        return False

    # Use ordering rather than Version.is_compatible(): for 0.x versions
    # is_compatible() is only True for an identical version, which would
    # reject every extension release newer than the required one.
    return current_version >= required_version
60
+
61
+
37
62
  def vs_code_commands_dir() -> Path | None:
38
63
  workspace_id = vscode_workspace_id()
39
64
  if workspace_id:
@@ -49,3 +74,15 @@ def vs_code_commands_dir() -> Path | None:
49
74
 
50
75
  def vscode_workspace_id() -> str | None:
51
76
  return os.environ.get("INSPECT_WORKSPACE_ID", None)
77
+
78
+
79
def vscode_extension_version() -> Version | None:
    """Read the extension version from the `INSPECT_VSCODE_EXT_VERSION` environment variable.

    Returns:
        The parsed `Version`, or None when the variable is unset or its
        value fails to parse (a warning is logged in the latter case).
    """
    raw_version = os.environ.get("INSPECT_VSCODE_EXT_VERSION", None)
    if raw_version is None:
        return None

    try:
        return Version.parse(raw_version)
    except Exception:
        logger.warning(f"Invalid Inspect vscode extension version: {raw_version}")
        return None