inspect-ai 0.3.81__py3-none-any.whl → 0.3.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (297)
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/eval.py +35 -2
  3. inspect_ai/_cli/util.py +44 -1
  4. inspect_ai/_display/core/config.py +1 -1
  5. inspect_ai/_display/core/display.py +13 -4
  6. inspect_ai/_display/core/results.py +1 -1
  7. inspect_ai/_display/textual/app.py +14 -3
  8. inspect_ai/_display/textual/display.py +4 -0
  9. inspect_ai/_display/textual/widgets/samples.py +9 -3
  10. inspect_ai/_display/textual/widgets/task_detail.py +8 -8
  11. inspect_ai/_display/textual/widgets/tasks.py +17 -1
  12. inspect_ai/_display/textual/widgets/vscode.py +44 -0
  13. inspect_ai/_eval/eval.py +74 -25
  14. inspect_ai/_eval/evalset.py +22 -18
  15. inspect_ai/_eval/loader.py +34 -11
  16. inspect_ai/_eval/run.py +13 -15
  17. inspect_ai/_eval/score.py +13 -3
  18. inspect_ai/_eval/task/generate.py +8 -9
  19. inspect_ai/_eval/task/log.py +55 -6
  20. inspect_ai/_eval/task/run.py +51 -10
  21. inspect_ai/_eval/task/task.py +23 -9
  22. inspect_ai/_util/constants.py +2 -0
  23. inspect_ai/_util/file.py +30 -1
  24. inspect_ai/_util/json.py +37 -1
  25. inspect_ai/_util/registry.py +1 -0
  26. inspect_ai/_util/vscode.py +37 -0
  27. inspect_ai/_view/server.py +113 -1
  28. inspect_ai/_view/www/App.css +7 -1
  29. inspect_ai/_view/www/dist/assets/index.css +813 -415
  30. inspect_ai/_view/www/dist/assets/index.js +54475 -32003
  31. inspect_ai/_view/www/eslint.config.mjs +1 -1
  32. inspect_ai/_view/www/log-schema.json +137 -31
  33. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  34. inspect_ai/_view/www/package.json +11 -2
  35. inspect_ai/_view/www/src/App.tsx +161 -853
  36. inspect_ai/_view/www/src/api/api-browser.ts +176 -5
  37. inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
  38. inspect_ai/_view/www/src/api/client-api.ts +66 -10
  39. inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
  40. inspect_ai/_view/www/src/api/types.ts +107 -2
  41. inspect_ai/_view/www/src/appearance/icons.ts +2 -0
  42. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
  43. inspect_ai/_view/www/src/components/Card.tsx +6 -4
  44. inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
  45. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
  46. inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
  47. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
  48. inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
  49. inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
  50. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
  51. inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
  52. inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
  53. inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
  54. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
  55. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +116 -26
  56. inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
  57. inspect_ai/_view/www/src/components/Modal.module.css +38 -0
  58. inspect_ai/_view/www/src/components/Modal.tsx +77 -0
  59. inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
  60. inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
  61. inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
  62. inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
  63. inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
  64. inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
  65. inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
  66. inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
  67. inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
  68. inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
  69. inspect_ai/_view/www/src/index.tsx +26 -94
  70. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
  71. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
  72. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
  73. inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
  74. inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
  75. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
  76. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
  77. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
  78. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +74 -28
  79. inspect_ai/_view/www/src/samples/SampleDialog.tsx +58 -22
  80. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
  81. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +135 -104
  82. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +10 -0
  83. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +83 -36
  84. inspect_ai/_view/www/src/samples/SamplesTools.tsx +35 -30
  85. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
  86. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
  87. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
  88. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +6 -1
  89. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +5 -0
  90. inspect_ai/_view/www/src/samples/chat/messages.ts +36 -0
  91. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
  92. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +11 -1
  93. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
  94. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +34 -20
  95. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
  96. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
  97. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
  98. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -10
  99. inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
  100. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +22 -3
  101. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +27 -2
  102. inspect_ai/_view/www/src/samples/list/SampleList.tsx +122 -85
  103. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
  104. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +28 -15
  105. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
  106. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
  107. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
  108. inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
  109. inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
  110. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +12 -27
  111. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
  112. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
  113. inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
  114. inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
  115. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
  116. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
  117. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
  118. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
  119. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
  120. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
  121. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
  122. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
  123. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
  124. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
  125. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
  126. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
  127. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +33 -17
  128. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
  129. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
  130. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
  131. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
  132. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +81 -60
  133. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
  134. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
  135. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +29 -1
  136. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
  137. inspect_ai/_view/www/src/scoring/utils.ts +87 -0
  138. inspect_ai/_view/www/src/state/appSlice.ts +244 -0
  139. inspect_ai/_view/www/src/state/hooks.ts +399 -0
  140. inspect_ai/_view/www/src/state/logPolling.ts +200 -0
  141. inspect_ai/_view/www/src/state/logSlice.ts +224 -0
  142. inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
  143. inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
  144. inspect_ai/_view/www/src/state/samplePolling.ts +314 -0
  145. inspect_ai/_view/www/src/state/sampleSlice.ts +140 -0
  146. inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
  147. inspect_ai/_view/www/src/state/scrolling.ts +206 -0
  148. inspect_ai/_view/www/src/state/store.ts +168 -0
  149. inspect_ai/_view/www/src/state/store_filter.ts +84 -0
  150. inspect_ai/_view/www/src/state/utils.ts +23 -0
  151. inspect_ai/_view/www/src/storage/index.ts +26 -0
  152. inspect_ai/_view/www/src/types/log.d.ts +36 -26
  153. inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
  154. inspect_ai/_view/www/src/types.ts +94 -32
  155. inspect_ai/_view/www/src/utils/attachments.ts +58 -23
  156. inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
  157. inspect_ai/_view/www/src/utils/logger.ts +52 -0
  158. inspect_ai/_view/www/src/utils/polling.ts +100 -0
  159. inspect_ai/_view/www/src/utils/react.ts +30 -0
  160. inspect_ai/_view/www/src/utils/vscode.ts +1 -1
  161. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +184 -217
  162. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
  163. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
  164. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
  165. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
  166. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -1
  167. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +159 -103
  168. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
  169. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
  170. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
  171. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
  172. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +12 -14
  173. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
  174. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
  175. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
  176. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
  177. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
  178. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
  179. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
  180. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
  181. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +128 -115
  182. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
  183. inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
  184. inspect_ai/_view/www/src/workspace/types.ts +4 -3
  185. inspect_ai/_view/www/src/workspace/utils.ts +4 -4
  186. inspect_ai/_view/www/vite.config.js +6 -0
  187. inspect_ai/_view/www/yarn.lock +464 -355
  188. inspect_ai/agent/__init__.py +36 -0
  189. inspect_ai/agent/_agent.py +268 -0
  190. inspect_ai/agent/_as_solver.py +72 -0
  191. inspect_ai/agent/_as_tool.py +122 -0
  192. inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
  193. inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
  194. inspect_ai/agent/_filter.py +46 -0
  195. inspect_ai/agent/_handoff.py +93 -0
  196. inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
  197. inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
  198. inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
  199. inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
  200. inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
  201. inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
  202. inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
  203. inspect_ai/agent/_react.py +241 -0
  204. inspect_ai/agent/_run.py +36 -0
  205. inspect_ai/agent/_types.py +81 -0
  206. inspect_ai/log/_condense.py +26 -0
  207. inspect_ai/log/_log.py +17 -5
  208. inspect_ai/log/_recorders/buffer/__init__.py +14 -0
  209. inspect_ai/log/_recorders/buffer/buffer.py +30 -0
  210. inspect_ai/log/_recorders/buffer/database.py +685 -0
  211. inspect_ai/log/_recorders/buffer/filestore.py +259 -0
  212. inspect_ai/log/_recorders/buffer/types.py +84 -0
  213. inspect_ai/log/_recorders/eval.py +2 -11
  214. inspect_ai/log/_recorders/types.py +30 -0
  215. inspect_ai/log/_transcript.py +32 -2
  216. inspect_ai/model/__init__.py +7 -1
  217. inspect_ai/model/_call_tools.py +257 -52
  218. inspect_ai/model/_chat_message.py +7 -4
  219. inspect_ai/model/_conversation.py +13 -62
  220. inspect_ai/model/_display.py +85 -0
  221. inspect_ai/model/_generate_config.py +2 -2
  222. inspect_ai/model/_model.py +114 -14
  223. inspect_ai/model/_model_output.py +14 -9
  224. inspect_ai/model/_openai.py +16 -4
  225. inspect_ai/model/_openai_computer_use.py +162 -0
  226. inspect_ai/model/_openai_responses.py +319 -165
  227. inspect_ai/model/_providers/anthropic.py +20 -21
  228. inspect_ai/model/_providers/azureai.py +24 -13
  229. inspect_ai/model/_providers/bedrock.py +1 -7
  230. inspect_ai/model/_providers/cloudflare.py +3 -3
  231. inspect_ai/model/_providers/goodfire.py +2 -6
  232. inspect_ai/model/_providers/google.py +11 -10
  233. inspect_ai/model/_providers/groq.py +6 -3
  234. inspect_ai/model/_providers/hf.py +7 -3
  235. inspect_ai/model/_providers/mistral.py +7 -10
  236. inspect_ai/model/_providers/openai.py +47 -17
  237. inspect_ai/model/_providers/openai_o1.py +11 -4
  238. inspect_ai/model/_providers/openai_responses.py +12 -14
  239. inspect_ai/model/_providers/providers.py +2 -2
  240. inspect_ai/model/_providers/together.py +12 -2
  241. inspect_ai/model/_providers/util/chatapi.py +7 -2
  242. inspect_ai/model/_providers/util/hf_handler.py +4 -2
  243. inspect_ai/model/_providers/util/llama31.py +4 -2
  244. inspect_ai/model/_providers/vertex.py +11 -9
  245. inspect_ai/model/_providers/vllm.py +4 -4
  246. inspect_ai/scorer/__init__.py +2 -0
  247. inspect_ai/scorer/_metrics/__init__.py +2 -0
  248. inspect_ai/scorer/_metrics/grouped.py +84 -0
  249. inspect_ai/scorer/_score.py +26 -6
  250. inspect_ai/solver/__init__.py +2 -2
  251. inspect_ai/solver/_basic_agent.py +22 -9
  252. inspect_ai/solver/_bridge.py +31 -0
  253. inspect_ai/solver/_chain.py +20 -12
  254. inspect_ai/solver/_fork.py +5 -1
  255. inspect_ai/solver/_human_agent.py +52 -0
  256. inspect_ai/solver/_prompt.py +3 -1
  257. inspect_ai/solver/_run.py +59 -0
  258. inspect_ai/solver/_solver.py +14 -4
  259. inspect_ai/solver/_task_state.py +5 -3
  260. inspect_ai/tool/_tool_call.py +15 -8
  261. inspect_ai/tool/_tool_def.py +17 -12
  262. inspect_ai/tool/_tool_support_helpers.py +4 -4
  263. inspect_ai/tool/_tool_with.py +14 -11
  264. inspect_ai/tool/_tools/_bash_session.py +11 -2
  265. inspect_ai/tool/_tools/_computer/_common.py +18 -2
  266. inspect_ai/tool/_tools/_computer/_computer.py +18 -2
  267. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
  268. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
  269. inspect_ai/tool/_tools/_think.py +1 -1
  270. inspect_ai/tool/_tools/_web_browser/_web_browser.py +103 -62
  271. inspect_ai/util/__init__.py +2 -0
  272. inspect_ai/util/_anyio.py +27 -0
  273. inspect_ai/util/_sandbox/__init__.py +2 -1
  274. inspect_ai/util/_sandbox/context.py +32 -7
  275. inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
  276. inspect_ai/util/_sandbox/docker/compose.py +2 -2
  277. inspect_ai/util/_sandbox/docker/docker.py +12 -1
  278. inspect_ai/util/_store_model.py +30 -7
  279. inspect_ai/util/_subprocess.py +13 -3
  280. inspect_ai/util/_subtask.py +1 -0
  281. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
  282. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +295 -229
  283. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -169
  284. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
  285. /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
  286. /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
  287. /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
  288. /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
  289. /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
  290. /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
  291. /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
  292. /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
  293. /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
  294. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
  295. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
  296. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
  297. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
inspect_ai/__init__.py CHANGED
@@ -10,7 +10,8 @@ from inspect_ai._eval.score import score, score_async
10
10
  from inspect_ai._eval.task import Epochs, Task, TaskInfo, task_with
11
11
  from inspect_ai._eval.task.tasks import Tasks
12
12
  from inspect_ai._util.constants import PKG_NAME
13
- from inspect_ai.solver._human_agent.agent import human_agent
13
+ from inspect_ai.agent._human.agent import human_cli
14
+ from inspect_ai.solver._human_agent import human_agent
14
15
 
15
16
  __version__ = importlib_version(PKG_NAME)
16
17
 
inspect_ai/_cli/eval.py CHANGED
@@ -10,6 +10,7 @@ from inspect_ai._util.constants import (
10
10
  ALL_LOG_LEVELS,
11
11
  DEFAULT_EPOCHS,
12
12
  DEFAULT_LOG_LEVEL_TRANSCRIPT,
13
+ DEFAULT_LOG_SHARED,
13
14
  DEFAULT_MAX_CONNECTIONS,
14
15
  )
15
16
  from inspect_ai._util.file import filesystem
@@ -25,7 +26,12 @@ from .common import (
25
26
  common_options,
26
27
  process_common_options,
27
28
  )
28
- from .util import parse_cli_args, parse_cli_config, parse_sandbox
29
+ from .util import (
30
+ int_or_bool_flag_callback,
31
+ parse_cli_args,
32
+ parse_cli_config,
33
+ parse_sandbox,
34
+ )
29
35
 
30
36
  MAX_SAMPLES_HELP = "Maximum number of samples to run in parallel (default is running all samples in parallel)"
31
37
  MAX_TASKS_HELP = "Maximum number of tasks to run in parallel (default is 1)"
@@ -41,6 +47,7 @@ LOG_IMAGES_HELP = (
41
47
  "Include base64 encoded versions of filename or URL based images in the log file."
42
48
  )
43
49
  LOG_BUFFER_HELP = "Number of samples to buffer before writing log file. If not specified, an appropriate default for the format and filesystem is chosen (10 for most all cases, 100 for JSON logs on remote filesystems)."
50
+ LOG_SHARED_HELP = "Sync sample events to log directory so that users on other systems can see log updates in realtime (defaults to no syncing). If enabled will sync every 10 seconds (or pass a value to sync every `n` seconds)."
44
51
  NO_SCORE_HELP = (
45
52
  "Do not score model output (use the inspect score command to score output later)"
46
53
  )
@@ -266,6 +273,15 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
266
273
  @click.option(
267
274
  "--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
268
275
  )
276
+ @click.option(
277
+ "--log-shared",
278
+ is_flag=False,
279
+ flag_value="true",
280
+ default=None,
281
+ callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
282
+ help=LOG_SHARED_HELP,
283
+ envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
284
+ )
269
285
  @click.option(
270
286
  "--no-score",
271
287
  type=bool,
@@ -396,7 +412,7 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]:
396
412
  @click.option(
397
413
  "--reasoning-effort",
398
414
  type=click.Choice(["low", "medium", "high"]),
399
- help="Constrains effort on reasoning for reasoning models. Open AI o-series models only.",
415
+ help="Constrains effort on reasoning for reasoning models (defaults to `medium`). Open AI o-series models only.",
400
416
  envvar="INSPECT_EVAL_REASONING_EFFORT",
401
417
  )
402
418
  @click.option(
@@ -503,6 +519,7 @@ def eval_command(
503
519
  no_log_samples: bool | None,
504
520
  log_images: bool | None,
505
521
  log_buffer: int | None,
522
+ log_shared: int | None,
506
523
  no_score: bool | None,
507
524
  no_score_display: bool | None,
508
525
  log_format: Literal["eval", "json"] | None,
@@ -556,6 +573,7 @@ def eval_command(
556
573
  no_log_samples=no_log_samples,
557
574
  log_images=log_images,
558
575
  log_buffer=log_buffer,
576
+ log_shared=log_shared,
559
577
  no_score=no_score,
560
578
  no_score_display=no_score_display,
561
579
  is_eval_set=False,
@@ -670,6 +688,7 @@ def eval_set_command(
670
688
  no_log_samples: bool | None,
671
689
  log_images: bool | None,
672
690
  log_buffer: int | None,
691
+ log_shared: int | None,
673
692
  no_score: bool | None,
674
693
  no_score_display: bool | None,
675
694
  bundle_dir: str | None,
@@ -728,6 +747,7 @@ def eval_set_command(
728
747
  no_log_samples=no_log_samples,
729
748
  log_images=log_images,
730
749
  log_buffer=log_buffer,
750
+ log_shared=log_shared,
731
751
  no_score=no_score,
732
752
  no_score_display=no_score_display,
733
753
  is_eval_set=True,
@@ -783,6 +803,7 @@ def eval_exec(
783
803
  no_log_samples: bool | None,
784
804
  log_images: bool | None,
785
805
  log_buffer: int | None,
806
+ log_shared: int | None,
786
807
  no_score: bool | None,
787
808
  no_score_display: bool | None,
788
809
  is_eval_set: bool = False,
@@ -865,6 +886,7 @@ def eval_exec(
865
886
  log_samples=log_samples,
866
887
  log_images=log_images,
867
888
  log_buffer=log_buffer,
889
+ log_shared=log_shared,
868
890
  score=score,
869
891
  score_display=score_display,
870
892
  )
@@ -1004,6 +1026,15 @@ def parse_comma_separated(value: str | None) -> list[str] | None:
1004
1026
  @click.option(
1005
1027
  "--log-buffer", type=int, help=LOG_BUFFER_HELP, envvar="INSPECT_EVAL_LOG_BUFFER"
1006
1028
  )
1029
+ @click.option(
1030
+ "--log-shared",
1031
+ is_flag=False,
1032
+ flag_value="true",
1033
+ default=None,
1034
+ callback=int_or_bool_flag_callback(DEFAULT_LOG_SHARED),
1035
+ help=LOG_SHARED_HELP,
1036
+ envvar=["INSPECT_LOG_SHARED", "INSPECT_EVAL_LOG_SHARED"],
1037
+ )
1007
1038
  @click.option(
1008
1039
  "--no-score",
1009
1040
  type=bool,
@@ -1052,6 +1083,7 @@ def eval_retry_command(
1052
1083
  no_log_samples: bool | None,
1053
1084
  log_images: bool | None,
1054
1085
  log_buffer: int | None,
1086
+ log_shared: int | None,
1055
1087
  no_score: bool | None,
1056
1088
  no_score_display: bool | None,
1057
1089
  max_connections: int | None,
@@ -1099,6 +1131,7 @@ def eval_retry_command(
1099
1131
  log_samples=log_samples,
1100
1132
  log_images=log_images,
1101
1133
  log_buffer=log_buffer,
1134
+ log_shared=log_shared,
1102
1135
  score=score,
1103
1136
  score_display=score_display,
1104
1137
  max_retries=max_retries,
inspect_ai/_cli/util.py CHANGED
@@ -1,11 +1,54 @@
1
- from typing import Any
1
+ from typing import Any, Callable
2
2
 
3
+ import click
3
4
  import yaml
4
5
 
5
6
  from inspect_ai._util.config import resolve_args
6
7
  from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
7
8
 
8
9
 
10
+ def int_or_bool_flag_callback(
11
+ true_value: int, false_value: int = 0
12
+ ) -> Callable[[click.Context, click.Parameter, Any], int]:
13
+ def callback(ctx: click.Context, param: click.Parameter, value: Any) -> int:
14
+ """Callback to parse an option that can either be a boolean flag or integer.
15
+
16
+ Desired behavior:
17
+ - Not specified at all -> false_value
18
+ - Specified with no value -> true_value
19
+ - Specified with "true"/"false" -> true_value or false_value respectively
20
+ - Specified with an integer -> that integer
21
+ """
22
+ # 1. If this parameter was never given on the command line,
23
+ # then we return false_value.
24
+ source = ctx.get_parameter_source(param.name) if param.name else ""
25
+ if source == click.core.ParameterSource.DEFAULT:
26
+ # Means the user did NOT specify the flag at all
27
+ return false_value
28
+
29
+ # 2. The user did specify the flag. If value is None,
30
+ # that means they used the flag with no argument, e.g. --my-flag
31
+ if value is None:
32
+ return true_value
33
+
34
+ # 3. If there is a value, try to parse booleans or an integer.
35
+ lower_val = value.lower()
36
+ if lower_val in ("true", "yes", "1"):
37
+ return true_value
38
+ elif lower_val in ("false", "no", "0"):
39
+ return false_value
40
+ else:
41
+ # 4. Otherwise, assume it is an integer
42
+ try:
43
+ return int(value)
44
+ except ValueError:
45
+ raise click.BadParameter(
46
+ f"Expected 'true', 'false', or an integer for --{param.name}. Got: {value}"
47
+ )
48
+
49
+ return callback
50
+
51
+
9
52
  def parse_cli_config(
10
53
  args: tuple[str] | list[str] | None, config: str | None
11
54
  ) -> dict[str, Any]:
@@ -36,7 +36,7 @@ def task_config(
36
36
  value = value if isinstance(value, list) else [value]
37
37
  value = [str(v) for v in value]
38
38
  config_print.append(f"{name}: {','.join(value)}")
39
- elif name not in ["limit", "model", "response_schema"]:
39
+ elif name not in ["limit", "model", "response_schema", "log_shared"]:
40
40
  if isinstance(value, list):
41
41
  value = ",".join([str(v) for v in value])
42
42
  if isinstance(value, str):
@@ -15,6 +15,7 @@ from typing import (
15
15
  )
16
16
 
17
17
  import rich
18
+ from pydantic import BaseModel, Field, field_validator
18
19
  from rich.console import Console
19
20
 
20
21
  from inspect_ai.log import EvalConfig, EvalResults, EvalStats
@@ -104,12 +105,20 @@ class TaskScreen(contextlib.AbstractContextManager["TaskScreen"]):
104
105
  raise NotImplementedError("input_panel not implemented by current display")
105
106
 
106
107
 
107
- @dataclass
108
- class TaskDisplayMetric:
108
+ class TaskDisplayMetric(BaseModel):
109
109
  scorer: str
110
110
  name: str
111
- value: float | int
112
- reducer: str | None
111
+ value: float | int | None = Field(default=None)
112
+ reducer: str | None = Field(default=None)
113
+
114
+ @field_validator("value", mode="before")
115
+ @classmethod
116
+ def handle_null_value(cls, v: Any) -> Union[float, int, None]:
117
+ if v is None:
118
+ return None
119
+ if isinstance(v, float | int):
120
+ return v
121
+ raise ValueError(f"Expected float, int, or None, got {type(v)}")
113
122
 
114
123
 
115
124
  @runtime_checkable
@@ -180,7 +180,7 @@ def task_metric(metrics: list[TaskDisplayMetric], width: int | None = None) -> s
180
180
  )
181
181
 
182
182
  metric = metrics[0]
183
- if np.isnan(metric.value):
183
+ if metric.value is None or np.isnan(metric.value):
184
184
  value = " n/a"
185
185
  else:
186
186
  value = f"{metric.value:.2f}"
@@ -58,10 +58,12 @@ class TaskScreenResult(Generic[TR]):
58
58
  value: TR | BaseException,
59
59
  tasks: list[TaskWithResult],
60
60
  output: list[str],
61
+ warnings: list[str],
61
62
  ) -> None:
62
63
  self.value = value
63
64
  self.tasks = tasks
64
65
  self.output = output
66
+ self.warnings = warnings
65
67
 
66
68
 
67
69
  class TaskScreenApp(App[TR]):
@@ -86,6 +88,7 @@ class TaskScreenApp(App[TR]):
86
88
  self._worker: Worker[TR] | None = None
87
89
  self._error: BaseException | None = None
88
90
  self._output: list[str] = []
91
+ self._warnings: list[str] = []
89
92
 
90
93
  # task screen
91
94
  self._total_tasks = 0
@@ -120,7 +123,12 @@ class TaskScreenApp(App[TR]):
120
123
  value = CancelledError()
121
124
 
122
125
  # return result w/ output
123
- return TaskScreenResult(value=value, tasks=self._app_tasks, output=self._output)
126
+ return TaskScreenResult(
127
+ value=value,
128
+ tasks=self._app_tasks,
129
+ output=self._output,
130
+ warnings=self._warnings,
131
+ )
124
132
 
125
133
  async def on_load(self) -> None:
126
134
  # events used to synchronise loading
@@ -349,8 +357,11 @@ class TaskScreenApp(App[TR]):
349
357
  if text.endswith("\n"):
350
358
  text = text[:-1]
351
359
 
352
- # track output (for printing at the end)
353
- self._output.append(text)
360
+ # track output and warnings (for printing at the end)
361
+ if "WARNING" in text:
362
+ self._warnings.append(text)
363
+ else:
364
+ self._output.append(text)
354
365
 
355
366
  # write to console view
356
367
  self.query_one(ConsoleView).write_ansi(text)
@@ -42,6 +42,10 @@ class TextualDisplay(Display):
42
42
  # print tasks
43
43
  rich.print(tasks_results(result.tasks))
44
44
 
45
+ # print warnings
46
+ if result.warnings:
47
+ print("\n".join(result.warnings))
48
+
45
49
  # raise error as required
46
50
  if isinstance(result.value, BaseException):
47
51
  raise result.value
@@ -17,7 +17,7 @@ from textual.widgets import (
17
17
  OptionList,
18
18
  Static,
19
19
  )
20
- from textual.widgets.option_list import Option
20
+ from textual.widgets.option_list import Option, OptionDoesNotExist
21
21
 
22
22
  from inspect_ai._display.textual.widgets.port_mappings import get_url
23
23
  from inspect_ai._util.format import format_progress_time
@@ -124,7 +124,7 @@ class SamplesList(OptionList):
124
124
  def set_samples(self, samples: list[ActiveSample]) -> None:
125
125
  # check for a highlighted sample (make sure we don't remove it)
126
126
  highlighted_id = (
127
- self.get_option_at_index(self.highlighted).id
127
+ self.get_id_at_index(self.highlighted)
128
128
  if self.highlighted is not None
129
129
  else None
130
130
  )
@@ -179,12 +179,18 @@ class SamplesList(OptionList):
179
179
  self.scroll_to_highlight()
180
180
 
181
181
  def sample_for_highlighted(self, highlighted: int) -> ActiveSample | None:
182
- highlighted_id = self.get_option_at_index(highlighted).id
182
+ highlighted_id = self.get_id_at_index(highlighted)
183
183
  if highlighted_id is not None:
184
184
  return sample_for_id(self.samples, highlighted_id)
185
185
  else:
186
186
  return None
187
187
 
188
+ def get_id_at_index(self, index: int) -> str | None:
189
+ try:
190
+ return self.get_option_at_index(index).id
191
+ except OptionDoesNotExist:
192
+ return None
193
+
188
194
 
189
195
  class SampleVNC(Horizontal):
190
196
  DEFAULT_CSS = """
@@ -14,7 +14,7 @@ from inspect_ai._display.core.display import TaskDisplayMetric
14
14
  @dataclass
15
15
  class TaskMetric:
16
16
  name: str
17
- value: float
17
+ value: float | int | None
18
18
 
19
19
 
20
20
  class TaskDetail(Widget):
@@ -221,21 +221,21 @@ class TaskMetrics(Widget):
221
221
  self.recompute_grid()
222
222
 
223
223
  def on_mount(self) -> None:
224
- self.recompute_grid()
224
+ self.recompute_grid(True)
225
225
 
226
- def recompute_grid(self) -> None:
227
- if not self.is_mounted:
226
+ def recompute_grid(self, force: bool = False) -> None:
227
+ if not self.is_mounted and not force:
228
228
  return
229
-
230
229
  grid = self.query_one(f"#{self.grid_id()}")
231
230
 
232
231
  grid.remove_children()
233
232
  for metric in self.metrics:
234
233
  # Add the value static but keep it around
235
234
  # for future updates
236
- self.value_widgets[metric.name] = Static(
237
- self._metric_value(metric.value), markup=False
238
- )
235
+ if metric.value is not None:
236
+ self.value_widgets[metric.name] = Static(
237
+ self._metric_value(metric.value), markup=False
238
+ )
239
239
 
240
240
  grid.mount(Static(metric.name, markup=False))
241
241
  grid.mount(self.value_widgets[metric.name])
@@ -17,6 +17,11 @@ from inspect_ai._display.core.results import task_metric
17
17
  from inspect_ai._display.textual.widgets.clock import Clock
18
18
  from inspect_ai._display.textual.widgets.task_detail import TaskDetail
19
19
  from inspect_ai._display.textual.widgets.toggle import Toggle
20
+ from inspect_ai._display.textual.widgets.vscode import conditional_vscode_link
21
+ from inspect_ai._util.file import to_uri
22
+ from inspect_ai._util.vscode import (
23
+ VSCodeCommand,
24
+ )
20
25
 
21
26
  from ...core.display import (
22
27
  Progress,
@@ -151,7 +156,7 @@ class TaskProgressView(Widget):
151
156
  height: auto;
152
157
  width: 1fr;
153
158
  layout: grid;
154
- grid-size: 8 2;
159
+ grid-size: 9 2;
155
160
  grid-columns: auto auto auto auto 1fr auto auto auto;
156
161
  grid-rows: auto auto;
157
162
  grid-gutter: 0 1;
@@ -200,6 +205,15 @@ class TaskProgressView(Widget):
200
205
 
201
206
  self.sample_count_width: int = sample_count_width
202
207
  self.display_metrics = display_metrics
208
+ self.view_log_link = conditional_vscode_link(
209
+ "[View Log]",
210
+ VSCodeCommand(
211
+ command="inspect.openLogViewer",
212
+ args=[to_uri(task.profile.log_location)]
213
+ if task.profile.log_location
214
+ else [],
215
+ ),
216
+ )
203
217
 
204
218
  metrics: reactive[list[TaskDisplayMetric] | None] = reactive(None)
205
219
  metrics_width: reactive[int | None] = reactive(None)
@@ -222,6 +236,8 @@ class TaskProgressView(Widget):
222
236
  yield self.count_display
223
237
  yield self.metrics_display
224
238
  yield Clock()
239
+ yield self.view_log_link
240
+
225
241
  yield self.task_detail
226
242
 
227
243
  @on(Toggle.Toggled)
@@ -0,0 +1,44 @@
1
+ from textual.widget import Widget
2
+ from textual.widgets import Link, Static
3
+
4
+ from inspect_ai._util.vscode import (
5
+ VSCodeCommand,
6
+ can_execute_vscode_command,
7
+ execute_vscode_commands,
8
+ )
9
+
10
+
11
+ def conditional_vscode_link(text: str, command: VSCodeCommand) -> Widget:
12
+ if can_execute_vscode_command(command.command):
13
+ vscode_link = VSCodeLink(text)
14
+ vscode_link.commands = [command]
15
+ return vscode_link
16
+ else:
17
+ return Static()
18
+
19
+
20
+ class VSCodeLink(Link):
21
+ def __init__(
22
+ self,
23
+ text: str,
24
+ *,
25
+ url: str | None = None,
26
+ tooltip: str | None = None,
27
+ name: str | None = None,
28
+ id: str | None = None,
29
+ classes: str | None = None,
30
+ disabled: bool = False,
31
+ ) -> None:
32
+ super().__init__(
33
+ text,
34
+ url=url,
35
+ tooltip=tooltip,
36
+ name=name,
37
+ id=id,
38
+ classes=classes,
39
+ disabled=disabled,
40
+ )
41
+ self.commands: list[VSCodeCommand] = []
42
+
43
+ def on_click(self) -> None:
44
+ execute_vscode_commands(self.commands)