inspect-ai 0.3.81__py3-none-any.whl → 0.3.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (297) hide show
  1. inspect_ai/__init__.py +2 -1
  2. inspect_ai/_cli/eval.py +35 -2
  3. inspect_ai/_cli/util.py +44 -1
  4. inspect_ai/_display/core/config.py +1 -1
  5. inspect_ai/_display/core/display.py +13 -4
  6. inspect_ai/_display/core/results.py +1 -1
  7. inspect_ai/_display/textual/app.py +14 -3
  8. inspect_ai/_display/textual/display.py +4 -0
  9. inspect_ai/_display/textual/widgets/samples.py +9 -3
  10. inspect_ai/_display/textual/widgets/task_detail.py +8 -8
  11. inspect_ai/_display/textual/widgets/tasks.py +17 -1
  12. inspect_ai/_display/textual/widgets/vscode.py +44 -0
  13. inspect_ai/_eval/eval.py +74 -25
  14. inspect_ai/_eval/evalset.py +22 -18
  15. inspect_ai/_eval/loader.py +34 -11
  16. inspect_ai/_eval/run.py +13 -15
  17. inspect_ai/_eval/score.py +13 -3
  18. inspect_ai/_eval/task/generate.py +8 -9
  19. inspect_ai/_eval/task/log.py +55 -6
  20. inspect_ai/_eval/task/run.py +51 -10
  21. inspect_ai/_eval/task/task.py +23 -9
  22. inspect_ai/_util/constants.py +2 -0
  23. inspect_ai/_util/file.py +30 -1
  24. inspect_ai/_util/json.py +37 -1
  25. inspect_ai/_util/registry.py +1 -0
  26. inspect_ai/_util/vscode.py +37 -0
  27. inspect_ai/_view/server.py +113 -1
  28. inspect_ai/_view/www/App.css +7 -1
  29. inspect_ai/_view/www/dist/assets/index.css +813 -415
  30. inspect_ai/_view/www/dist/assets/index.js +54475 -32003
  31. inspect_ai/_view/www/eslint.config.mjs +1 -1
  32. inspect_ai/_view/www/log-schema.json +137 -31
  33. inspect_ai/_view/www/node_modules/flatted/python/flatted.py +149 -0
  34. inspect_ai/_view/www/package.json +11 -2
  35. inspect_ai/_view/www/src/App.tsx +161 -853
  36. inspect_ai/_view/www/src/api/api-browser.ts +176 -5
  37. inspect_ai/_view/www/src/api/api-vscode.ts +75 -1
  38. inspect_ai/_view/www/src/api/client-api.ts +66 -10
  39. inspect_ai/_view/www/src/api/jsonrpc.ts +2 -0
  40. inspect_ai/_view/www/src/api/types.ts +107 -2
  41. inspect_ai/_view/www/src/appearance/icons.ts +2 -0
  42. inspect_ai/_view/www/src/components/AsciinemaPlayer.tsx +3 -3
  43. inspect_ai/_view/www/src/components/Card.tsx +6 -4
  44. inspect_ai/_view/www/src/components/DownloadPanel.tsx +2 -2
  45. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +56 -61
  46. inspect_ai/_view/www/src/components/FindBand.tsx +17 -9
  47. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +1 -1
  48. inspect_ai/_view/www/src/components/JsonPanel.tsx +14 -24
  49. inspect_ai/_view/www/src/components/LargeModal.tsx +2 -35
  50. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +27 -11
  51. inspect_ai/_view/www/src/components/LinkButton.module.css +16 -0
  52. inspect_ai/_view/www/src/components/LinkButton.tsx +33 -0
  53. inspect_ai/_view/www/src/components/LiveVirtualList.module.css +11 -0
  54. inspect_ai/_view/www/src/components/LiveVirtualList.tsx +177 -0
  55. inspect_ai/_view/www/src/components/MarkdownDiv.tsx +116 -26
  56. inspect_ai/_view/www/src/components/MessageBand.tsx +14 -9
  57. inspect_ai/_view/www/src/components/Modal.module.css +38 -0
  58. inspect_ai/_view/www/src/components/Modal.tsx +77 -0
  59. inspect_ai/_view/www/src/components/MorePopOver.tsx +3 -3
  60. inspect_ai/_view/www/src/components/NavPills.tsx +20 -8
  61. inspect_ai/_view/www/src/components/NoContentsPanel.module.css +12 -0
  62. inspect_ai/_view/www/src/components/NoContentsPanel.tsx +20 -0
  63. inspect_ai/_view/www/src/components/ProgressBar.module.css +5 -4
  64. inspect_ai/_view/www/src/components/ProgressBar.tsx +3 -2
  65. inspect_ai/_view/www/src/components/PulsingDots.module.css +81 -0
  66. inspect_ai/_view/www/src/components/PulsingDots.tsx +45 -0
  67. inspect_ai/_view/www/src/components/TabSet.tsx +4 -37
  68. inspect_ai/_view/www/src/components/ToolButton.tsx +3 -4
  69. inspect_ai/_view/www/src/index.tsx +26 -94
  70. inspect_ai/_view/www/src/logfile/remoteLogFile.ts +9 -1
  71. inspect_ai/_view/www/src/logfile/remoteZipFile.ts +30 -4
  72. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +4 -6
  73. inspect_ai/_view/www/src/plan/DetailStep.module.css +4 -0
  74. inspect_ai/_view/www/src/plan/DetailStep.tsx +6 -3
  75. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +1 -1
  76. inspect_ai/_view/www/src/plan/SolverDetailView.module.css +2 -1
  77. inspect_ai/_view/www/src/samples/InlineSampleDisplay.module.css +9 -1
  78. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +74 -28
  79. inspect_ai/_view/www/src/samples/SampleDialog.tsx +58 -22
  80. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +4 -0
  81. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +135 -104
  82. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +10 -0
  83. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +83 -36
  84. inspect_ai/_view/www/src/samples/SamplesTools.tsx +35 -30
  85. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -1
  86. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +1 -1
  87. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +45 -53
  88. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +6 -1
  89. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +5 -0
  90. inspect_ai/_view/www/src/samples/chat/messages.ts +36 -0
  91. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.module.css +3 -0
  92. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +11 -1
  93. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +22 -46
  94. inspect_ai/_view/www/src/samples/descriptor/samplesDescriptor.tsx +34 -20
  95. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.module.css +3 -3
  96. inspect_ai/_view/www/src/samples/descriptor/score/BooleanScoreDescriptor.tsx +1 -1
  97. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.module.css +4 -4
  98. inspect_ai/_view/www/src/samples/descriptor/score/ObjectScoreDescriptor.tsx +10 -10
  99. inspect_ai/_view/www/src/samples/descriptor/types.ts +6 -5
  100. inspect_ai/_view/www/src/samples/list/SampleFooter.module.css +22 -3
  101. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +27 -2
  102. inspect_ai/_view/www/src/samples/list/SampleList.tsx +122 -85
  103. inspect_ai/_view/www/src/samples/list/SampleRow.module.css +6 -0
  104. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +28 -15
  105. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +29 -18
  106. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +28 -28
  107. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +19 -9
  108. inspect_ai/_view/www/src/samples/sampleDataAdapter.ts +33 -0
  109. inspect_ai/_view/www/src/samples/sampleLimit.ts +2 -2
  110. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +12 -27
  111. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.module.css +38 -0
  112. inspect_ai/_view/www/src/samples/scores/SampleScoresGrid.tsx +118 -0
  113. inspect_ai/_view/www/src/samples/scores/{SampleScoreView.module.css → SampleScoresView.module.css} +10 -1
  114. inspect_ai/_view/www/src/samples/scores/SampleScoresView.tsx +78 -0
  115. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +0 -13
  116. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +0 -13
  117. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +0 -13
  118. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +4 -0
  119. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +10 -24
  120. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +0 -13
  121. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +4 -22
  122. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +15 -24
  123. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +0 -13
  124. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +6 -28
  125. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +24 -34
  126. inspect_ai/_view/www/src/samples/transcript/ToolEventView.module.css +4 -0
  127. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +33 -17
  128. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +197 -338
  129. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.module.css +16 -0
  130. inspect_ai/_view/www/src/samples/transcript/TranscriptVirtualListComponent.tsx +44 -0
  131. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +7 -4
  132. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +81 -60
  133. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.module.css +23 -0
  134. inspect_ai/_view/www/src/samples/transcript/event/EventProgressPanel.tsx +27 -0
  135. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +29 -1
  136. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +102 -72
  137. inspect_ai/_view/www/src/scoring/utils.ts +87 -0
  138. inspect_ai/_view/www/src/state/appSlice.ts +244 -0
  139. inspect_ai/_view/www/src/state/hooks.ts +399 -0
  140. inspect_ai/_view/www/src/state/logPolling.ts +200 -0
  141. inspect_ai/_view/www/src/state/logSlice.ts +224 -0
  142. inspect_ai/_view/www/src/state/logsPolling.ts +118 -0
  143. inspect_ai/_view/www/src/state/logsSlice.ts +181 -0
  144. inspect_ai/_view/www/src/state/samplePolling.ts +314 -0
  145. inspect_ai/_view/www/src/state/sampleSlice.ts +140 -0
  146. inspect_ai/_view/www/src/state/sampleUtils.ts +21 -0
  147. inspect_ai/_view/www/src/state/scrolling.ts +206 -0
  148. inspect_ai/_view/www/src/state/store.ts +168 -0
  149. inspect_ai/_view/www/src/state/store_filter.ts +84 -0
  150. inspect_ai/_view/www/src/state/utils.ts +23 -0
  151. inspect_ai/_view/www/src/storage/index.ts +26 -0
  152. inspect_ai/_view/www/src/types/log.d.ts +36 -26
  153. inspect_ai/_view/www/src/types/markdown-it-katex.d.ts +21 -0
  154. inspect_ai/_view/www/src/types.ts +94 -32
  155. inspect_ai/_view/www/src/utils/attachments.ts +58 -23
  156. inspect_ai/_view/www/src/utils/json-worker.ts +79 -12
  157. inspect_ai/_view/www/src/utils/logger.ts +52 -0
  158. inspect_ai/_view/www/src/utils/polling.ts +100 -0
  159. inspect_ai/_view/www/src/utils/react.ts +30 -0
  160. inspect_ai/_view/www/src/utils/vscode.ts +1 -1
  161. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +184 -217
  162. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +11 -53
  163. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +8 -18
  164. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -0
  165. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +40 -22
  166. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +16 -1
  167. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +159 -103
  168. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.module.css +32 -0
  169. inspect_ai/_view/www/src/workspace/navbar/RunningStatusPanel.tsx +32 -0
  170. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.module.css +35 -0
  171. inspect_ai/_view/www/src/workspace/navbar/ScoreGrid.tsx +117 -0
  172. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +12 -14
  173. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +6 -2
  174. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +4 -4
  175. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.module.css +3 -2
  176. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +28 -13
  177. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +5 -10
  178. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +4 -4
  179. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.module.css +22 -0
  180. inspect_ai/_view/www/src/workspace/tabs/RunningNoSamples.tsx +19 -0
  181. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +128 -115
  182. inspect_ai/_view/www/src/workspace/tabs/grouping.ts +37 -5
  183. inspect_ai/_view/www/src/workspace/tabs/types.ts +4 -0
  184. inspect_ai/_view/www/src/workspace/types.ts +4 -3
  185. inspect_ai/_view/www/src/workspace/utils.ts +4 -4
  186. inspect_ai/_view/www/vite.config.js +6 -0
  187. inspect_ai/_view/www/yarn.lock +464 -355
  188. inspect_ai/agent/__init__.py +36 -0
  189. inspect_ai/agent/_agent.py +268 -0
  190. inspect_ai/agent/_as_solver.py +72 -0
  191. inspect_ai/agent/_as_tool.py +122 -0
  192. inspect_ai/{solver → agent}/_bridge/bridge.py +23 -37
  193. inspect_ai/{solver → agent}/_bridge/patch.py +9 -8
  194. inspect_ai/agent/_filter.py +46 -0
  195. inspect_ai/agent/_handoff.py +93 -0
  196. inspect_ai/{solver/_human_agent → agent/_human}/agent.py +11 -12
  197. inspect_ai/{solver/_human_agent → agent/_human}/commands/__init__.py +2 -3
  198. inspect_ai/{solver/_human_agent → agent/_human}/commands/clock.py +3 -1
  199. inspect_ai/{solver/_human_agent → agent/_human}/commands/score.py +5 -5
  200. inspect_ai/{solver/_human_agent → agent/_human}/install.py +6 -3
  201. inspect_ai/{solver/_human_agent → agent/_human}/service.py +7 -3
  202. inspect_ai/{solver/_human_agent → agent/_human}/state.py +5 -5
  203. inspect_ai/agent/_react.py +241 -0
  204. inspect_ai/agent/_run.py +36 -0
  205. inspect_ai/agent/_types.py +81 -0
  206. inspect_ai/log/_condense.py +26 -0
  207. inspect_ai/log/_log.py +17 -5
  208. inspect_ai/log/_recorders/buffer/__init__.py +14 -0
  209. inspect_ai/log/_recorders/buffer/buffer.py +30 -0
  210. inspect_ai/log/_recorders/buffer/database.py +685 -0
  211. inspect_ai/log/_recorders/buffer/filestore.py +259 -0
  212. inspect_ai/log/_recorders/buffer/types.py +84 -0
  213. inspect_ai/log/_recorders/eval.py +2 -11
  214. inspect_ai/log/_recorders/types.py +30 -0
  215. inspect_ai/log/_transcript.py +32 -2
  216. inspect_ai/model/__init__.py +7 -1
  217. inspect_ai/model/_call_tools.py +257 -52
  218. inspect_ai/model/_chat_message.py +7 -4
  219. inspect_ai/model/_conversation.py +13 -62
  220. inspect_ai/model/_display.py +85 -0
  221. inspect_ai/model/_generate_config.py +2 -2
  222. inspect_ai/model/_model.py +114 -14
  223. inspect_ai/model/_model_output.py +14 -9
  224. inspect_ai/model/_openai.py +16 -4
  225. inspect_ai/model/_openai_computer_use.py +162 -0
  226. inspect_ai/model/_openai_responses.py +319 -165
  227. inspect_ai/model/_providers/anthropic.py +20 -21
  228. inspect_ai/model/_providers/azureai.py +24 -13
  229. inspect_ai/model/_providers/bedrock.py +1 -7
  230. inspect_ai/model/_providers/cloudflare.py +3 -3
  231. inspect_ai/model/_providers/goodfire.py +2 -6
  232. inspect_ai/model/_providers/google.py +11 -10
  233. inspect_ai/model/_providers/groq.py +6 -3
  234. inspect_ai/model/_providers/hf.py +7 -3
  235. inspect_ai/model/_providers/mistral.py +7 -10
  236. inspect_ai/model/_providers/openai.py +47 -17
  237. inspect_ai/model/_providers/openai_o1.py +11 -4
  238. inspect_ai/model/_providers/openai_responses.py +12 -14
  239. inspect_ai/model/_providers/providers.py +2 -2
  240. inspect_ai/model/_providers/together.py +12 -2
  241. inspect_ai/model/_providers/util/chatapi.py +7 -2
  242. inspect_ai/model/_providers/util/hf_handler.py +4 -2
  243. inspect_ai/model/_providers/util/llama31.py +4 -2
  244. inspect_ai/model/_providers/vertex.py +11 -9
  245. inspect_ai/model/_providers/vllm.py +4 -4
  246. inspect_ai/scorer/__init__.py +2 -0
  247. inspect_ai/scorer/_metrics/__init__.py +2 -0
  248. inspect_ai/scorer/_metrics/grouped.py +84 -0
  249. inspect_ai/scorer/_score.py +26 -6
  250. inspect_ai/solver/__init__.py +2 -2
  251. inspect_ai/solver/_basic_agent.py +22 -9
  252. inspect_ai/solver/_bridge.py +31 -0
  253. inspect_ai/solver/_chain.py +20 -12
  254. inspect_ai/solver/_fork.py +5 -1
  255. inspect_ai/solver/_human_agent.py +52 -0
  256. inspect_ai/solver/_prompt.py +3 -1
  257. inspect_ai/solver/_run.py +59 -0
  258. inspect_ai/solver/_solver.py +14 -4
  259. inspect_ai/solver/_task_state.py +5 -3
  260. inspect_ai/tool/_tool_call.py +15 -8
  261. inspect_ai/tool/_tool_def.py +17 -12
  262. inspect_ai/tool/_tool_support_helpers.py +4 -4
  263. inspect_ai/tool/_tool_with.py +14 -11
  264. inspect_ai/tool/_tools/_bash_session.py +11 -2
  265. inspect_ai/tool/_tools/_computer/_common.py +18 -2
  266. inspect_ai/tool/_tools/_computer/_computer.py +18 -2
  267. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +2 -0
  268. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +17 -0
  269. inspect_ai/tool/_tools/_think.py +1 -1
  270. inspect_ai/tool/_tools/_web_browser/_web_browser.py +103 -62
  271. inspect_ai/util/__init__.py +2 -0
  272. inspect_ai/util/_anyio.py +27 -0
  273. inspect_ai/util/_sandbox/__init__.py +2 -1
  274. inspect_ai/util/_sandbox/context.py +32 -7
  275. inspect_ai/util/_sandbox/docker/cleanup.py +4 -0
  276. inspect_ai/util/_sandbox/docker/compose.py +2 -2
  277. inspect_ai/util/_sandbox/docker/docker.py +12 -1
  278. inspect_ai/util/_store_model.py +30 -7
  279. inspect_ai/util/_subprocess.py +13 -3
  280. inspect_ai/util/_subtask.py +1 -0
  281. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/METADATA +1 -1
  282. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/RECORD +295 -229
  283. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +0 -169
  284. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +0 -22
  285. /inspect_ai/{solver → agent}/_bridge/__init__.py +0 -0
  286. /inspect_ai/{solver/_human_agent → agent/_human}/__init__.py +0 -0
  287. /inspect_ai/{solver/_human_agent → agent/_human}/commands/command.py +0 -0
  288. /inspect_ai/{solver/_human_agent → agent/_human}/commands/instructions.py +0 -0
  289. /inspect_ai/{solver/_human_agent → agent/_human}/commands/note.py +0 -0
  290. /inspect_ai/{solver/_human_agent → agent/_human}/commands/status.py +0 -0
  291. /inspect_ai/{solver/_human_agent → agent/_human}/commands/submit.py +0 -0
  292. /inspect_ai/{solver/_human_agent → agent/_human}/panel.py +0 -0
  293. /inspect_ai/{solver/_human_agent → agent/_human}/view.py +0 -0
  294. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/WHEEL +0 -0
  295. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/entry_points.txt +0 -0
  296. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/licenses/LICENSE +0 -0
  297. {inspect_ai-0.3.81.dist-info → inspect_ai-0.3.83.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,7 @@ export type Task = string;
13
13
  export type TaskId = string;
14
14
  export type TaskVersion = number;
15
15
  export type TaskFile = string | null;
16
+ export type TaskRegistryName = string | null;
16
17
  export type Solver = string | null;
17
18
  export type SolverArgs = {} | null;
18
19
  export type Tags = string[] | null;
@@ -44,6 +45,7 @@ export type SandboxCleanup = boolean | null;
44
45
  export type LogSamples = boolean | null;
45
46
  export type LogImages = boolean | null;
46
47
  export type LogBuffer = number | null;
48
+ export type LogShared = number | null;
47
49
  export type ScoreDisplay = boolean | null;
48
50
  export type Type1 = "git";
49
51
  export type Origin = string;
@@ -160,6 +162,7 @@ export type Content =
160
162
  )[];
161
163
  export type Type3 = "text";
162
164
  export type Text = string;
165
+ export type Refusal = boolean | null;
163
166
  export type Type4 = "reasoning";
164
167
  export type Reasoning = string;
165
168
  export type Signature = string | null;
@@ -203,12 +206,11 @@ export type Role2 = "assistant";
203
206
  export type ToolCalls = ToolCall[] | null;
204
207
  export type Id4 = string;
205
208
  export type Function = string;
206
- export type Type8 = string;
207
- export type InternalName = string | null;
208
209
  export type ParseError = string | null;
209
210
  export type Title = string | null;
210
211
  export type Format2 = "text" | "markdown";
211
212
  export type Content3 = string;
213
+ export type Model1 = string | null;
212
214
  export type Id5 = string | null;
213
215
  export type Content4 =
214
216
  | string
@@ -223,8 +225,7 @@ export type Source3 = ("input" | "generate") | null;
223
225
  export type Role3 = "tool";
224
226
  export type ToolCallId1 = string | null;
225
227
  export type Function1 = string | null;
226
- export type InternalName1 = string | null;
227
- export type Type9 =
228
+ export type Type8 =
228
229
  | "parsing"
229
230
  | "timeout"
230
231
  | "unicode_decode"
@@ -245,7 +246,7 @@ export type Messages = (
245
246
  | ChatMessageAssistant
246
247
  | ChatMessageTool
247
248
  )[];
248
- export type Model1 = string;
249
+ export type Model2 = string;
249
250
  export type StopReason =
250
251
  | "stop"
251
252
  | "max_tokens"
@@ -304,7 +305,7 @@ export type Timestamp1 = string;
304
305
  export type WorkingStart1 = number;
305
306
  export type Pending1 = boolean | null;
306
307
  export type Event1 = "sample_limit";
307
- export type Type10 =
308
+ export type Type9 =
308
309
  | "message"
309
310
  | "time"
310
311
  | "working"
@@ -344,7 +345,7 @@ export type Timestamp5 = string;
344
345
  export type WorkingStart5 = number;
345
346
  export type Pending5 = boolean | null;
346
347
  export type Event5 = "model";
347
- export type Model2 = string;
348
+ export type Model3 = string;
348
349
  export type Input3 = (
349
350
  | ChatMessageSystem
350
351
  | ChatMessageUser
@@ -353,7 +354,7 @@ export type Input3 = (
353
354
  )[];
354
355
  export type Name8 = string;
355
356
  export type Description2 = string;
356
- export type Type11 = "object";
357
+ export type Type10 = "object";
357
358
  export type Required1 = string[];
358
359
  export type Additionalproperties1 = boolean;
359
360
  export type Tools1 = ToolInfo[];
@@ -368,10 +369,9 @@ export type Timestamp6 = string;
368
369
  export type WorkingStart6 = number;
369
370
  export type Pending6 = boolean | null;
370
371
  export type Event6 = "tool";
371
- export type Type12 = "function";
372
+ export type Type11 = "function";
372
373
  export type Id7 = string;
373
374
  export type Function2 = string;
374
- export type InternalName2 = string | null;
375
375
  export type Result1 =
376
376
  | string
377
377
  | number
@@ -447,14 +447,14 @@ export type WorkingStart13 = number;
447
447
  export type Pending13 = boolean | null;
448
448
  export type Event13 = "step";
449
449
  export type Action1 = "begin" | "end";
450
- export type Type13 = string | null;
450
+ export type Type12 = string | null;
451
451
  export type Name11 = string;
452
452
  export type Timestamp14 = string;
453
453
  export type WorkingStart14 = number;
454
454
  export type Pending14 = boolean | null;
455
455
  export type Event14 = "subtask";
456
456
  export type Name12 = string;
457
- export type Type14 = string | null;
457
+ export type Type13 = string | null;
458
458
  export type Events2 = (
459
459
  | SampleInitEvent
460
460
  | SampleLimitEvent
@@ -493,6 +493,8 @@ export type Events1 = (
493
493
  )[];
494
494
  export type Completed3 = string | null;
495
495
  export type WorkingTime2 = number | null;
496
+ export type Agent = string | null;
497
+ export type Failed = boolean | null;
496
498
  export type Events = (
497
499
  | SampleInitEvent
498
500
  | SampleLimitEvent
@@ -513,7 +515,7 @@ export type Events = (
513
515
  export type TotalTime = number | null;
514
516
  export type WorkingTime3 = number | null;
515
517
  export type Uuid = string | null;
516
- export type Type15 =
518
+ export type Type14 =
517
519
  | "context"
518
520
  | "time"
519
521
  | "working"
@@ -565,6 +567,7 @@ export interface EvalSpec {
565
567
  task_id: TaskId;
566
568
  task_version: TaskVersion;
567
569
  task_file: TaskFile;
570
+ task_registry_name: TaskRegistryName;
568
571
  task_attribs: TaskAttribs;
569
572
  task_args: TaskArgs;
570
573
  solver: Solver;
@@ -627,6 +630,7 @@ export interface EvalConfig {
627
630
  log_samples: LogSamples;
628
631
  log_images: LogImages;
629
632
  log_buffer: LogBuffer;
633
+ log_shared: LogShared;
630
634
  score_display: ScoreDisplay;
631
635
  }
632
636
  export interface ApprovalPolicyConfig {
@@ -845,6 +849,7 @@ export interface ChatMessageSystem {
845
849
  id: Id1;
846
850
  content: Content;
847
851
  source: Source;
852
+ internal: unknown;
848
853
  role: Role;
849
854
  }
850
855
  /**
@@ -853,6 +858,7 @@ export interface ChatMessageSystem {
853
858
  export interface ContentText {
854
859
  type: Type3;
855
860
  text: Text;
861
+ refusal: Refusal;
856
862
  }
857
863
  /**
858
864
  * Reasoning content.
@@ -896,6 +902,7 @@ export interface ChatMessageUser {
896
902
  id: Id2;
897
903
  content: Content1;
898
904
  source: Source1;
905
+ internal: unknown;
899
906
  role: Role1;
900
907
  tool_call_id: ToolCallId;
901
908
  }
@@ -906,15 +913,16 @@ export interface ChatMessageAssistant {
906
913
  id: Id3;
907
914
  content: Content2;
908
915
  source: Source2;
916
+ internal: unknown;
909
917
  role: Role2;
910
918
  tool_calls: ToolCalls;
919
+ model: Model1;
911
920
  }
912
921
  export interface ToolCall {
913
922
  id: Id4;
914
923
  function: Function;
915
924
  arguments: Arguments;
916
- type: Type8;
917
- internal_name: InternalName;
925
+ internal: unknown;
918
926
  parse_error: ParseError;
919
927
  view: ToolCallContent | null;
920
928
  }
@@ -934,21 +942,21 @@ export interface ChatMessageTool {
934
942
  id: Id5;
935
943
  content: Content4;
936
944
  source: Source3;
945
+ internal: unknown;
937
946
  role: Role3;
938
947
  tool_call_id: ToolCallId1;
939
948
  function: Function1;
940
- internal_name: InternalName1;
941
949
  error: ToolCallError | null;
942
950
  }
943
951
  export interface ToolCallError {
944
- type: Type9;
952
+ type: Type8;
945
953
  message: Message1;
946
954
  }
947
955
  /**
948
956
  * Output from model generation.
949
957
  */
950
958
  export interface ModelOutput {
951
- model: Model1;
959
+ model: Model2;
952
960
  choices: Choices1;
953
961
  usage: ModelUsage1 | null;
954
962
  time: Time;
@@ -1029,7 +1037,7 @@ export interface SampleLimitEvent {
1029
1037
  working_start: WorkingStart1;
1030
1038
  pending: Pending1;
1031
1039
  event: Event1;
1032
- type: Type10;
1040
+ type: Type9;
1033
1041
  message: Message2;
1034
1042
  limit: Limit1;
1035
1043
  }
@@ -1092,7 +1100,7 @@ export interface ModelEvent {
1092
1100
  working_start: WorkingStart5;
1093
1101
  pending: Pending5;
1094
1102
  event: Event5;
1095
- model: Model2;
1103
+ model: Model3;
1096
1104
  input: Input3;
1097
1105
  tools: Tools1;
1098
1106
  tool_choice: ToolChoice;
@@ -1139,7 +1147,7 @@ export interface ToolInfo {
1139
1147
  * Description of tool parameters object in JSON Schema format.
1140
1148
  */
1141
1149
  export interface ToolParams {
1142
- type: Type11;
1150
+ type: Type10;
1143
1151
  properties: Properties1;
1144
1152
  required: Required1;
1145
1153
  additionalProperties: Additionalproperties1;
@@ -1202,11 +1210,11 @@ export interface ToolEvent {
1202
1210
  working_start: WorkingStart6;
1203
1211
  pending: Pending6;
1204
1212
  event: Event6;
1205
- type: Type12;
1213
+ type: Type11;
1206
1214
  id: Id7;
1207
1215
  function: Function2;
1208
1216
  arguments: Arguments1;
1209
- internal_name: InternalName2;
1217
+ internal: unknown;
1210
1218
  view: ToolCallContent | null;
1211
1219
  result: Result1;
1212
1220
  truncated: Truncated;
@@ -1214,6 +1222,8 @@ export interface ToolEvent {
1214
1222
  events: Events1;
1215
1223
  completed: Completed3;
1216
1224
  working_time: WorkingTime2;
1225
+ agent: Agent;
1226
+ failed: Failed;
1217
1227
  }
1218
1228
  export interface Arguments1 {
1219
1229
  [k: string]: JsonValue;
@@ -1322,7 +1332,7 @@ export interface StepEvent {
1322
1332
  pending: Pending13;
1323
1333
  event: Event13;
1324
1334
  action: Action1;
1325
- type: Type13;
1335
+ type: Type12;
1326
1336
  name: Name11;
1327
1337
  }
1328
1338
  /**
@@ -1334,7 +1344,7 @@ export interface SubtaskEvent {
1334
1344
  pending: Pending14;
1335
1345
  event: Event14;
1336
1346
  name: Name12;
1337
- type: Type14;
1347
+ type: Type13;
1338
1348
  input: Input5;
1339
1349
  result: Result2;
1340
1350
  events: Events2;
@@ -1355,7 +1365,7 @@ export interface Attachments {
1355
1365
  * Limit encontered by sample.
1356
1366
  */
1357
1367
  export interface EvalSampleLimit {
1358
- type: Type15;
1368
+ type: Type14;
1359
1369
  limit: Limit2;
1360
1370
  }
1361
1371
  /**
@@ -0,0 +1,21 @@
1
+ declare module "markdown-it-katex" {
2
+ import MarkdownIt from "markdown-it";
3
+
4
+ interface KatexOptions {
5
+ throwOnError?: boolean;
6
+ errorColor?: string;
7
+ macros?: Record<string, string>;
8
+ fleqn?: boolean;
9
+ trust?: boolean;
10
+ output?: "html" | "htmlAndMathml" | "mathml";
11
+ minRuleThickness?: number;
12
+ colorIsTextColor?: boolean;
13
+ maxSize?: number;
14
+ maxExpand?: number;
15
+ strict?: boolean | string | Function;
16
+ }
17
+
18
+ const markdownItKatex: (md: MarkdownIt, options?: KatexOptions) => void;
19
+
20
+ export default markdownItKatex;
21
+ }
@@ -1,49 +1,105 @@
1
+ import { StateSnapshot } from "react-virtuoso";
1
2
  import {
3
+ AttachmentData,
2
4
  EvalLogHeader,
3
5
  EvalSummary,
6
+ EventData,
4
7
  LogFiles,
8
+ PendingSamples,
5
9
  SampleSummary,
6
10
  } from "./api/types";
7
- import { ContentImage, ContentText, EvalSample } from "./types/log";
8
-
9
- export interface ApplicationState {
10
- logs?: LogFiles;
11
- selectedLogIndex?: number;
12
- logHeaders?: Record<string, EvalLogHeader>;
13
- headersLoading?: boolean;
14
- selectedLog?: CurrentLog;
15
- selectedWorkspaceTab?: string;
16
- selectedSampleIndex?: number;
17
- selectedSample?: EvalSample;
18
- sampleStatus?: "loading" | "ok" | "error";
19
- sampleError?: Error;
20
- selectedSampleTab?: string;
21
- sampleScrollPosition?: number;
22
- showingSampleDialog?: boolean;
23
- status?: AppStatus;
24
- offcanvas?: boolean;
25
- showFind?: boolean;
26
- filter?: ScoreFilter;
27
- epoch?: string;
28
- sort?: string;
29
- scores?: ScoreLabel[];
11
+ import { ScorerInfo } from "./scoring/utils";
12
+ import {
13
+ ApprovalEvent,
14
+ ContentImage,
15
+ ContentText,
16
+ EvalSample,
17
+ InfoEvent,
18
+ LoggerEvent,
19
+ ModelEvent,
20
+ SampleInitEvent,
21
+ SampleLimitEvent,
22
+ SandboxEvent,
23
+ ScoreEvent,
24
+ StateEvent,
25
+ StepEvent,
26
+ StoreEvent,
27
+ SubtaskEvent,
28
+ ToolEvent,
29
+ } from "./types/log";
30
+
31
+ export interface AppState {
32
+ status: AppStatus;
33
+ offcanvas: boolean;
34
+ showFind: boolean;
35
+ tabs: {
36
+ workspace: string;
37
+ sample: string;
38
+ };
39
+ dialogs: {
40
+ sample: boolean;
41
+ };
42
+ scrollPositions: Record<string, number>;
43
+ listPositions: Record<string, StateSnapshot>;
44
+ collapsed: Record<string, boolean>;
45
+ messages: Record<string, boolean>;
46
+ propertyBags: Record<string, Record<string, unknown>>;
47
+ }
48
+
49
+ export interface LogsState {
50
+ logs: LogFiles;
51
+ logHeaders: Record<string, EvalLogHeader>;
52
+ headersLoading: boolean;
53
+ selectedLogIndex: number;
54
+ }
55
+
56
+ export interface LogState {
57
+ loadedLog?: string;
58
+
59
+ selectedSampleIndex: number;
60
+ selectedLogSummary?: EvalSummary;
61
+ pendingSampleSummaries?: PendingSamples;
62
+
63
+ filter: ScoreFilter;
64
+ epoch: string;
65
+ sort: string;
30
66
  score?: ScoreLabel;
31
- filteredSamples?: SampleSummary[];
32
- groupBy?: "none" | "epoch" | "sample";
33
- groupByOrder?: "asc" | "desc";
34
- workspaceTabScrollPosition?: Record<string, number>;
67
+ scores?: ScorerInfo[];
35
68
  }
36
69
 
70
+ export type SampleStatus = "ok" | "loading" | "streaming" | "error";
71
+
72
+ export interface SampleState {
73
+ selectedSample: EvalSample | undefined;
74
+ sampleStatus: SampleStatus;
75
+ sampleError: Error | undefined;
76
+
77
+ // Events and attachments
78
+ runningEvents: Event[];
79
+ }
80
+
81
+ export type Event =
82
+ | SampleInitEvent
83
+ | SampleLimitEvent
84
+ | SandboxEvent
85
+ | StateEvent
86
+ | StoreEvent
87
+ | ModelEvent
88
+ | ToolEvent
89
+ | ApprovalEvent
90
+ | InputEvent
91
+ | ScoreEvent
92
+ | ErrorEvent
93
+ | LoggerEvent
94
+ | InfoEvent
95
+ | StepEvent
96
+ | SubtaskEvent;
97
+
37
98
  export interface AppStatus {
38
99
  loading: boolean;
39
100
  error?: Error;
40
101
  }
41
102
 
42
- export interface Capabilities {
43
- downloadFiles: boolean;
44
- webWorkers: boolean;
45
- }
46
-
47
103
  export interface CurrentLog {
48
104
  name: string;
49
105
  contents: EvalSummary;
@@ -69,3 +125,9 @@ export interface ContentTool {
69
125
  type: "tool";
70
126
  content: (ContentImage | ContentText)[];
71
127
  }
128
+
129
+ export interface RunningSampleData {
130
+ events: Map<string, EventData>;
131
+ attachments: Map<string, AttachmentData>;
132
+ summary?: SampleSummary;
133
+ }
@@ -1,40 +1,75 @@
1
- /**
2
- * Resolves individual value by replacing protocol references with attachment content
3
- */
4
- export const resolveAttachments = (
5
- value: any,
1
+ export const resolveAttachments = <T>(
2
+ value: T,
6
3
  attachments: Record<string, string>,
7
- ): any => {
8
- const kContentProtocol = "tc://";
9
- const kAttachmentProtocol = "attachment://";
4
+ ): T => {
5
+ const CONTENT_PROTOCOL = "tc://";
6
+ const ATTACHMENT_PROTOCOL = "attachment://";
7
+
8
+ // Handle null or undefined early
9
+ if (value === null || value === undefined) {
10
+ return value;
11
+ }
10
12
 
11
13
  // Handle arrays recursively
12
14
  if (Array.isArray(value)) {
13
- return value.map((v) => resolveAttachments(v, attachments));
15
+ let hasChanged = false;
16
+ const resolvedArray = value.map((v) => {
17
+ const resolved = resolveAttachments(v, attachments);
18
+ if (resolved !== v) hasChanged = true;
19
+ return resolved;
20
+ });
21
+
22
+ // Only return the new array if something actually changed
23
+ return hasChanged ? (resolvedArray as unknown as T) : value;
14
24
  }
15
25
 
16
- // Handle objects recursively
17
- if (value && typeof value === "object") {
26
+ // Handle objects recursively, but skip Date instances and other special object types
27
+ if (
28
+ typeof value === "object" &&
29
+ !(value instanceof Date) &&
30
+ !(value instanceof RegExp)
31
+ ) {
32
+ let hasChanged = false;
18
33
  const resolvedObject: Record<string, unknown> = {};
19
- for (const key of Object.keys(value)) {
20
- resolvedObject[key] = resolveAttachments(value[key], attachments);
34
+
35
+ for (const [key, val] of Object.entries(value)) {
36
+ const resolved = resolveAttachments(val, attachments);
37
+ resolvedObject[key] = resolved;
38
+
39
+ // Track if anything changed
40
+ if (resolved !== val) hasChanged = true;
21
41
  }
22
- return resolvedObject;
42
+
43
+ // Only return the new object if something actually changed
44
+ return hasChanged ? (resolvedObject as unknown as T) : value;
23
45
  }
24
46
 
25
47
  // Handle string values with protocol references
26
48
  if (typeof value === "string") {
27
- let resolvedValue = value;
28
- if (resolvedValue.startsWith(kContentProtocol)) {
29
- resolvedValue = resolvedValue.replace(
30
- kContentProtocol,
31
- kAttachmentProtocol,
32
- );
49
+ // Check if the string starts with the content protocol
50
+ if (value.startsWith(CONTENT_PROTOCOL)) {
51
+ const updatedValue = value.replace(CONTENT_PROTOCOL, ATTACHMENT_PROTOCOL);
52
+
53
+ // Now check if it's an attachment reference
54
+ if (updatedValue.startsWith(ATTACHMENT_PROTOCOL)) {
55
+ const attachmentId = updatedValue.slice(ATTACHMENT_PROTOCOL.length);
56
+ const attachment = attachments[attachmentId];
57
+
58
+ // Return the attachment content if it exists, otherwise return the original string
59
+ return (attachment !== undefined ? attachment : value) as unknown as T;
60
+ }
61
+
62
+ return updatedValue as unknown as T;
33
63
  }
34
- if (resolvedValue.startsWith(kAttachmentProtocol)) {
35
- return attachments[resolvedValue.replace(kAttachmentProtocol, "")];
64
+
65
+ // Check if it's directly an attachment reference
66
+ if (value.startsWith(ATTACHMENT_PROTOCOL)) {
67
+ const attachmentId = value.slice(ATTACHMENT_PROTOCOL.length);
68
+ const attachment = attachments[attachmentId];
69
+
70
+ // Return the attachment content if it exists, otherwise return the original string
71
+ return (attachment !== undefined ? attachment : value) as unknown as T;
36
72
  }
37
- return resolvedValue;
38
73
  }
39
74
 
40
75
  // Return unchanged for other types
@@ -1,43 +1,110 @@
1
1
  export const asyncJsonParse = async (text: string): Promise<any> => {
2
+ // Encode the input text
2
3
  const encoder = new TextEncoder();
3
4
  const encodedText = encoder.encode(text);
5
+
6
+ // Create a worker from the inline script
4
7
  const blob = new Blob([kWorkerCode], { type: "application/javascript" });
5
8
  const blobURL = URL.createObjectURL(blob);
6
9
  const worker = new Worker(blobURL);
10
+
7
11
  try {
8
12
  const result = new Promise((resolve, reject) => {
9
13
  worker.onmessage = function (e) {
10
14
  if (e.data.success) {
11
- resolve(e.data.result);
15
+ if (e.data.serialized) {
16
+ // Deserialize the result if it was sent as a transferable
17
+ const decoder = new TextDecoder();
18
+ const resultString = decoder.decode(e.data.result);
19
+ resolve(JSON.parse(resultString));
20
+ } else {
21
+ resolve(e.data.result);
22
+ }
12
23
  } else {
13
- reject(new Error(e.data.error));
24
+ const error = new Error(e.data.error);
25
+ if (e.data.stack) {
26
+ error.stack = e.data.stack;
27
+ }
28
+ reject(error);
14
29
  }
15
30
  };
31
+
16
32
  worker.onerror = function (error) {
17
- reject(new Error(error.message));
33
+ reject(new Error(`Worker error: ${error.message}`));
18
34
  };
19
35
  });
20
- worker.postMessage({ scriptContent: kJson5ScriptBase64, encodedText }, [
21
- encodedText.buffer,
22
- ]);
36
+
37
+ // Transfer the encoded text buffer to the worker
38
+ worker.postMessage(
39
+ {
40
+ scriptContent: kJson5ScriptBase64,
41
+ encodedText,
42
+ },
43
+ [encodedText.buffer],
44
+ );
45
+
23
46
  return await result;
24
47
  } finally {
48
+ // Clean up resources
25
49
  worker.terminate();
26
50
  URL.revokeObjectURL(blobURL);
27
51
  }
28
52
  };
29
53
 
30
54
  const kWorkerCode = `
55
+ // Store the JSON5 parser once loaded
56
+ let JSON5 = null;
57
+
31
58
  self.onmessage = function (e) {
32
- eval(atob(e.data.scriptContent));
33
- const { encodedText } = e.data;
34
- const decoder = new TextDecoder();
35
- const text = decoder.decode(encodedText);
59
+ const { encodedText, scriptContent } = e.data;
60
+
36
61
  try {
62
+ // Only load the JSON5 script if we haven't done so yet
63
+ if (!JSON5) {
64
+ const script = atob(scriptContent);
65
+
66
+ new Function(script)();
67
+ // Verify it was loaded properly
68
+ if (typeof self.JSON5 !== 'object' || typeof self.JSON5.parse !== 'function') {
69
+ throw new Error('Failed to initialize JSON5 parser');
70
+ }
71
+ JSON5 = self.JSON5;
72
+ }
73
+
74
+ // Decode the text using TextDecoder
75
+ const decoder = new TextDecoder();
76
+ const text = decoder.decode(encodedText);
77
+
78
+ // Parse with JSON5
37
79
  const result = JSON5.parse(text);
38
- postMessage({ success: true, result });
80
+
81
+ if (result && typeof result === 'object' &&
82
+ (Array.isArray(result) ? result.length > 10000 : Object.keys(result).length > 10000)) {
83
+
84
+ // Large result, use transferrable object
85
+ const resultString = JSON.stringify(result);
86
+ const encoder = new TextEncoder();
87
+ const serialized = encoder.encode(resultString);
88
+
89
+ postMessage({
90
+ success: true,
91
+ serialized: true,
92
+ result: serialized
93
+ }, [serialized.buffer]);
94
+ } else {
95
+ // Small results, send directly
96
+ postMessage({
97
+ success: true,
98
+ serialized: false,
99
+ result: result
100
+ });
101
+ }
39
102
  } catch (err) {
40
- postMessage({ success: false, error: err.message });
103
+ postMessage({
104
+ success: false,
105
+ error: err.message,
106
+ stack: err.stack || ''
107
+ });
41
108
  }
42
109
  };`;
43
110