inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. inspect_ai/_cli/eval.py +27 -9
  2. inspect_ai/_display/core/display.py +2 -0
  3. inspect_ai/_display/core/footer.py +13 -3
  4. inspect_ai/_display/plain/display.py +6 -2
  5. inspect_ai/_display/rich/display.py +19 -6
  6. inspect_ai/_display/textual/app.py +9 -3
  7. inspect_ai/_display/textual/display.py +4 -0
  8. inspect_ai/_display/textual/widgets/samples.py +4 -10
  9. inspect_ai/_display/textual/widgets/transcript.py +35 -18
  10. inspect_ai/_eval/eval.py +14 -2
  11. inspect_ai/_eval/evalset.py +6 -1
  12. inspect_ai/_eval/run.py +6 -0
  13. inspect_ai/_eval/task/run.py +49 -23
  14. inspect_ai/_eval/task/task.py +26 -3
  15. inspect_ai/_util/content.py +20 -1
  16. inspect_ai/_util/interrupt.py +6 -0
  17. inspect_ai/_util/logger.py +19 -0
  18. inspect_ai/_util/rich.py +7 -8
  19. inspect_ai/_util/text.py +13 -0
  20. inspect_ai/_util/transcript.py +20 -6
  21. inspect_ai/_util/working.py +50 -0
  22. inspect_ai/_view/www/App.css +6 -0
  23. inspect_ai/_view/www/dist/assets/index.css +171 -99
  24. inspect_ai/_view/www/dist/assets/index.js +5972 -2770
  25. inspect_ai/_view/www/eslint.config.mjs +24 -1
  26. inspect_ai/_view/www/log-schema.json +619 -21
  27. inspect_ai/_view/www/package.json +8 -3
  28. inspect_ai/_view/www/src/App.tsx +2 -2
  29. inspect_ai/_view/www/src/appearance/icons.ts +3 -1
  30. inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
  31. inspect_ai/_view/www/src/components/Card.tsx +9 -8
  32. inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
  33. inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
  34. inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
  35. inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
  36. inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
  37. inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
  38. inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
  39. inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
  40. inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
  41. inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
  42. inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
  43. inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
  44. inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
  45. inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
  46. inspect_ai/_view/www/src/index.tsx +2 -2
  47. inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
  48. inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
  49. inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
  50. inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
  51. inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
  52. inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
  53. inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
  54. inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
  55. inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
  56. inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
  57. inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
  58. inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
  59. inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
  60. inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
  61. inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
  62. inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
  63. inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
  64. inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
  65. inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
  66. inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
  67. inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
  68. inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
  69. inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
  70. inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
  71. inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
  72. inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
  73. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
  74. inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
  75. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
  76. inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
  77. inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
  78. inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
  79. inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
  80. inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
  81. inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
  82. inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
  83. inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
  84. inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
  85. inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
  86. inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
  87. inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
  88. inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
  89. inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
  90. inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
  91. inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
  92. inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
  93. inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
  94. inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
  95. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
  96. inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
  97. inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
  98. inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
  99. inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
  100. inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
  101. inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
  102. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
  103. inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
  104. inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
  105. inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
  106. inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
  107. inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
  108. inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
  109. inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
  110. inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
  111. inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
  112. inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
  113. inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
  114. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
  115. inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
  116. inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
  117. inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
  118. inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
  119. inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
  120. inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
  121. inspect_ai/_view/www/src/types/log.d.ts +312 -137
  122. inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
  123. inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
  124. inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
  125. inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
  126. inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
  127. inspect_ai/_view/www/src/utils/format.ts +8 -5
  128. inspect_ai/_view/www/src/utils/json.ts +24 -0
  129. inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
  130. inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
  131. inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
  132. inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
  133. inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
  134. inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
  135. inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
  136. inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
  137. inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
  138. inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
  139. inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
  140. inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
  141. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
  142. inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
  143. inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
  144. inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
  145. inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
  146. inspect_ai/_view/www/yarn.lock +241 -5
  147. inspect_ai/log/__init__.py +2 -0
  148. inspect_ai/log/_condense.py +4 -0
  149. inspect_ai/log/_log.py +72 -12
  150. inspect_ai/log/_recorders/eval.py +6 -1
  151. inspect_ai/log/_samples.py +5 -1
  152. inspect_ai/log/_transcript.py +89 -2
  153. inspect_ai/model/__init__.py +2 -0
  154. inspect_ai/model/_call_tools.py +8 -1
  155. inspect_ai/model/_chat_message.py +22 -7
  156. inspect_ai/model/_conversation.py +11 -9
  157. inspect_ai/model/_generate_config.py +25 -4
  158. inspect_ai/model/_model.py +164 -72
  159. inspect_ai/model/_model_call.py +10 -3
  160. inspect_ai/model/_model_output.py +3 -0
  161. inspect_ai/model/_openai.py +106 -40
  162. inspect_ai/model/_providers/anthropic.py +145 -26
  163. inspect_ai/model/_providers/bedrock.py +7 -0
  164. inspect_ai/model/_providers/cloudflare.py +20 -7
  165. inspect_ai/model/_providers/google.py +29 -8
  166. inspect_ai/model/_providers/groq.py +66 -27
  167. inspect_ai/model/_providers/hf.py +6 -0
  168. inspect_ai/model/_providers/mistral.py +78 -51
  169. inspect_ai/model/_providers/openai.py +66 -4
  170. inspect_ai/model/_providers/openai_o1.py +10 -0
  171. inspect_ai/model/_providers/providers.py +2 -2
  172. inspect_ai/model/_providers/util/tracker.py +92 -0
  173. inspect_ai/model/_providers/vllm.py +13 -5
  174. inspect_ai/model/_reasoning.py +15 -2
  175. inspect_ai/scorer/_model.py +23 -19
  176. inspect_ai/solver/_basic_agent.py +1 -3
  177. inspect_ai/solver/_bridge/patch.py +0 -2
  178. inspect_ai/solver/_human_agent/agent.py +14 -10
  179. inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
  180. inspect_ai/solver/_human_agent/commands/submit.py +76 -30
  181. inspect_ai/solver/_limit.py +4 -4
  182. inspect_ai/solver/_plan.py +0 -3
  183. inspect_ai/solver/_task_state.py +7 -0
  184. inspect_ai/tool/__init__.py +2 -0
  185. inspect_ai/tool/_tool.py +3 -1
  186. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
  187. inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
  188. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
  189. inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
  190. inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
  191. inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
  192. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
  193. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
  194. inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
  195. inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
  196. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
  197. inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
  198. inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
  199. inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
  200. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
  201. inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
  202. inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
  203. inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
  204. inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
  205. inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
  206. inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
  207. inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
  208. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
  209. inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
  210. inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
  211. inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
  212. inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
  213. inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
  214. inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
  215. inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
  216. inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
  217. inspect_ai/tool/_tools/_web_search.py +3 -3
  218. inspect_ai/util/__init__.py +2 -1
  219. inspect_ai/util/_concurrency.py +14 -8
  220. inspect_ai/util/_display.py +12 -0
  221. inspect_ai/util/_sandbox/context.py +15 -0
  222. inspect_ai/util/_sandbox/docker/docker.py +7 -5
  223. inspect_ai/util/_sandbox/environment.py +32 -1
  224. inspect_ai/util/_sandbox/events.py +183 -0
  225. inspect_ai/util/_sandbox/local.py +3 -3
  226. inspect_ai/util/_sandbox/self_check.py +131 -43
  227. inspect_ai/util/_subtask.py +11 -0
  228. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
  229. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
  230. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
  231. inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
  232. inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
  233. inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
  234. inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
  235. inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
  236. inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
  237. inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
  238. inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
  239. inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
  240. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
  241. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
  242. {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
@@ -39,6 +39,7 @@ export type FailOnError = boolean | number | null;
39
39
  export type MessageLimit = number | null;
40
40
  export type TokenLimit = number | null;
41
41
  export type TimeLimit = number | null;
42
+ export type WorkingLimit = number | null;
42
43
  export type MaxSamples = number | null;
43
44
  export type MaxTasks = number | null;
44
45
  export type MaxSubprocesses = number | null;
@@ -52,7 +53,30 @@ export type Type = "git";
52
53
  export type Origin = string;
53
54
  export type Commit = string;
54
55
  export type Metadata = {} | null;
56
+ export type Scorers = EvalScorer[] | null;
55
57
  export type Name2 = string;
58
+ export type Options = {} | null;
59
+ export type Metrics =
60
+ | (
61
+ | EvalMetricDefinition
62
+ | {
63
+ [k: string]: EvalMetricDefinition[];
64
+ }
65
+ )[]
66
+ | {
67
+ [k: string]: EvalMetricDefinition[];
68
+ }
69
+ | null;
70
+ export type Name3 = string;
71
+ export type Options1 = {} | null;
72
+ export type Metadata1 = {} | null;
73
+ export type Metrics1 =
74
+ | EvalMetricDefinition[]
75
+ | {
76
+ [k: string]: EvalMetricDefinition[];
77
+ }
78
+ | null;
79
+ export type Name4 = string;
56
80
  export type Solver1 = string;
57
81
  export type Steps = EvalPlanStep[];
58
82
  export type MaxRetries = number | null;
@@ -79,18 +103,19 @@ export type InternalTools = boolean | null;
79
103
  export type MaxToolOutput = number | null;
80
104
  export type CachePrompt = "auto" | boolean | null;
81
105
  export type ReasoningEffort = ("low" | "medium" | "high") | null;
82
- export type ReasoningHistory = boolean | null;
106
+ export type ReasoningTokens = number | null;
107
+ export type ReasoningHistory = ("none" | "all" | "last" | "auto") | null;
83
108
  export type TotalSamples = number;
84
109
  export type CompletedSamples = number;
85
- export type Name3 = string;
110
+ export type Name5 = string;
86
111
  export type Scorer = string;
87
112
  export type Reducer = string | null;
88
- export type Name4 = string;
113
+ export type Name6 = string;
89
114
  export type Value = number;
90
- export type Metadata1 = {} | null;
91
115
  export type Metadata2 = {} | null;
92
- export type Scores = EvalScore[];
93
116
  export type Metadata3 = {} | null;
117
+ export type Scores = EvalScore[];
118
+ export type Metadata4 = {} | null;
94
119
  export type StartedAt = string;
95
120
  export type CompletedAt = string;
96
121
  export type InputTokens = number;
@@ -98,6 +123,7 @@ export type OutputTokens = number;
98
123
  export type TotalTokens = number;
99
124
  export type InputTokensCacheWrite = number | null;
100
125
  export type InputTokensCacheRead = number | null;
126
+ export type ReasoningTokens1 = number | null;
101
127
  export type Message = string;
102
128
  export type Traceback = string;
103
129
  export type TracebackAnsi = string;
@@ -115,47 +141,74 @@ export type Input =
115
141
  export type Role = "system";
116
142
  export type Content =
117
143
  | string
118
- | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
144
+ | (
145
+ | ContentText
146
+ | ContentReasoning
147
+ | ContentImage
148
+ | ContentAudio
149
+ | ContentVideo
150
+ )[];
119
151
  export type Type1 = "text";
120
152
  export type Text = string;
121
- export type Type2 = "image";
153
+ export type Type2 = "reasoning";
154
+ export type Reasoning = string;
155
+ export type Signature = string | null;
156
+ export type Redacted = boolean;
157
+ export type Type3 = "image";
122
158
  export type Image = string;
123
159
  export type Detail = "auto" | "low" | "high";
124
- export type Type3 = "audio";
160
+ export type Type4 = "audio";
125
161
  export type Audio = string;
126
162
  export type Format = "wav" | "mp3";
127
- export type Type4 = "video";
163
+ export type Type5 = "video";
128
164
  export type Video = string;
129
165
  export type Format1 = "mp4" | "mpeg" | "mov";
130
166
  export type Source = ("input" | "generate") | null;
131
167
  export type Role1 = "user";
132
168
  export type Content1 =
133
169
  | string
134
- | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
170
+ | (
171
+ | ContentText
172
+ | ContentReasoning
173
+ | ContentImage
174
+ | ContentAudio
175
+ | ContentVideo
176
+ )[];
135
177
  export type Source1 = ("input" | "generate") | null;
136
178
  export type ToolCallId = string[] | null;
137
179
  export type Role2 = "assistant";
138
180
  export type Content2 =
139
181
  | string
140
- | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
182
+ | (
183
+ | ContentText
184
+ | ContentReasoning
185
+ | ContentImage
186
+ | ContentAudio
187
+ | ContentVideo
188
+ )[];
141
189
  export type Source2 = ("input" | "generate") | null;
142
190
  export type ToolCalls = ToolCall[] | null;
143
191
  export type Id1 = string;
144
192
  export type Function = string;
145
- export type Type5 = "function";
193
+ export type Type6 = "function";
146
194
  export type ParseError = string | null;
147
195
  export type Title = string | null;
148
196
  export type Format2 = "text" | "markdown";
149
197
  export type Content3 = string;
150
- export type Reasoning = string | null;
151
198
  export type Role3 = "tool";
152
199
  export type Content4 =
153
200
  | string
154
- | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
201
+ | (
202
+ | ContentText
203
+ | ContentReasoning
204
+ | ContentImage
205
+ | ContentAudio
206
+ | ContentVideo
207
+ )[];
155
208
  export type Source3 = ("input" | "generate") | null;
156
209
  export type ToolCallId1 = string | null;
157
210
  export type Function1 = string | null;
158
- export type Type6 =
211
+ export type Type7 =
159
212
  | "parsing"
160
213
  | "timeout"
161
214
  | "unicode_decode"
@@ -194,7 +247,7 @@ export type Bytes1 = number[] | null;
194
247
  export type Content5 = Logprob[];
195
248
  export type Choices1 = ChatCompletionChoice[];
196
249
  export type Time = number | null;
197
- export type Metadata4 = {} | null;
250
+ export type Metadata5 = {} | null;
198
251
  export type Error = string | null;
199
252
  export type Scores1 = {
200
253
  [k: string]: Score;
@@ -209,8 +262,9 @@ export type Value1 =
209
262
  };
210
263
  export type Answer = string | null;
211
264
  export type Explanation = string | null;
212
- export type Metadata5 = {} | null;
265
+ export type Metadata6 = {} | null;
213
266
  export type Timestamp = string;
267
+ export type WorkingStart = number;
214
268
  export type Pending = boolean | null;
215
269
  export type Event = "sample_init";
216
270
  export type Input1 =
@@ -224,43 +278,67 @@ export type Input1 =
224
278
  export type Choices2 = string[] | null;
225
279
  export type Target1 = string | string[];
226
280
  export type Id2 = number | string | null;
227
- export type Metadata7 = {} | null;
281
+ export type Metadata8 = {} | null;
228
282
  export type Files1 = {
229
283
  [k: string]: string;
230
284
  } | null;
231
285
  export type Setup1 = string | null;
232
286
  export type JsonValue = unknown;
233
287
  export type Timestamp1 = string;
288
+ export type WorkingStart1 = number;
234
289
  export type Pending1 = boolean | null;
235
290
  export type Event1 = "sample_limit";
236
- export type Type7 = "message" | "time" | "token" | "operator" | "custom";
291
+ export type Type8 =
292
+ | "message"
293
+ | "time"
294
+ | "working"
295
+ | "token"
296
+ | "operator"
297
+ | "custom";
237
298
  export type Message2 = string;
238
299
  export type Limit1 = number | null;
239
300
  export type Timestamp2 = string;
301
+ export type WorkingStart2 = number;
240
302
  export type Pending2 = boolean | null;
241
- export type Event2 = "state";
303
+ export type Event2 = "sandbox";
304
+ export type Action = "exec" | "read_file" | "write_file";
305
+ export type Cmd = string | null;
306
+ export type Options2 = {
307
+ [k: string]: JsonValue;
308
+ } | null;
309
+ export type File = string | null;
310
+ export type Input2 = string | null;
311
+ export type Result = number | null;
312
+ export type Output = string | null;
313
+ export type Completed = string | null;
314
+ export type Timestamp3 = string;
315
+ export type WorkingStart3 = number;
316
+ export type Pending3 = boolean | null;
317
+ export type Event3 = "state";
242
318
  export type Op = "remove" | "add" | "replace" | "move" | "test" | "copy";
243
319
  export type Path = string;
244
320
  export type From = string | null;
245
321
  export type Changes = JsonChange[];
246
- export type Timestamp3 = string;
247
- export type Pending3 = boolean | null;
248
- export type Event3 = "store";
249
- export type Changes1 = JsonChange[];
250
322
  export type Timestamp4 = string;
323
+ export type WorkingStart4 = number;
251
324
  export type Pending4 = boolean | null;
252
- export type Event4 = "model";
325
+ export type Event4 = "store";
326
+ export type Changes1 = JsonChange[];
327
+ export type Timestamp5 = string;
328
+ export type WorkingStart5 = number;
329
+ export type Pending5 = boolean | null;
330
+ export type Event5 = "model";
253
331
  export type Model2 = string;
254
- export type Input2 = (
332
+ export type Input3 = (
255
333
  | ChatMessageSystem
256
334
  | ChatMessageUser
257
335
  | ChatMessageAssistant
258
336
  | ChatMessageTool
259
337
  )[];
260
- export type Name5 = string;
338
+ export type Name7 = string;
261
339
  export type Description = string;
262
- export type Type8 = "object";
263
- export type Type9 =
340
+ export type Type9 = "object";
341
+ export type Type10 =
264
342
  | ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
265
343
  | null;
266
344
  export type Description1 = string | null;
@@ -275,28 +353,40 @@ export type Required1 = string[];
275
353
  export type Additionalproperties1 = boolean;
276
354
  export type Tools1 = ToolInfo[];
277
355
  export type ToolChoice = ("auto" | "any" | "none") | ToolFunction;
278
- export type Name6 = string;
356
+ export type Name8 = string;
279
357
  export type Error1 = string | null;
280
358
  export type Cache = ("read" | "write") | null;
281
- export type Timestamp5 = string;
282
- export type Pending5 = boolean | null;
283
- export type Event5 = "tool";
284
- export type Type10 = "function";
359
+ export type Time1 = number | null;
360
+ export type Completed1 = string | null;
361
+ export type WorkingTime = number | null;
362
+ export type Timestamp6 = string;
363
+ export type WorkingStart6 = number;
364
+ export type Pending6 = boolean | null;
365
+ export type Event6 = "tool";
366
+ export type Type11 = "function";
285
367
  export type Id3 = string;
286
368
  export type Function2 = string;
287
- export type Result =
369
+ export type Result1 =
288
370
  | string
289
371
  | number
290
372
  | boolean
291
373
  | ContentText
374
+ | ContentReasoning
292
375
  | ContentImage
293
376
  | ContentAudio
294
377
  | ContentVideo
295
- | (ContentText | ContentImage | ContentAudio | ContentVideo)[];
378
+ | (
379
+ | ContentText
380
+ | ContentReasoning
381
+ | ContentImage
382
+ | ContentAudio
383
+ | ContentVideo
384
+ )[];
296
385
  export type Truncated = [unknown, unknown] | null;
297
- export type Timestamp6 = string;
298
- export type Pending6 = boolean | null;
299
- export type Event6 = "approval";
386
+ export type Timestamp7 = string;
387
+ export type WorkingStart7 = number;
388
+ export type Pending7 = boolean | null;
389
+ export type Event7 = "approval";
300
390
  export type Message3 = string;
301
391
  export type Approver = string;
302
392
  export type Decision =
@@ -306,23 +396,27 @@ export type Decision =
306
396
  | "escalate"
307
397
  | "terminate";
308
398
  export type Explanation1 = string | null;
309
- export type Timestamp7 = string;
310
- export type Pending7 = boolean | null;
311
- export type Event7 = "input";
312
- export type Input3 = string;
313
- export type InputAnsi = string;
314
399
  export type Timestamp8 = string;
400
+ export type WorkingStart8 = number;
315
401
  export type Pending8 = boolean | null;
316
- export type Event8 = "score";
317
- export type Target2 = string | string[] | null;
318
- export type Intermediate = boolean;
402
+ export type Event8 = "input";
403
+ export type Input4 = string;
404
+ export type InputAnsi = string;
319
405
  export type Timestamp9 = string;
406
+ export type WorkingStart9 = number;
320
407
  export type Pending9 = boolean | null;
321
- export type Event9 = "error";
408
+ export type Event9 = "score";
409
+ export type Target2 = string | string[] | null;
410
+ export type Intermediate = boolean;
322
411
  export type Timestamp10 = string;
412
+ export type WorkingStart10 = number;
323
413
  export type Pending10 = boolean | null;
324
- export type Event10 = "logger";
325
- export type Name7 = string | null;
414
+ export type Event10 = "error";
415
+ export type Timestamp11 = string;
416
+ export type WorkingStart11 = number;
417
+ export type Pending11 = boolean | null;
418
+ export type Event11 = "logger";
419
+ export type Name9 = string | null;
326
420
  export type Level =
327
421
  | "debug"
328
422
  | "trace"
@@ -337,24 +431,28 @@ export type Created1 = number;
337
431
  export type Filename = string;
338
432
  export type Module = string;
339
433
  export type Lineno = number;
340
- export type Timestamp11 = string;
341
- export type Pending11 = boolean | null;
342
- export type Event11 = "info";
343
- export type Source4 = string | null;
344
434
  export type Timestamp12 = string;
435
+ export type WorkingStart12 = number;
345
436
  export type Pending12 = boolean | null;
346
- export type Event12 = "step";
347
- export type Action = "begin" | "end";
348
- export type Type11 = string | null;
349
- export type Name8 = string;
437
+ export type Event12 = "info";
438
+ export type Source4 = string | null;
350
439
  export type Timestamp13 = string;
440
+ export type WorkingStart13 = number;
351
441
  export type Pending13 = boolean | null;
352
- export type Event13 = "subtask";
353
- export type Name9 = string;
442
+ export type Event13 = "step";
443
+ export type Action1 = "begin" | "end";
354
444
  export type Type12 = string | null;
445
+ export type Name10 = string;
446
+ export type Timestamp14 = string;
447
+ export type WorkingStart14 = number;
448
+ export type Pending14 = boolean | null;
449
+ export type Event14 = "subtask";
450
+ export type Name11 = string;
451
+ export type Type13 = string | null;
355
452
  export type Events2 = (
356
453
  | SampleInitEvent
357
454
  | SampleLimitEvent
455
+ | SandboxEvent
358
456
  | StateEvent
359
457
  | StoreEvent
360
458
  | ModelEvent
@@ -368,9 +466,12 @@ export type Events2 = (
368
466
  | StepEvent
369
467
  | SubtaskEvent
370
468
  )[];
469
+ export type Completed2 = string | null;
470
+ export type WorkingTime1 = number | null;
371
471
  export type Events1 = (
372
472
  | SampleInitEvent
373
473
  | SampleLimitEvent
474
+ | SandboxEvent
374
475
  | StateEvent
375
476
  | StoreEvent
376
477
  | ModelEvent
@@ -384,9 +485,12 @@ export type Events1 = (
384
485
  | StepEvent
385
486
  | SubtaskEvent
386
487
  )[];
488
+ export type Completed3 = string | null;
489
+ export type WorkingTime2 = number | null;
387
490
  export type Events = (
388
491
  | SampleInitEvent
389
492
  | SampleLimitEvent
493
+ | SandboxEvent
390
494
  | StateEvent
391
495
  | StoreEvent
392
496
  | ModelEvent
@@ -400,9 +504,13 @@ export type Events = (
400
504
  | StepEvent
401
505
  | SubtaskEvent
402
506
  )[];
403
- export type Type13 =
507
+ export type TotalTime = number | null;
508
+ export type WorkingTime3 = number | null;
509
+ export type Uuid = string | null;
510
+ export type Type14 =
404
511
  | "context"
405
512
  | "time"
513
+ | "working"
406
514
  | "message"
407
515
  | "token"
408
516
  | "operator"
@@ -421,7 +529,7 @@ export type Value2 =
421
529
  };
422
530
  export type Answer1 = string | null;
423
531
  export type Explanation2 = string | null;
424
- export type Metadata8 = {} | null;
532
+ export type Metadata9 = {} | null;
425
533
  export type SampleId1 = string | number | null;
426
534
  export type Samples2 = EvalSampleScore[];
427
535
  export type Location1 = string;
@@ -465,6 +573,8 @@ export interface EvalSpec {
465
573
  revision: EvalRevision | null;
466
574
  packages: Packages;
467
575
  metadata: Metadata;
576
+ scorers: Scorers;
577
+ metrics: Metrics1;
468
578
  }
469
579
  export interface TaskAttribs {}
470
580
  export interface TaskArgs {}
@@ -492,6 +602,7 @@ export interface EvalConfig {
492
602
  message_limit: MessageLimit;
493
603
  token_limit: TokenLimit;
494
604
  time_limit: TimeLimit;
605
+ working_limit: WorkingLimit;
495
606
  max_samples: MaxSamples;
496
607
  max_tasks: MaxTasks;
497
608
  max_subprocesses: MaxSubprocesses;
@@ -538,11 +649,21 @@ export interface EvalRevision {
538
649
  export interface Packages {
539
650
  [k: string]: string;
540
651
  }
652
+ export interface EvalScorer {
653
+ name: Name2;
654
+ options: Options;
655
+ metrics: Metrics;
656
+ metadata: Metadata1;
657
+ }
658
+ export interface EvalMetricDefinition {
659
+ name: Name3;
660
+ options: Options1;
661
+ }
541
662
  /**
542
663
  * Plan (solvers) used in evaluation.
543
664
  */
544
665
  export interface EvalPlan {
545
- name: Name2;
666
+ name: Name4;
546
667
  steps: Steps;
547
668
  finish: EvalPlanStep | null;
548
669
  config: GenerateConfig;
@@ -581,6 +702,7 @@ export interface GenerateConfig {
581
702
  max_tool_output: MaxToolOutput;
582
703
  cache_prompt: CachePrompt;
583
704
  reasoning_effort: ReasoningEffort;
705
+ reasoning_tokens: ReasoningTokens;
584
706
  reasoning_history: ReasoningHistory;
585
707
  }
586
708
  /**
@@ -590,31 +712,31 @@ export interface EvalResults {
590
712
  total_samples: TotalSamples;
591
713
  completed_samples: CompletedSamples;
592
714
  scores: Scores;
593
- metadata: Metadata3;
715
+ metadata: Metadata4;
594
716
  }
595
717
  /**
596
718
  * Score for evaluation task.
597
719
  */
598
720
  export interface EvalScore {
599
- name: Name3;
721
+ name: Name5;
600
722
  scorer: Scorer;
601
723
  reducer: Reducer;
602
724
  params: Params2;
603
- metrics: Metrics;
604
- metadata: Metadata2;
725
+ metrics: Metrics2;
726
+ metadata: Metadata3;
605
727
  }
606
728
  export interface Params2 {}
607
- export interface Metrics {
729
+ export interface Metrics2 {
608
730
  [k: string]: EvalMetric;
609
731
  }
610
732
  /**
611
733
  * Metric for evaluation score.
612
734
  */
613
735
  export interface EvalMetric {
614
- name: Name4;
736
+ name: Name6;
615
737
  value: Value;
616
738
  params: Params3;
617
- metadata: Metadata1;
739
+ metadata: Metadata2;
618
740
  }
619
741
  export interface Params3 {}
620
742
  /**
@@ -637,6 +759,7 @@ export interface ModelUsage1 {
637
759
  total_tokens: TotalTokens;
638
760
  input_tokens_cache_write: InputTokensCacheWrite;
639
761
  input_tokens_cache_read: InputTokensCacheRead;
762
+ reasoning_tokens: ReasoningTokens1;
640
763
  }
641
764
  /**
642
765
  * Eval error details.
@@ -661,10 +784,13 @@ export interface EvalSample {
661
784
  messages: Messages;
662
785
  output: ModelOutput;
663
786
  scores: Scores1;
664
- metadata: Metadata6;
787
+ metadata: Metadata7;
665
788
  store: Store;
666
789
  events: Events;
667
790
  model_usage: ModelUsage2;
791
+ total_time: TotalTime;
792
+ working_time: WorkingTime3;
793
+ uuid: Uuid;
668
794
  error: EvalError | null;
669
795
  attachments: Attachments;
670
796
  limit: EvalSampleLimit | null;
@@ -684,11 +810,22 @@ export interface ContentText {
684
810
  type: Type1;
685
811
  text: Text;
686
812
  }
813
+ /**
814
+ * Reasoning content.
815
+ *
816
+ * See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
817
+ */
818
+ export interface ContentReasoning {
819
+ type: Type2;
820
+ reasoning: Reasoning;
821
+ signature: Signature;
822
+ redacted: Redacted;
823
+ }
687
824
  /**
688
825
  * Image content.
689
826
  */
690
827
  export interface ContentImage {
691
- type: Type2;
828
+ type: Type3;
692
829
  image: Image;
693
830
  detail: Detail;
694
831
  }
@@ -696,7 +833,7 @@ export interface ContentImage {
696
833
  * Audio content.
697
834
  */
698
835
  export interface ContentAudio {
699
- type: Type3;
836
+ type: Type4;
700
837
  audio: Audio;
701
838
  format: Format;
702
839
  }
@@ -704,7 +841,7 @@ export interface ContentAudio {
704
841
  * Video content.
705
842
  */
706
843
  export interface ContentVideo {
707
- type: Type4;
844
+ type: Type5;
708
845
  video: Video;
709
846
  format: Format1;
710
847
  }
@@ -725,13 +862,12 @@ export interface ChatMessageAssistant {
725
862
  content: Content2;
726
863
  source: Source2;
727
864
  tool_calls: ToolCalls;
728
- reasoning: Reasoning;
729
865
  }
730
866
  export interface ToolCall {
731
867
  id: Id1;
732
868
  function: Function;
733
869
  arguments: Arguments;
734
- type: Type5;
870
+ type: Type6;
735
871
  parse_error: ParseError;
736
872
  view: ToolCallContent | null;
737
873
  }
@@ -756,7 +892,7 @@ export interface ChatMessageTool {
756
892
  error: ToolCallError | null;
757
893
  }
758
894
  export interface ToolCallError {
759
- type: Type6;
895
+ type: Type7;
760
896
  message: Message1;
761
897
  }
762
898
  /**
@@ -767,7 +903,7 @@ export interface ModelOutput {
767
903
  choices: Choices1;
768
904
  usage: ModelUsage1 | null;
769
905
  time: Time;
770
- metadata: Metadata4;
906
+ metadata: Metadata5;
771
907
  error: Error;
772
908
  }
773
909
  /**
@@ -808,15 +944,16 @@ export interface Score {
808
944
  value: Value1;
809
945
  answer: Answer;
810
946
  explanation: Explanation;
811
- metadata: Metadata5;
947
+ metadata: Metadata6;
812
948
  }
813
- export interface Metadata6 {}
949
+ export interface Metadata7 {}
814
950
  export interface Store {}
815
951
  /**
816
952
  * Beginning of processing a Sample.
817
953
  */
818
954
  export interface SampleInitEvent {
819
955
  timestamp: Timestamp;
956
+ working_start: WorkingStart;
820
957
  pending: Pending;
821
958
  event: Event;
822
959
  sample: Sample;
@@ -830,7 +967,7 @@ export interface Sample {
830
967
  choices: Choices2;
831
968
  target: Target1;
832
969
  id: Id2;
833
- metadata: Metadata7;
970
+ metadata: Metadata8;
834
971
  sandbox: SandboxEnvironmentSpec | null;
835
972
  files: Files1;
836
973
  setup: Setup1;
@@ -840,19 +977,38 @@ export interface Sample {
840
977
  */
841
978
  export interface SampleLimitEvent {
842
979
  timestamp: Timestamp1;
980
+ working_start: WorkingStart1;
843
981
  pending: Pending1;
844
982
  event: Event1;
845
- type: Type7;
983
+ type: Type8;
846
984
  message: Message2;
847
985
  limit: Limit1;
848
986
  }
849
987
  /**
850
- * Change to the current `TaskState`
988
+ * Sandbox execution or I/O
851
989
  */
852
- export interface StateEvent {
990
+ export interface SandboxEvent {
853
991
  timestamp: Timestamp2;
992
+ working_start: WorkingStart2;
854
993
  pending: Pending2;
855
994
  event: Event2;
995
+ action: Action;
996
+ cmd: Cmd;
997
+ options: Options2;
998
+ file: File;
999
+ input: Input2;
1000
+ result: Result;
1001
+ output: Output;
1002
+ completed: Completed;
1003
+ }
1004
+ /**
1005
+ * Change to the current `TaskState`
1006
+ */
1007
+ export interface StateEvent {
1008
+ timestamp: Timestamp3;
1009
+ working_start: WorkingStart3;
1010
+ pending: Pending3;
1011
+ event: Event3;
856
1012
  changes: Changes;
857
1013
  }
858
1014
  /**
@@ -873,20 +1029,22 @@ export interface JsonChange {
873
1029
  * Change to data within the current `Store`.
874
1030
  */
875
1031
  export interface StoreEvent {
876
- timestamp: Timestamp3;
877
- pending: Pending3;
878
- event: Event3;
1032
+ timestamp: Timestamp4;
1033
+ working_start: WorkingStart4;
1034
+ pending: Pending4;
1035
+ event: Event4;
879
1036
  changes: Changes1;
880
1037
  }
881
1038
  /**
882
1039
  * Call to a language model.
883
1040
  */
884
1041
  export interface ModelEvent {
885
- timestamp: Timestamp4;
886
- pending: Pending4;
887
- event: Event4;
1042
+ timestamp: Timestamp5;
1043
+ working_start: WorkingStart5;
1044
+ pending: Pending5;
1045
+ event: Event5;
888
1046
  model: Model2;
889
- input: Input2;
1047
+ input: Input3;
890
1048
  tools: Tools1;
891
1049
  tool_choice: ToolChoice;
892
1050
  config: GenerateConfig1;
@@ -894,6 +1052,8 @@ export interface ModelEvent {
894
1052
  error: Error1;
895
1053
  cache: Cache;
896
1054
  call: ModelCall | null;
1055
+ completed: Completed1;
1056
+ working_time: WorkingTime;
897
1057
  }
898
1058
  /**
899
1059
  * Specification of a tool (JSON Schema compatible)
@@ -922,7 +1082,7 @@ export interface ModelEvent {
922
1082
  * ```
923
1083
  */
924
1084
  export interface ToolInfo {
925
- name: Name5;
1085
+ name: Name7;
926
1086
  description: Description;
927
1087
  parameters: ToolParams;
928
1088
  }
@@ -930,7 +1090,7 @@ export interface ToolInfo {
930
1090
  * Description of tool parameters object in JSON Schema format.
931
1091
  */
932
1092
  export interface ToolParams {
933
- type: Type8;
1093
+ type: Type9;
934
1094
  properties: Properties;
935
1095
  required: Required1;
936
1096
  additionalProperties: Additionalproperties1;
@@ -942,7 +1102,7 @@ export interface Properties {
942
1102
  * Description of tool parameter in JSON Schema format.
943
1103
  */
944
1104
  export interface ToolParam {
945
- type: Type9;
1105
+ type: Type10;
946
1106
  description: Description1;
947
1107
  default: Default;
948
1108
  enum: Enum;
@@ -956,7 +1116,7 @@ export interface Default {
956
1116
  [k: string]: unknown;
957
1117
  }
958
1118
  export interface ToolFunction {
959
- name: Name6;
1119
+ name: Name8;
960
1120
  }
961
1121
  /**
962
1122
  * Model generation options.
@@ -984,6 +1144,7 @@ export interface GenerateConfig1 {
984
1144
  max_tool_output: MaxToolOutput;
985
1145
  cache_prompt: CachePrompt;
986
1146
  reasoning_effort: ReasoningEffort;
1147
+ reasoning_tokens: ReasoningTokens;
987
1148
  reasoning_history: ReasoningHistory;
988
1149
  }
989
1150
  /**
@@ -992,6 +1153,7 @@ export interface GenerateConfig1 {
992
1153
  export interface ModelCall {
993
1154
  request: Request;
994
1155
  response: Response;
1156
+ time: Time1;
995
1157
  }
996
1158
  export interface Request {
997
1159
  [k: string]: JsonValue;
@@ -1003,18 +1165,21 @@ export interface Response {
1003
1165
  * Call to a tool.
1004
1166
  */
1005
1167
  export interface ToolEvent {
1006
- timestamp: Timestamp5;
1007
- pending: Pending5;
1008
- event: Event5;
1009
- type: Type10;
1168
+ timestamp: Timestamp6;
1169
+ working_start: WorkingStart6;
1170
+ pending: Pending6;
1171
+ event: Event6;
1172
+ type: Type11;
1010
1173
  id: Id3;
1011
1174
  function: Function2;
1012
1175
  arguments: Arguments1;
1013
1176
  view: ToolCallContent | null;
1014
- result: Result;
1177
+ result: Result1;
1015
1178
  truncated: Truncated;
1016
1179
  error: ToolCallError | null;
1017
1180
  events: Events1;
1181
+ completed: Completed3;
1182
+ working_time: WorkingTime2;
1018
1183
  }
1019
1184
  export interface Arguments1 {
1020
1185
  [k: string]: JsonValue;
@@ -1023,9 +1188,10 @@ export interface Arguments1 {
1023
1188
  * Tool approval.
1024
1189
  */
1025
1190
  export interface ApprovalEvent {
1026
- timestamp: Timestamp6;
1027
- pending: Pending6;
1028
- event: Event6;
1191
+ timestamp: Timestamp7;
1192
+ working_start: WorkingStart7;
1193
+ pending: Pending7;
1194
+ event: Event7;
1029
1195
  message: Message3;
1030
1196
  call: ToolCall;
1031
1197
  view: ToolCallView | null;
@@ -1048,10 +1214,11 @@ export interface ToolCallView {
1048
1214
  * Input screen interaction.
1049
1215
  */
1050
1216
  export interface InputEvent {
1051
- timestamp: Timestamp7;
1052
- pending: Pending7;
1053
- event: Event7;
1054
- input: Input3;
1217
+ timestamp: Timestamp8;
1218
+ working_start: WorkingStart8;
1219
+ pending: Pending8;
1220
+ event: Event8;
1221
+ input: Input4;
1055
1222
  input_ansi: InputAnsi;
1056
1223
  }
1057
1224
  /**
@@ -1061,9 +1228,10 @@ export interface InputEvent {
1061
1228
  * resulting from a call to `score`.
1062
1229
  */
1063
1230
  export interface ScoreEvent {
1064
- timestamp: Timestamp8;
1065
- pending: Pending8;
1066
- event: Event8;
1231
+ timestamp: Timestamp9;
1232
+ working_start: WorkingStart9;
1233
+ pending: Pending9;
1234
+ event: Event9;
1067
1235
  score: Score;
1068
1236
  target: Target2;
1069
1237
  intermediate: Intermediate;
@@ -1072,25 +1240,27 @@ export interface ScoreEvent {
1072
1240
  * Event with sample error.
1073
1241
  */
1074
1242
  export interface ErrorEvent {
1075
- timestamp: Timestamp9;
1076
- pending: Pending9;
1077
- event: Event9;
1243
+ timestamp: Timestamp10;
1244
+ working_start: WorkingStart10;
1245
+ pending: Pending10;
1246
+ event: Event10;
1078
1247
  error: EvalError;
1079
1248
  }
1080
1249
  /**
1081
1250
  * Log message recorded with Python logger.
1082
1251
  */
1083
1252
  export interface LoggerEvent {
1084
- timestamp: Timestamp10;
1085
- pending: Pending10;
1086
- event: Event10;
1253
+ timestamp: Timestamp11;
1254
+ working_start: WorkingStart11;
1255
+ pending: Pending11;
1256
+ event: Event11;
1087
1257
  message: LoggingMessage;
1088
1258
  }
1089
1259
  /**
1090
1260
  * Message written to Python log.
1091
1261
  */
1092
1262
  export interface LoggingMessage {
1093
- name: Name7;
1263
+ name: Name9;
1094
1264
  level: Level;
1095
1265
  message: Message4;
1096
1266
  created: Created1;
@@ -1102,9 +1272,10 @@ export interface LoggingMessage {
1102
1272
  * Event with custom info/data.
1103
1273
  */
1104
1274
  export interface InfoEvent {
1105
- timestamp: Timestamp11;
1106
- pending: Pending11;
1107
- event: Event11;
1275
+ timestamp: Timestamp12;
1276
+ working_start: WorkingStart12;
1277
+ pending: Pending12;
1278
+ event: Event12;
1108
1279
  source: Source4;
1109
1280
  data: JsonValue;
1110
1281
  }
@@ -1112,28 +1283,32 @@ export interface InfoEvent {
1112
1283
  * Step within current sample or subtask.
1113
1284
  */
1114
1285
  export interface StepEvent {
1115
- timestamp: Timestamp12;
1116
- pending: Pending12;
1117
- event: Event12;
1118
- action: Action;
1119
- type: Type11;
1120
- name: Name8;
1286
+ timestamp: Timestamp13;
1287
+ working_start: WorkingStart13;
1288
+ pending: Pending13;
1289
+ event: Event13;
1290
+ action: Action1;
1291
+ type: Type12;
1292
+ name: Name10;
1121
1293
  }
1122
1294
  /**
1123
1295
  * Subtask spawned.
1124
1296
  */
1125
1297
  export interface SubtaskEvent {
1126
- timestamp: Timestamp13;
1127
- pending: Pending13;
1128
- event: Event13;
1129
- name: Name9;
1130
- type: Type12;
1131
- input: Input4;
1132
- result: Result1;
1298
+ timestamp: Timestamp14;
1299
+ working_start: WorkingStart14;
1300
+ pending: Pending14;
1301
+ event: Event14;
1302
+ name: Name11;
1303
+ type: Type13;
1304
+ input: Input5;
1305
+ result: Result2;
1133
1306
  events: Events2;
1307
+ completed: Completed2;
1308
+ working_time: WorkingTime1;
1134
1309
  }
1135
- export interface Input4 {}
1136
- export interface Result1 {
1310
+ export interface Input5 {}
1311
+ export interface Result2 {
1137
1312
  [k: string]: unknown;
1138
1313
  }
1139
1314
  export interface ModelUsage2 {
@@ -1146,7 +1321,7 @@ export interface Attachments {
1146
1321
  * Limit encontered by sample.
1147
1322
  */
1148
1323
  export interface EvalSampleLimit {
1149
- type: Type13;
1324
+ type: Type14;
1150
1325
  limit: Limit2;
1151
1326
  }
1152
1327
  /**
@@ -1164,6 +1339,6 @@ export interface EvalSampleScore {
1164
1339
  value: Value2;
1165
1340
  answer: Answer1;
1166
1341
  explanation: Explanation2;
1167
- metadata: Metadata8;
1342
+ metadata: Metadata9;
1168
1343
  sample_id: SampleId1;
1169
1344
  }