inspect-ai 0.3.69__py3-none-any.whl → 0.3.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -9
- inspect_ai/_display/core/display.py +2 -0
- inspect_ai/_display/core/footer.py +13 -3
- inspect_ai/_display/plain/display.py +6 -2
- inspect_ai/_display/rich/display.py +19 -6
- inspect_ai/_display/textual/app.py +9 -3
- inspect_ai/_display/textual/display.py +4 -0
- inspect_ai/_display/textual/widgets/samples.py +4 -10
- inspect_ai/_display/textual/widgets/transcript.py +35 -18
- inspect_ai/_eval/eval.py +14 -2
- inspect_ai/_eval/evalset.py +6 -1
- inspect_ai/_eval/run.py +6 -0
- inspect_ai/_eval/task/run.py +49 -23
- inspect_ai/_eval/task/task.py +26 -3
- inspect_ai/_util/content.py +20 -1
- inspect_ai/_util/interrupt.py +6 -0
- inspect_ai/_util/logger.py +19 -0
- inspect_ai/_util/rich.py +7 -8
- inspect_ai/_util/text.py +13 -0
- inspect_ai/_util/transcript.py +20 -6
- inspect_ai/_util/working.py +50 -0
- inspect_ai/_view/www/App.css +6 -0
- inspect_ai/_view/www/dist/assets/index.css +171 -99
- inspect_ai/_view/www/dist/assets/index.js +5972 -2770
- inspect_ai/_view/www/eslint.config.mjs +24 -1
- inspect_ai/_view/www/log-schema.json +619 -21
- inspect_ai/_view/www/package.json +8 -3
- inspect_ai/_view/www/src/App.tsx +2 -2
- inspect_ai/_view/www/src/appearance/icons.ts +3 -1
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +4 -3
- inspect_ai/_view/www/src/components/Card.tsx +9 -8
- inspect_ai/_view/www/src/components/DownloadButton.tsx +2 -1
- inspect_ai/_view/www/src/components/EmptyPanel.tsx +2 -2
- inspect_ai/_view/www/src/components/ErrorPanel.tsx +4 -3
- inspect_ai/_view/www/src/components/ExpandablePanel.tsx +13 -5
- inspect_ai/_view/www/src/components/FindBand.tsx +3 -3
- inspect_ai/_view/www/src/components/HumanBaselineView.tsx +3 -3
- inspect_ai/_view/www/src/components/LabeledValue.tsx +5 -4
- inspect_ai/_view/www/src/components/LargeModal.tsx +18 -13
- inspect_ai/_view/www/src/components/{LightboxCarousel.css → LightboxCarousel.module.css} +22 -18
- inspect_ai/_view/www/src/components/LightboxCarousel.tsx +36 -27
- inspect_ai/_view/www/src/components/MessageBand.tsx +2 -1
- inspect_ai/_view/www/src/components/NavPills.tsx +9 -8
- inspect_ai/_view/www/src/components/ProgressBar.tsx +2 -1
- inspect_ai/_view/www/src/components/TabSet.tsx +21 -15
- inspect_ai/_view/www/src/index.tsx +2 -2
- inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +11 -9
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +3 -2
- inspect_ai/_view/www/src/metadata/MetadataGrid.module.css +1 -0
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +16 -1
- inspect_ai/_view/www/src/plan/DatasetDetailView.tsx +3 -2
- inspect_ai/_view/www/src/plan/DetailStep.tsx +2 -1
- inspect_ai/_view/www/src/plan/PlanCard.tsx +2 -5
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +6 -9
- inspect_ai/_view/www/src/plan/ScorerDetailView.tsx +2 -1
- inspect_ai/_view/www/src/plan/SolverDetailView.tsx +3 -3
- inspect_ai/_view/www/src/samples/InlineSampleDisplay.tsx +2 -2
- inspect_ai/_view/www/src/samples/SampleDialog.tsx +3 -3
- inspect_ai/_view/www/src/samples/SampleDisplay.module.css +9 -1
- inspect_ai/_view/www/src/samples/SampleDisplay.tsx +30 -3
- inspect_ai/_view/www/src/samples/SampleSummaryView.module.css +4 -0
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +25 -4
- inspect_ai/_view/www/src/samples/SamplesTools.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +3 -19
- inspect_ai/_view/www/src/samples/chat/ChatMessageRenderer.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatMessageRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatView.tsx +2 -1
- inspect_ai/_view/www/src/samples/chat/ChatViewVirtualList.tsx +22 -7
- inspect_ai/_view/www/src/samples/chat/MessageContent.tsx +35 -6
- inspect_ai/_view/www/src/samples/chat/MessageContents.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/messages.ts +15 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolCallView.tsx +13 -4
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.tsx +18 -19
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.module.css +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/ToolOutput.tsx +4 -3
- inspect_ai/_view/www/src/samples/chat/tools/ToolTitle.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +2 -3
- inspect_ai/_view/www/src/samples/error/SampleErrorView.tsx +3 -2
- inspect_ai/_view/www/src/samples/list/SampleFooter.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleHeader.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleList.tsx +57 -45
- inspect_ai/_view/www/src/samples/list/SampleRow.tsx +2 -1
- inspect_ai/_view/www/src/samples/list/SampleSeparator.tsx +2 -1
- inspect_ai/_view/www/src/samples/sample-tools/EpochFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/sample-tools/SelectScorer.tsx +4 -3
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +2 -5
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +2 -2
- inspect_ai/_view/www/src/samples/scores/SampleScoreView.tsx +2 -1
- inspect_ai/_view/www/src/samples/scores/SampleScores.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/ApprovalEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/ErrorEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/InputEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.module.css +4 -0
- inspect_ai/_view/www/src/samples/transcript/LoggerEventView.tsx +12 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +1 -1
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +25 -28
- inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/SampleLimitEventView.tsx +9 -4
- inspect_ai/_view/www/src/samples/transcript/SampleTranscript.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.module.css +32 -0
- inspect_ai/_view/www/src/samples/transcript/SandboxEventView.tsx +153 -0
- inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -5
- inspect_ai/_view/www/src/samples/transcript/SubtaskEventView.tsx +18 -14
- inspect_ai/_view/www/src/samples/transcript/ToolEventView.tsx +5 -5
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.tsx +53 -16
- inspect_ai/_view/www/src/samples/transcript/event/EventNav.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventNavs.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/event/EventPanel.tsx +6 -3
- inspect_ai/_view/www/src/samples/transcript/event/EventRow.tsx +3 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventSection.tsx +2 -2
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.module.css +28 -0
- inspect_ai/_view/www/src/samples/transcript/event/EventTimingPanel.tsx +115 -0
- inspect_ai/_view/www/src/samples/transcript/event/utils.ts +29 -0
- inspect_ai/_view/www/src/samples/transcript/state/StateDiffView.tsx +2 -1
- inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +3 -3
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +11 -8
- inspect_ai/_view/www/src/samples/transcript/types.ts +3 -1
- inspect_ai/_view/www/src/types/log.d.ts +312 -137
- inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +6 -10
- inspect_ai/_view/www/src/usage/ModelUsagePanel.module.css +4 -0
- inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +32 -9
- inspect_ai/_view/www/src/usage/TokenTable.tsx +4 -6
- inspect_ai/_view/www/src/usage/UsageCard.tsx +2 -1
- inspect_ai/_view/www/src/utils/format.ts +8 -5
- inspect_ai/_view/www/src/utils/json.ts +24 -0
- inspect_ai/_view/www/src/workspace/WorkSpace.tsx +6 -5
- inspect_ai/_view/www/src/workspace/WorkSpaceView.tsx +18 -8
- inspect_ai/_view/www/src/workspace/error/TaskErrorPanel.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/Navbar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +3 -3
- inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +4 -3
- inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +5 -4
- inspect_ai/_view/www/src/workspace/navbar/StatusPanel.tsx +5 -8
- inspect_ai/_view/www/src/workspace/sidebar/EvalStatus.tsx +5 -4
- inspect_ai/_view/www/src/workspace/sidebar/LogDirectoryTitleView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/Sidebar.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -2
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +2 -1
- inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -2
- inspect_ai/_view/www/src/workspace/tabs/JsonTab.tsx +2 -5
- inspect_ai/_view/www/src/workspace/tabs/SamplesTab.tsx +12 -11
- inspect_ai/_view/www/yarn.lock +241 -5
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_condense.py +4 -0
- inspect_ai/log/_log.py +72 -12
- inspect_ai/log/_recorders/eval.py +6 -1
- inspect_ai/log/_samples.py +5 -1
- inspect_ai/log/_transcript.py +89 -2
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +8 -1
- inspect_ai/model/_chat_message.py +22 -7
- inspect_ai/model/_conversation.py +11 -9
- inspect_ai/model/_generate_config.py +25 -4
- inspect_ai/model/_model.py +164 -72
- inspect_ai/model/_model_call.py +10 -3
- inspect_ai/model/_model_output.py +3 -0
- inspect_ai/model/_openai.py +106 -40
- inspect_ai/model/_providers/anthropic.py +145 -26
- inspect_ai/model/_providers/bedrock.py +7 -0
- inspect_ai/model/_providers/cloudflare.py +20 -7
- inspect_ai/model/_providers/google.py +29 -8
- inspect_ai/model/_providers/groq.py +66 -27
- inspect_ai/model/_providers/hf.py +6 -0
- inspect_ai/model/_providers/mistral.py +78 -51
- inspect_ai/model/_providers/openai.py +66 -4
- inspect_ai/model/_providers/openai_o1.py +10 -0
- inspect_ai/model/_providers/providers.py +2 -2
- inspect_ai/model/_providers/util/tracker.py +92 -0
- inspect_ai/model/_providers/vllm.py +13 -5
- inspect_ai/model/_reasoning.py +15 -2
- inspect_ai/scorer/_model.py +23 -19
- inspect_ai/solver/_basic_agent.py +1 -3
- inspect_ai/solver/_bridge/patch.py +0 -2
- inspect_ai/solver/_human_agent/agent.py +14 -10
- inspect_ai/solver/_human_agent/commands/__init__.py +7 -3
- inspect_ai/solver/_human_agent/commands/submit.py +76 -30
- inspect_ai/solver/_limit.py +4 -4
- inspect_ai/solver/_plan.py +0 -3
- inspect_ai/solver/_task_state.py +7 -0
- inspect_ai/tool/__init__.py +2 -0
- inspect_ai/tool/_tool.py +3 -1
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +8 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +24 -0
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +25 -0
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +5 -6
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +10 -11
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +71 -0
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +323 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +5 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +279 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +9 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +293 -0
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +94 -0
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +2 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +50 -0
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +31 -359
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +280 -0
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +65 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +146 -0
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +64 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +180 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +15 -9
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +15 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +44 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +39 -0
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +198 -48
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +26 -25
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +178 -39
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +38 -19
- inspect_ai/tool/_tools/_web_search.py +3 -3
- inspect_ai/util/__init__.py +2 -1
- inspect_ai/util/_concurrency.py +14 -8
- inspect_ai/util/_display.py +12 -0
- inspect_ai/util/_sandbox/context.py +15 -0
- inspect_ai/util/_sandbox/docker/docker.py +7 -5
- inspect_ai/util/_sandbox/environment.py +32 -1
- inspect_ai/util/_sandbox/events.py +183 -0
- inspect_ai/util/_sandbox/local.py +3 -3
- inspect_ai/util/_sandbox/self_check.py +131 -43
- inspect_ai/util/_subtask.py +11 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/METADATA +3 -3
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/RECORD +233 -211
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/WHEEL +1 -1
- inspect_ai/_view/www/src/components/VirtualList.module.css +0 -19
- inspect_ai/_view/www/src/components/VirtualList.tsx +0 -292
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_node.py +0 -312
- inspect_ai/tool/_tools/_web_browser/_resources/dm_env_servicer.py +0 -275
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.png +0 -0
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_node.py +0 -176
- inspect_ai/tool/_tools/_web_browser/_resources/test_dm_env_servicer.py +0 -135
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_environment.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/web_environment.py +0 -184
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/LICENSE +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.69.dist-info → inspect_ai-0.3.71.dist-info}/top_level.txt +0 -0
@@ -39,6 +39,7 @@ export type FailOnError = boolean | number | null;
|
|
39
39
|
export type MessageLimit = number | null;
|
40
40
|
export type TokenLimit = number | null;
|
41
41
|
export type TimeLimit = number | null;
|
42
|
+
export type WorkingLimit = number | null;
|
42
43
|
export type MaxSamples = number | null;
|
43
44
|
export type MaxTasks = number | null;
|
44
45
|
export type MaxSubprocesses = number | null;
|
@@ -52,7 +53,30 @@ export type Type = "git";
|
|
52
53
|
export type Origin = string;
|
53
54
|
export type Commit = string;
|
54
55
|
export type Metadata = {} | null;
|
56
|
+
export type Scorers = EvalScorer[] | null;
|
55
57
|
export type Name2 = string;
|
58
|
+
export type Options = {} | null;
|
59
|
+
export type Metrics =
|
60
|
+
| (
|
61
|
+
| EvalMetricDefinition
|
62
|
+
| {
|
63
|
+
[k: string]: EvalMetricDefinition[];
|
64
|
+
}
|
65
|
+
)[]
|
66
|
+
| {
|
67
|
+
[k: string]: EvalMetricDefinition[];
|
68
|
+
}
|
69
|
+
| null;
|
70
|
+
export type Name3 = string;
|
71
|
+
export type Options1 = {} | null;
|
72
|
+
export type Metadata1 = {} | null;
|
73
|
+
export type Metrics1 =
|
74
|
+
| EvalMetricDefinition[]
|
75
|
+
| {
|
76
|
+
[k: string]: EvalMetricDefinition[];
|
77
|
+
}
|
78
|
+
| null;
|
79
|
+
export type Name4 = string;
|
56
80
|
export type Solver1 = string;
|
57
81
|
export type Steps = EvalPlanStep[];
|
58
82
|
export type MaxRetries = number | null;
|
@@ -79,18 +103,19 @@ export type InternalTools = boolean | null;
|
|
79
103
|
export type MaxToolOutput = number | null;
|
80
104
|
export type CachePrompt = "auto" | boolean | null;
|
81
105
|
export type ReasoningEffort = ("low" | "medium" | "high") | null;
|
82
|
-
export type
|
106
|
+
export type ReasoningTokens = number | null;
|
107
|
+
export type ReasoningHistory = ("none" | "all" | "last" | "auto") | null;
|
83
108
|
export type TotalSamples = number;
|
84
109
|
export type CompletedSamples = number;
|
85
|
-
export type
|
110
|
+
export type Name5 = string;
|
86
111
|
export type Scorer = string;
|
87
112
|
export type Reducer = string | null;
|
88
|
-
export type
|
113
|
+
export type Name6 = string;
|
89
114
|
export type Value = number;
|
90
|
-
export type Metadata1 = {} | null;
|
91
115
|
export type Metadata2 = {} | null;
|
92
|
-
export type Scores = EvalScore[];
|
93
116
|
export type Metadata3 = {} | null;
|
117
|
+
export type Scores = EvalScore[];
|
118
|
+
export type Metadata4 = {} | null;
|
94
119
|
export type StartedAt = string;
|
95
120
|
export type CompletedAt = string;
|
96
121
|
export type InputTokens = number;
|
@@ -98,6 +123,7 @@ export type OutputTokens = number;
|
|
98
123
|
export type TotalTokens = number;
|
99
124
|
export type InputTokensCacheWrite = number | null;
|
100
125
|
export type InputTokensCacheRead = number | null;
|
126
|
+
export type ReasoningTokens1 = number | null;
|
101
127
|
export type Message = string;
|
102
128
|
export type Traceback = string;
|
103
129
|
export type TracebackAnsi = string;
|
@@ -115,47 +141,74 @@ export type Input =
|
|
115
141
|
export type Role = "system";
|
116
142
|
export type Content =
|
117
143
|
| string
|
118
|
-
| (
|
144
|
+
| (
|
145
|
+
| ContentText
|
146
|
+
| ContentReasoning
|
147
|
+
| ContentImage
|
148
|
+
| ContentAudio
|
149
|
+
| ContentVideo
|
150
|
+
)[];
|
119
151
|
export type Type1 = "text";
|
120
152
|
export type Text = string;
|
121
|
-
export type Type2 = "
|
153
|
+
export type Type2 = "reasoning";
|
154
|
+
export type Reasoning = string;
|
155
|
+
export type Signature = string | null;
|
156
|
+
export type Redacted = boolean;
|
157
|
+
export type Type3 = "image";
|
122
158
|
export type Image = string;
|
123
159
|
export type Detail = "auto" | "low" | "high";
|
124
|
-
export type
|
160
|
+
export type Type4 = "audio";
|
125
161
|
export type Audio = string;
|
126
162
|
export type Format = "wav" | "mp3";
|
127
|
-
export type
|
163
|
+
export type Type5 = "video";
|
128
164
|
export type Video = string;
|
129
165
|
export type Format1 = "mp4" | "mpeg" | "mov";
|
130
166
|
export type Source = ("input" | "generate") | null;
|
131
167
|
export type Role1 = "user";
|
132
168
|
export type Content1 =
|
133
169
|
| string
|
134
|
-
| (
|
170
|
+
| (
|
171
|
+
| ContentText
|
172
|
+
| ContentReasoning
|
173
|
+
| ContentImage
|
174
|
+
| ContentAudio
|
175
|
+
| ContentVideo
|
176
|
+
)[];
|
135
177
|
export type Source1 = ("input" | "generate") | null;
|
136
178
|
export type ToolCallId = string[] | null;
|
137
179
|
export type Role2 = "assistant";
|
138
180
|
export type Content2 =
|
139
181
|
| string
|
140
|
-
| (
|
182
|
+
| (
|
183
|
+
| ContentText
|
184
|
+
| ContentReasoning
|
185
|
+
| ContentImage
|
186
|
+
| ContentAudio
|
187
|
+
| ContentVideo
|
188
|
+
)[];
|
141
189
|
export type Source2 = ("input" | "generate") | null;
|
142
190
|
export type ToolCalls = ToolCall[] | null;
|
143
191
|
export type Id1 = string;
|
144
192
|
export type Function = string;
|
145
|
-
export type
|
193
|
+
export type Type6 = "function";
|
146
194
|
export type ParseError = string | null;
|
147
195
|
export type Title = string | null;
|
148
196
|
export type Format2 = "text" | "markdown";
|
149
197
|
export type Content3 = string;
|
150
|
-
export type Reasoning = string | null;
|
151
198
|
export type Role3 = "tool";
|
152
199
|
export type Content4 =
|
153
200
|
| string
|
154
|
-
| (
|
201
|
+
| (
|
202
|
+
| ContentText
|
203
|
+
| ContentReasoning
|
204
|
+
| ContentImage
|
205
|
+
| ContentAudio
|
206
|
+
| ContentVideo
|
207
|
+
)[];
|
155
208
|
export type Source3 = ("input" | "generate") | null;
|
156
209
|
export type ToolCallId1 = string | null;
|
157
210
|
export type Function1 = string | null;
|
158
|
-
export type
|
211
|
+
export type Type7 =
|
159
212
|
| "parsing"
|
160
213
|
| "timeout"
|
161
214
|
| "unicode_decode"
|
@@ -194,7 +247,7 @@ export type Bytes1 = number[] | null;
|
|
194
247
|
export type Content5 = Logprob[];
|
195
248
|
export type Choices1 = ChatCompletionChoice[];
|
196
249
|
export type Time = number | null;
|
197
|
-
export type
|
250
|
+
export type Metadata5 = {} | null;
|
198
251
|
export type Error = string | null;
|
199
252
|
export type Scores1 = {
|
200
253
|
[k: string]: Score;
|
@@ -209,8 +262,9 @@ export type Value1 =
|
|
209
262
|
};
|
210
263
|
export type Answer = string | null;
|
211
264
|
export type Explanation = string | null;
|
212
|
-
export type
|
265
|
+
export type Metadata6 = {} | null;
|
213
266
|
export type Timestamp = string;
|
267
|
+
export type WorkingStart = number;
|
214
268
|
export type Pending = boolean | null;
|
215
269
|
export type Event = "sample_init";
|
216
270
|
export type Input1 =
|
@@ -224,43 +278,67 @@ export type Input1 =
|
|
224
278
|
export type Choices2 = string[] | null;
|
225
279
|
export type Target1 = string | string[];
|
226
280
|
export type Id2 = number | string | null;
|
227
|
-
export type
|
281
|
+
export type Metadata8 = {} | null;
|
228
282
|
export type Files1 = {
|
229
283
|
[k: string]: string;
|
230
284
|
} | null;
|
231
285
|
export type Setup1 = string | null;
|
232
286
|
export type JsonValue = unknown;
|
233
287
|
export type Timestamp1 = string;
|
288
|
+
export type WorkingStart1 = number;
|
234
289
|
export type Pending1 = boolean | null;
|
235
290
|
export type Event1 = "sample_limit";
|
236
|
-
export type
|
291
|
+
export type Type8 =
|
292
|
+
| "message"
|
293
|
+
| "time"
|
294
|
+
| "working"
|
295
|
+
| "token"
|
296
|
+
| "operator"
|
297
|
+
| "custom";
|
237
298
|
export type Message2 = string;
|
238
299
|
export type Limit1 = number | null;
|
239
300
|
export type Timestamp2 = string;
|
301
|
+
export type WorkingStart2 = number;
|
240
302
|
export type Pending2 = boolean | null;
|
241
|
-
export type Event2 = "
|
303
|
+
export type Event2 = "sandbox";
|
304
|
+
export type Action = "exec" | "read_file" | "write_file";
|
305
|
+
export type Cmd = string | null;
|
306
|
+
export type Options2 = {
|
307
|
+
[k: string]: JsonValue;
|
308
|
+
} | null;
|
309
|
+
export type File = string | null;
|
310
|
+
export type Input2 = string | null;
|
311
|
+
export type Result = number | null;
|
312
|
+
export type Output = string | null;
|
313
|
+
export type Completed = string | null;
|
314
|
+
export type Timestamp3 = string;
|
315
|
+
export type WorkingStart3 = number;
|
316
|
+
export type Pending3 = boolean | null;
|
317
|
+
export type Event3 = "state";
|
242
318
|
export type Op = "remove" | "add" | "replace" | "move" | "test" | "copy";
|
243
319
|
export type Path = string;
|
244
320
|
export type From = string | null;
|
245
321
|
export type Changes = JsonChange[];
|
246
|
-
export type Timestamp3 = string;
|
247
|
-
export type Pending3 = boolean | null;
|
248
|
-
export type Event3 = "store";
|
249
|
-
export type Changes1 = JsonChange[];
|
250
322
|
export type Timestamp4 = string;
|
323
|
+
export type WorkingStart4 = number;
|
251
324
|
export type Pending4 = boolean | null;
|
252
|
-
export type Event4 = "
|
325
|
+
export type Event4 = "store";
|
326
|
+
export type Changes1 = JsonChange[];
|
327
|
+
export type Timestamp5 = string;
|
328
|
+
export type WorkingStart5 = number;
|
329
|
+
export type Pending5 = boolean | null;
|
330
|
+
export type Event5 = "model";
|
253
331
|
export type Model2 = string;
|
254
|
-
export type
|
332
|
+
export type Input3 = (
|
255
333
|
| ChatMessageSystem
|
256
334
|
| ChatMessageUser
|
257
335
|
| ChatMessageAssistant
|
258
336
|
| ChatMessageTool
|
259
337
|
)[];
|
260
|
-
export type
|
338
|
+
export type Name7 = string;
|
261
339
|
export type Description = string;
|
262
|
-
export type
|
263
|
-
export type
|
340
|
+
export type Type9 = "object";
|
341
|
+
export type Type10 =
|
264
342
|
| ("string" | "integer" | "number" | "boolean" | "array" | "object" | "null")
|
265
343
|
| null;
|
266
344
|
export type Description1 = string | null;
|
@@ -275,28 +353,40 @@ export type Required1 = string[];
|
|
275
353
|
export type Additionalproperties1 = boolean;
|
276
354
|
export type Tools1 = ToolInfo[];
|
277
355
|
export type ToolChoice = ("auto" | "any" | "none") | ToolFunction;
|
278
|
-
export type
|
356
|
+
export type Name8 = string;
|
279
357
|
export type Error1 = string | null;
|
280
358
|
export type Cache = ("read" | "write") | null;
|
281
|
-
export type
|
282
|
-
export type
|
283
|
-
export type
|
284
|
-
export type
|
359
|
+
export type Time1 = number | null;
|
360
|
+
export type Completed1 = string | null;
|
361
|
+
export type WorkingTime = number | null;
|
362
|
+
export type Timestamp6 = string;
|
363
|
+
export type WorkingStart6 = number;
|
364
|
+
export type Pending6 = boolean | null;
|
365
|
+
export type Event6 = "tool";
|
366
|
+
export type Type11 = "function";
|
285
367
|
export type Id3 = string;
|
286
368
|
export type Function2 = string;
|
287
|
-
export type
|
369
|
+
export type Result1 =
|
288
370
|
| string
|
289
371
|
| number
|
290
372
|
| boolean
|
291
373
|
| ContentText
|
374
|
+
| ContentReasoning
|
292
375
|
| ContentImage
|
293
376
|
| ContentAudio
|
294
377
|
| ContentVideo
|
295
|
-
| (
|
378
|
+
| (
|
379
|
+
| ContentText
|
380
|
+
| ContentReasoning
|
381
|
+
| ContentImage
|
382
|
+
| ContentAudio
|
383
|
+
| ContentVideo
|
384
|
+
)[];
|
296
385
|
export type Truncated = [unknown, unknown] | null;
|
297
|
-
export type
|
298
|
-
export type
|
299
|
-
export type
|
386
|
+
export type Timestamp7 = string;
|
387
|
+
export type WorkingStart7 = number;
|
388
|
+
export type Pending7 = boolean | null;
|
389
|
+
export type Event7 = "approval";
|
300
390
|
export type Message3 = string;
|
301
391
|
export type Approver = string;
|
302
392
|
export type Decision =
|
@@ -306,23 +396,27 @@ export type Decision =
|
|
306
396
|
| "escalate"
|
307
397
|
| "terminate";
|
308
398
|
export type Explanation1 = string | null;
|
309
|
-
export type Timestamp7 = string;
|
310
|
-
export type Pending7 = boolean | null;
|
311
|
-
export type Event7 = "input";
|
312
|
-
export type Input3 = string;
|
313
|
-
export type InputAnsi = string;
|
314
399
|
export type Timestamp8 = string;
|
400
|
+
export type WorkingStart8 = number;
|
315
401
|
export type Pending8 = boolean | null;
|
316
|
-
export type Event8 = "
|
317
|
-
export type
|
318
|
-
export type
|
402
|
+
export type Event8 = "input";
|
403
|
+
export type Input4 = string;
|
404
|
+
export type InputAnsi = string;
|
319
405
|
export type Timestamp9 = string;
|
406
|
+
export type WorkingStart9 = number;
|
320
407
|
export type Pending9 = boolean | null;
|
321
|
-
export type Event9 = "
|
408
|
+
export type Event9 = "score";
|
409
|
+
export type Target2 = string | string[] | null;
|
410
|
+
export type Intermediate = boolean;
|
322
411
|
export type Timestamp10 = string;
|
412
|
+
export type WorkingStart10 = number;
|
323
413
|
export type Pending10 = boolean | null;
|
324
|
-
export type Event10 = "
|
325
|
-
export type
|
414
|
+
export type Event10 = "error";
|
415
|
+
export type Timestamp11 = string;
|
416
|
+
export type WorkingStart11 = number;
|
417
|
+
export type Pending11 = boolean | null;
|
418
|
+
export type Event11 = "logger";
|
419
|
+
export type Name9 = string | null;
|
326
420
|
export type Level =
|
327
421
|
| "debug"
|
328
422
|
| "trace"
|
@@ -337,24 +431,28 @@ export type Created1 = number;
|
|
337
431
|
export type Filename = string;
|
338
432
|
export type Module = string;
|
339
433
|
export type Lineno = number;
|
340
|
-
export type Timestamp11 = string;
|
341
|
-
export type Pending11 = boolean | null;
|
342
|
-
export type Event11 = "info";
|
343
|
-
export type Source4 = string | null;
|
344
434
|
export type Timestamp12 = string;
|
435
|
+
export type WorkingStart12 = number;
|
345
436
|
export type Pending12 = boolean | null;
|
346
|
-
export type Event12 = "
|
347
|
-
export type
|
348
|
-
export type Type11 = string | null;
|
349
|
-
export type Name8 = string;
|
437
|
+
export type Event12 = "info";
|
438
|
+
export type Source4 = string | null;
|
350
439
|
export type Timestamp13 = string;
|
440
|
+
export type WorkingStart13 = number;
|
351
441
|
export type Pending13 = boolean | null;
|
352
|
-
export type Event13 = "
|
353
|
-
export type
|
442
|
+
export type Event13 = "step";
|
443
|
+
export type Action1 = "begin" | "end";
|
354
444
|
export type Type12 = string | null;
|
445
|
+
export type Name10 = string;
|
446
|
+
export type Timestamp14 = string;
|
447
|
+
export type WorkingStart14 = number;
|
448
|
+
export type Pending14 = boolean | null;
|
449
|
+
export type Event14 = "subtask";
|
450
|
+
export type Name11 = string;
|
451
|
+
export type Type13 = string | null;
|
355
452
|
export type Events2 = (
|
356
453
|
| SampleInitEvent
|
357
454
|
| SampleLimitEvent
|
455
|
+
| SandboxEvent
|
358
456
|
| StateEvent
|
359
457
|
| StoreEvent
|
360
458
|
| ModelEvent
|
@@ -368,9 +466,12 @@ export type Events2 = (
|
|
368
466
|
| StepEvent
|
369
467
|
| SubtaskEvent
|
370
468
|
)[];
|
469
|
+
export type Completed2 = string | null;
|
470
|
+
export type WorkingTime1 = number | null;
|
371
471
|
export type Events1 = (
|
372
472
|
| SampleInitEvent
|
373
473
|
| SampleLimitEvent
|
474
|
+
| SandboxEvent
|
374
475
|
| StateEvent
|
375
476
|
| StoreEvent
|
376
477
|
| ModelEvent
|
@@ -384,9 +485,12 @@ export type Events1 = (
|
|
384
485
|
| StepEvent
|
385
486
|
| SubtaskEvent
|
386
487
|
)[];
|
488
|
+
export type Completed3 = string | null;
|
489
|
+
export type WorkingTime2 = number | null;
|
387
490
|
export type Events = (
|
388
491
|
| SampleInitEvent
|
389
492
|
| SampleLimitEvent
|
493
|
+
| SandboxEvent
|
390
494
|
| StateEvent
|
391
495
|
| StoreEvent
|
392
496
|
| ModelEvent
|
@@ -400,9 +504,13 @@ export type Events = (
|
|
400
504
|
| StepEvent
|
401
505
|
| SubtaskEvent
|
402
506
|
)[];
|
403
|
-
export type
|
507
|
+
export type TotalTime = number | null;
|
508
|
+
export type WorkingTime3 = number | null;
|
509
|
+
export type Uuid = string | null;
|
510
|
+
export type Type14 =
|
404
511
|
| "context"
|
405
512
|
| "time"
|
513
|
+
| "working"
|
406
514
|
| "message"
|
407
515
|
| "token"
|
408
516
|
| "operator"
|
@@ -421,7 +529,7 @@ export type Value2 =
|
|
421
529
|
};
|
422
530
|
export type Answer1 = string | null;
|
423
531
|
export type Explanation2 = string | null;
|
424
|
-
export type
|
532
|
+
export type Metadata9 = {} | null;
|
425
533
|
export type SampleId1 = string | number | null;
|
426
534
|
export type Samples2 = EvalSampleScore[];
|
427
535
|
export type Location1 = string;
|
@@ -465,6 +573,8 @@ export interface EvalSpec {
|
|
465
573
|
revision: EvalRevision | null;
|
466
574
|
packages: Packages;
|
467
575
|
metadata: Metadata;
|
576
|
+
scorers: Scorers;
|
577
|
+
metrics: Metrics1;
|
468
578
|
}
|
469
579
|
export interface TaskAttribs {}
|
470
580
|
export interface TaskArgs {}
|
@@ -492,6 +602,7 @@ export interface EvalConfig {
|
|
492
602
|
message_limit: MessageLimit;
|
493
603
|
token_limit: TokenLimit;
|
494
604
|
time_limit: TimeLimit;
|
605
|
+
working_limit: WorkingLimit;
|
495
606
|
max_samples: MaxSamples;
|
496
607
|
max_tasks: MaxTasks;
|
497
608
|
max_subprocesses: MaxSubprocesses;
|
@@ -538,11 +649,21 @@ export interface EvalRevision {
|
|
538
649
|
export interface Packages {
|
539
650
|
[k: string]: string;
|
540
651
|
}
|
652
|
+
export interface EvalScorer {
|
653
|
+
name: Name2;
|
654
|
+
options: Options;
|
655
|
+
metrics: Metrics;
|
656
|
+
metadata: Metadata1;
|
657
|
+
}
|
658
|
+
export interface EvalMetricDefinition {
|
659
|
+
name: Name3;
|
660
|
+
options: Options1;
|
661
|
+
}
|
541
662
|
/**
|
542
663
|
* Plan (solvers) used in evaluation.
|
543
664
|
*/
|
544
665
|
export interface EvalPlan {
|
545
|
-
name:
|
666
|
+
name: Name4;
|
546
667
|
steps: Steps;
|
547
668
|
finish: EvalPlanStep | null;
|
548
669
|
config: GenerateConfig;
|
@@ -581,6 +702,7 @@ export interface GenerateConfig {
|
|
581
702
|
max_tool_output: MaxToolOutput;
|
582
703
|
cache_prompt: CachePrompt;
|
583
704
|
reasoning_effort: ReasoningEffort;
|
705
|
+
reasoning_tokens: ReasoningTokens;
|
584
706
|
reasoning_history: ReasoningHistory;
|
585
707
|
}
|
586
708
|
/**
|
@@ -590,31 +712,31 @@ export interface EvalResults {
|
|
590
712
|
total_samples: TotalSamples;
|
591
713
|
completed_samples: CompletedSamples;
|
592
714
|
scores: Scores;
|
593
|
-
metadata:
|
715
|
+
metadata: Metadata4;
|
594
716
|
}
|
595
717
|
/**
|
596
718
|
* Score for evaluation task.
|
597
719
|
*/
|
598
720
|
export interface EvalScore {
|
599
|
-
name:
|
721
|
+
name: Name5;
|
600
722
|
scorer: Scorer;
|
601
723
|
reducer: Reducer;
|
602
724
|
params: Params2;
|
603
|
-
metrics:
|
604
|
-
metadata:
|
725
|
+
metrics: Metrics2;
|
726
|
+
metadata: Metadata3;
|
605
727
|
}
|
606
728
|
export interface Params2 {}
|
607
|
-
export interface
|
729
|
+
export interface Metrics2 {
|
608
730
|
[k: string]: EvalMetric;
|
609
731
|
}
|
610
732
|
/**
|
611
733
|
* Metric for evaluation score.
|
612
734
|
*/
|
613
735
|
export interface EvalMetric {
|
614
|
-
name:
|
736
|
+
name: Name6;
|
615
737
|
value: Value;
|
616
738
|
params: Params3;
|
617
|
-
metadata:
|
739
|
+
metadata: Metadata2;
|
618
740
|
}
|
619
741
|
export interface Params3 {}
|
620
742
|
/**
|
@@ -637,6 +759,7 @@ export interface ModelUsage1 {
|
|
637
759
|
total_tokens: TotalTokens;
|
638
760
|
input_tokens_cache_write: InputTokensCacheWrite;
|
639
761
|
input_tokens_cache_read: InputTokensCacheRead;
|
762
|
+
reasoning_tokens: ReasoningTokens1;
|
640
763
|
}
|
641
764
|
/**
|
642
765
|
* Eval error details.
|
@@ -661,10 +784,13 @@ export interface EvalSample {
|
|
661
784
|
messages: Messages;
|
662
785
|
output: ModelOutput;
|
663
786
|
scores: Scores1;
|
664
|
-
metadata:
|
787
|
+
metadata: Metadata7;
|
665
788
|
store: Store;
|
666
789
|
events: Events;
|
667
790
|
model_usage: ModelUsage2;
|
791
|
+
total_time: TotalTime;
|
792
|
+
working_time: WorkingTime3;
|
793
|
+
uuid: Uuid;
|
668
794
|
error: EvalError | null;
|
669
795
|
attachments: Attachments;
|
670
796
|
limit: EvalSampleLimit | null;
|
@@ -684,11 +810,22 @@ export interface ContentText {
|
|
684
810
|
type: Type1;
|
685
811
|
text: Text;
|
686
812
|
}
|
813
|
+
/**
|
814
|
+
* Reasoning content.
|
815
|
+
*
|
816
|
+
* See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
|
817
|
+
*/
|
818
|
+
export interface ContentReasoning {
|
819
|
+
type: Type2;
|
820
|
+
reasoning: Reasoning;
|
821
|
+
signature: Signature;
|
822
|
+
redacted: Redacted;
|
823
|
+
}
|
687
824
|
/**
|
688
825
|
* Image content.
|
689
826
|
*/
|
690
827
|
export interface ContentImage {
|
691
|
-
type:
|
828
|
+
type: Type3;
|
692
829
|
image: Image;
|
693
830
|
detail: Detail;
|
694
831
|
}
|
@@ -696,7 +833,7 @@ export interface ContentImage {
|
|
696
833
|
* Audio content.
|
697
834
|
*/
|
698
835
|
export interface ContentAudio {
|
699
|
-
type:
|
836
|
+
type: Type4;
|
700
837
|
audio: Audio;
|
701
838
|
format: Format;
|
702
839
|
}
|
@@ -704,7 +841,7 @@ export interface ContentAudio {
|
|
704
841
|
* Video content.
|
705
842
|
*/
|
706
843
|
export interface ContentVideo {
|
707
|
-
type:
|
844
|
+
type: Type5;
|
708
845
|
video: Video;
|
709
846
|
format: Format1;
|
710
847
|
}
|
@@ -725,13 +862,12 @@ export interface ChatMessageAssistant {
|
|
725
862
|
content: Content2;
|
726
863
|
source: Source2;
|
727
864
|
tool_calls: ToolCalls;
|
728
|
-
reasoning: Reasoning;
|
729
865
|
}
|
730
866
|
export interface ToolCall {
|
731
867
|
id: Id1;
|
732
868
|
function: Function;
|
733
869
|
arguments: Arguments;
|
734
|
-
type:
|
870
|
+
type: Type6;
|
735
871
|
parse_error: ParseError;
|
736
872
|
view: ToolCallContent | null;
|
737
873
|
}
|
@@ -756,7 +892,7 @@ export interface ChatMessageTool {
|
|
756
892
|
error: ToolCallError | null;
|
757
893
|
}
|
758
894
|
export interface ToolCallError {
|
759
|
-
type:
|
895
|
+
type: Type7;
|
760
896
|
message: Message1;
|
761
897
|
}
|
762
898
|
/**
|
@@ -767,7 +903,7 @@ export interface ModelOutput {
|
|
767
903
|
choices: Choices1;
|
768
904
|
usage: ModelUsage1 | null;
|
769
905
|
time: Time;
|
770
|
-
metadata:
|
906
|
+
metadata: Metadata5;
|
771
907
|
error: Error;
|
772
908
|
}
|
773
909
|
/**
|
@@ -808,15 +944,16 @@ export interface Score {
|
|
808
944
|
value: Value1;
|
809
945
|
answer: Answer;
|
810
946
|
explanation: Explanation;
|
811
|
-
metadata:
|
947
|
+
metadata: Metadata6;
|
812
948
|
}
|
813
|
-
export interface
|
949
|
+
export interface Metadata7 {}
|
814
950
|
export interface Store {}
|
815
951
|
/**
|
816
952
|
* Beginning of processing a Sample.
|
817
953
|
*/
|
818
954
|
export interface SampleInitEvent {
|
819
955
|
timestamp: Timestamp;
|
956
|
+
working_start: WorkingStart;
|
820
957
|
pending: Pending;
|
821
958
|
event: Event;
|
822
959
|
sample: Sample;
|
@@ -830,7 +967,7 @@ export interface Sample {
|
|
830
967
|
choices: Choices2;
|
831
968
|
target: Target1;
|
832
969
|
id: Id2;
|
833
|
-
metadata:
|
970
|
+
metadata: Metadata8;
|
834
971
|
sandbox: SandboxEnvironmentSpec | null;
|
835
972
|
files: Files1;
|
836
973
|
setup: Setup1;
|
@@ -840,19 +977,38 @@ export interface Sample {
|
|
840
977
|
*/
|
841
978
|
export interface SampleLimitEvent {
|
842
979
|
timestamp: Timestamp1;
|
980
|
+
working_start: WorkingStart1;
|
843
981
|
pending: Pending1;
|
844
982
|
event: Event1;
|
845
|
-
type:
|
983
|
+
type: Type8;
|
846
984
|
message: Message2;
|
847
985
|
limit: Limit1;
|
848
986
|
}
|
849
987
|
/**
|
850
|
-
*
|
988
|
+
* Sandbox execution or I/O
|
851
989
|
*/
|
852
|
-
export interface
|
990
|
+
export interface SandboxEvent {
|
853
991
|
timestamp: Timestamp2;
|
992
|
+
working_start: WorkingStart2;
|
854
993
|
pending: Pending2;
|
855
994
|
event: Event2;
|
995
|
+
action: Action;
|
996
|
+
cmd: Cmd;
|
997
|
+
options: Options2;
|
998
|
+
file: File;
|
999
|
+
input: Input2;
|
1000
|
+
result: Result;
|
1001
|
+
output: Output;
|
1002
|
+
completed: Completed;
|
1003
|
+
}
|
1004
|
+
/**
|
1005
|
+
* Change to the current `TaskState`
|
1006
|
+
*/
|
1007
|
+
export interface StateEvent {
|
1008
|
+
timestamp: Timestamp3;
|
1009
|
+
working_start: WorkingStart3;
|
1010
|
+
pending: Pending3;
|
1011
|
+
event: Event3;
|
856
1012
|
changes: Changes;
|
857
1013
|
}
|
858
1014
|
/**
|
@@ -873,20 +1029,22 @@ export interface JsonChange {
|
|
873
1029
|
* Change to data within the current `Store`.
|
874
1030
|
*/
|
875
1031
|
export interface StoreEvent {
|
876
|
-
timestamp:
|
877
|
-
|
878
|
-
|
1032
|
+
timestamp: Timestamp4;
|
1033
|
+
working_start: WorkingStart4;
|
1034
|
+
pending: Pending4;
|
1035
|
+
event: Event4;
|
879
1036
|
changes: Changes1;
|
880
1037
|
}
|
881
1038
|
/**
|
882
1039
|
* Call to a language model.
|
883
1040
|
*/
|
884
1041
|
export interface ModelEvent {
|
885
|
-
timestamp:
|
886
|
-
|
887
|
-
|
1042
|
+
timestamp: Timestamp5;
|
1043
|
+
working_start: WorkingStart5;
|
1044
|
+
pending: Pending5;
|
1045
|
+
event: Event5;
|
888
1046
|
model: Model2;
|
889
|
-
input:
|
1047
|
+
input: Input3;
|
890
1048
|
tools: Tools1;
|
891
1049
|
tool_choice: ToolChoice;
|
892
1050
|
config: GenerateConfig1;
|
@@ -894,6 +1052,8 @@ export interface ModelEvent {
|
|
894
1052
|
error: Error1;
|
895
1053
|
cache: Cache;
|
896
1054
|
call: ModelCall | null;
|
1055
|
+
completed: Completed1;
|
1056
|
+
working_time: WorkingTime;
|
897
1057
|
}
|
898
1058
|
/**
|
899
1059
|
* Specification of a tool (JSON Schema compatible)
|
@@ -922,7 +1082,7 @@ export interface ModelEvent {
|
|
922
1082
|
* ```
|
923
1083
|
*/
|
924
1084
|
export interface ToolInfo {
|
925
|
-
name:
|
1085
|
+
name: Name7;
|
926
1086
|
description: Description;
|
927
1087
|
parameters: ToolParams;
|
928
1088
|
}
|
@@ -930,7 +1090,7 @@ export interface ToolInfo {
|
|
930
1090
|
* Description of tool parameters object in JSON Schema format.
|
931
1091
|
*/
|
932
1092
|
export interface ToolParams {
|
933
|
-
type:
|
1093
|
+
type: Type9;
|
934
1094
|
properties: Properties;
|
935
1095
|
required: Required1;
|
936
1096
|
additionalProperties: Additionalproperties1;
|
@@ -942,7 +1102,7 @@ export interface Properties {
|
|
942
1102
|
* Description of tool parameter in JSON Schema format.
|
943
1103
|
*/
|
944
1104
|
export interface ToolParam {
|
945
|
-
type:
|
1105
|
+
type: Type10;
|
946
1106
|
description: Description1;
|
947
1107
|
default: Default;
|
948
1108
|
enum: Enum;
|
@@ -956,7 +1116,7 @@ export interface Default {
|
|
956
1116
|
[k: string]: unknown;
|
957
1117
|
}
|
958
1118
|
export interface ToolFunction {
|
959
|
-
name:
|
1119
|
+
name: Name8;
|
960
1120
|
}
|
961
1121
|
/**
|
962
1122
|
* Model generation options.
|
@@ -984,6 +1144,7 @@ export interface GenerateConfig1 {
|
|
984
1144
|
max_tool_output: MaxToolOutput;
|
985
1145
|
cache_prompt: CachePrompt;
|
986
1146
|
reasoning_effort: ReasoningEffort;
|
1147
|
+
reasoning_tokens: ReasoningTokens;
|
987
1148
|
reasoning_history: ReasoningHistory;
|
988
1149
|
}
|
989
1150
|
/**
|
@@ -992,6 +1153,7 @@ export interface GenerateConfig1 {
|
|
992
1153
|
export interface ModelCall {
|
993
1154
|
request: Request;
|
994
1155
|
response: Response;
|
1156
|
+
time: Time1;
|
995
1157
|
}
|
996
1158
|
export interface Request {
|
997
1159
|
[k: string]: JsonValue;
|
@@ -1003,18 +1165,21 @@ export interface Response {
|
|
1003
1165
|
* Call to a tool.
|
1004
1166
|
*/
|
1005
1167
|
export interface ToolEvent {
|
1006
|
-
timestamp:
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1168
|
+
timestamp: Timestamp6;
|
1169
|
+
working_start: WorkingStart6;
|
1170
|
+
pending: Pending6;
|
1171
|
+
event: Event6;
|
1172
|
+
type: Type11;
|
1010
1173
|
id: Id3;
|
1011
1174
|
function: Function2;
|
1012
1175
|
arguments: Arguments1;
|
1013
1176
|
view: ToolCallContent | null;
|
1014
|
-
result:
|
1177
|
+
result: Result1;
|
1015
1178
|
truncated: Truncated;
|
1016
1179
|
error: ToolCallError | null;
|
1017
1180
|
events: Events1;
|
1181
|
+
completed: Completed3;
|
1182
|
+
working_time: WorkingTime2;
|
1018
1183
|
}
|
1019
1184
|
export interface Arguments1 {
|
1020
1185
|
[k: string]: JsonValue;
|
@@ -1023,9 +1188,10 @@ export interface Arguments1 {
|
|
1023
1188
|
* Tool approval.
|
1024
1189
|
*/
|
1025
1190
|
export interface ApprovalEvent {
|
1026
|
-
timestamp:
|
1027
|
-
|
1028
|
-
|
1191
|
+
timestamp: Timestamp7;
|
1192
|
+
working_start: WorkingStart7;
|
1193
|
+
pending: Pending7;
|
1194
|
+
event: Event7;
|
1029
1195
|
message: Message3;
|
1030
1196
|
call: ToolCall;
|
1031
1197
|
view: ToolCallView | null;
|
@@ -1048,10 +1214,11 @@ export interface ToolCallView {
|
|
1048
1214
|
* Input screen interaction.
|
1049
1215
|
*/
|
1050
1216
|
export interface InputEvent {
|
1051
|
-
timestamp:
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1217
|
+
timestamp: Timestamp8;
|
1218
|
+
working_start: WorkingStart8;
|
1219
|
+
pending: Pending8;
|
1220
|
+
event: Event8;
|
1221
|
+
input: Input4;
|
1055
1222
|
input_ansi: InputAnsi;
|
1056
1223
|
}
|
1057
1224
|
/**
|
@@ -1061,9 +1228,10 @@ export interface InputEvent {
|
|
1061
1228
|
* resulting from a call to `score`.
|
1062
1229
|
*/
|
1063
1230
|
export interface ScoreEvent {
|
1064
|
-
timestamp:
|
1065
|
-
|
1066
|
-
|
1231
|
+
timestamp: Timestamp9;
|
1232
|
+
working_start: WorkingStart9;
|
1233
|
+
pending: Pending9;
|
1234
|
+
event: Event9;
|
1067
1235
|
score: Score;
|
1068
1236
|
target: Target2;
|
1069
1237
|
intermediate: Intermediate;
|
@@ -1072,25 +1240,27 @@ export interface ScoreEvent {
|
|
1072
1240
|
* Event with sample error.
|
1073
1241
|
*/
|
1074
1242
|
export interface ErrorEvent {
|
1075
|
-
timestamp:
|
1076
|
-
|
1077
|
-
|
1243
|
+
timestamp: Timestamp10;
|
1244
|
+
working_start: WorkingStart10;
|
1245
|
+
pending: Pending10;
|
1246
|
+
event: Event10;
|
1078
1247
|
error: EvalError;
|
1079
1248
|
}
|
1080
1249
|
/**
|
1081
1250
|
* Log message recorded with Python logger.
|
1082
1251
|
*/
|
1083
1252
|
export interface LoggerEvent {
|
1084
|
-
timestamp:
|
1085
|
-
|
1086
|
-
|
1253
|
+
timestamp: Timestamp11;
|
1254
|
+
working_start: WorkingStart11;
|
1255
|
+
pending: Pending11;
|
1256
|
+
event: Event11;
|
1087
1257
|
message: LoggingMessage;
|
1088
1258
|
}
|
1089
1259
|
/**
|
1090
1260
|
* Message written to Python log.
|
1091
1261
|
*/
|
1092
1262
|
export interface LoggingMessage {
|
1093
|
-
name:
|
1263
|
+
name: Name9;
|
1094
1264
|
level: Level;
|
1095
1265
|
message: Message4;
|
1096
1266
|
created: Created1;
|
@@ -1102,9 +1272,10 @@ export interface LoggingMessage {
|
|
1102
1272
|
* Event with custom info/data.
|
1103
1273
|
*/
|
1104
1274
|
export interface InfoEvent {
|
1105
|
-
timestamp:
|
1106
|
-
|
1107
|
-
|
1275
|
+
timestamp: Timestamp12;
|
1276
|
+
working_start: WorkingStart12;
|
1277
|
+
pending: Pending12;
|
1278
|
+
event: Event12;
|
1108
1279
|
source: Source4;
|
1109
1280
|
data: JsonValue;
|
1110
1281
|
}
|
@@ -1112,28 +1283,32 @@ export interface InfoEvent {
|
|
1112
1283
|
* Step within current sample or subtask.
|
1113
1284
|
*/
|
1114
1285
|
export interface StepEvent {
|
1115
|
-
timestamp:
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1286
|
+
timestamp: Timestamp13;
|
1287
|
+
working_start: WorkingStart13;
|
1288
|
+
pending: Pending13;
|
1289
|
+
event: Event13;
|
1290
|
+
action: Action1;
|
1291
|
+
type: Type12;
|
1292
|
+
name: Name10;
|
1121
1293
|
}
|
1122
1294
|
/**
|
1123
1295
|
* Subtask spawned.
|
1124
1296
|
*/
|
1125
1297
|
export interface SubtaskEvent {
|
1126
|
-
timestamp:
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1298
|
+
timestamp: Timestamp14;
|
1299
|
+
working_start: WorkingStart14;
|
1300
|
+
pending: Pending14;
|
1301
|
+
event: Event14;
|
1302
|
+
name: Name11;
|
1303
|
+
type: Type13;
|
1304
|
+
input: Input5;
|
1305
|
+
result: Result2;
|
1133
1306
|
events: Events2;
|
1307
|
+
completed: Completed2;
|
1308
|
+
working_time: WorkingTime1;
|
1134
1309
|
}
|
1135
|
-
export interface
|
1136
|
-
export interface
|
1310
|
+
export interface Input5 {}
|
1311
|
+
export interface Result2 {
|
1137
1312
|
[k: string]: unknown;
|
1138
1313
|
}
|
1139
1314
|
export interface ModelUsage2 {
|
@@ -1146,7 +1321,7 @@ export interface Attachments {
|
|
1146
1321
|
* Limit encontered by sample.
|
1147
1322
|
*/
|
1148
1323
|
export interface EvalSampleLimit {
|
1149
|
-
type:
|
1324
|
+
type: Type14;
|
1150
1325
|
limit: Limit2;
|
1151
1326
|
}
|
1152
1327
|
/**
|
@@ -1164,6 +1339,6 @@ export interface EvalSampleScore {
|
|
1164
1339
|
value: Value2;
|
1165
1340
|
answer: Answer1;
|
1166
1341
|
explanation: Explanation2;
|
1167
|
-
metadata:
|
1342
|
+
metadata: Metadata9;
|
1168
1343
|
sample_id: SampleId1;
|
1169
1344
|
}
|