inspect-ai 0.3.91__py3-none-any.whl → 0.3.93__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +31 -0
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +13 -20
- inspect_ai/_util/local_server.py +368 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +159 -146
- inspect_ai/_view/www/dist/assets/index.js +1020 -1061
- inspect_ai/_view/www/log-schema.json +4 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +3 -2
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +20 -12
- inspect_ai/agent/_as_tool.py +15 -3
- inspect_ai/agent/_handoff.py +8 -1
- inspect_ai/agent/_run.py +11 -3
- inspect_ai/log/__init__.py +4 -0
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +0 -8
- inspect_ai/log/_transcript.py +7 -1
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +32 -12
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +21 -48
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_openai_responses.py +13 -1
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +241 -0
- inspect_ai/model/_providers/vllm.py +207 -400
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +2 -0
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +12 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/RECORD +90 -109
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/WHEEL +1 -1
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.91.dist-info → inspect_ai-0.3.93.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,10 @@
|
|
1
1
|
import { EvalPlan, EvalScore, EvalSpec, Params2 } from "../../@types/log";
|
2
|
-
import { toTitleCase } from "../../utils/format";
|
3
|
-
import { ghCommitUrl } from "../../utils/git";
|
4
|
-
import { MetaDataView } from "../content/MetaDataView";
|
5
2
|
import { DatasetDetailView } from "./DatasetDetailView";
|
6
3
|
import { ScorerDetailView } from "./ScorerDetailView";
|
7
4
|
import { SolversDetailView } from "./SolverDetailView";
|
8
5
|
|
9
6
|
import clsx from "clsx";
|
10
7
|
import { FC, ReactNode } from "react";
|
11
|
-
import { kModelNone } from "../../constants";
|
12
8
|
import styles from "./PlanDetailView.module.css";
|
13
9
|
|
14
10
|
interface PlanDetailViewProps {
|
@@ -26,71 +22,7 @@ export const PlanDetailView: FC<PlanDetailViewProps> = ({
|
|
26
22
|
return null;
|
27
23
|
}
|
28
24
|
|
29
|
-
// Add configuration
|
30
|
-
const config: Record<string, unknown> = {};
|
31
|
-
Object.entries(evaluation?.config || {}).forEach((entry) => {
|
32
|
-
const key = entry[0];
|
33
|
-
const value = entry[1];
|
34
|
-
config[key] = value;
|
35
|
-
});
|
36
|
-
|
37
25
|
const steps = plan?.steps;
|
38
|
-
const metadata = evaluation?.metadata;
|
39
|
-
const revision = evaluation?.revision;
|
40
|
-
const packages = evaluation?.packages;
|
41
|
-
const model_args = evaluation?.model_args;
|
42
|
-
const task_args = evaluation?.task_args;
|
43
|
-
const generate_config = plan?.config;
|
44
|
-
|
45
|
-
const taskInformation: Record<string, unknown> = {
|
46
|
-
["Task ID"]: evaluation?.task_id,
|
47
|
-
["Run ID"]: evaluation?.run_id,
|
48
|
-
};
|
49
|
-
if (revision) {
|
50
|
-
taskInformation[
|
51
|
-
`${revision.type ? `${toTitleCase(revision.type)} ` : ""}Revision`
|
52
|
-
] = {
|
53
|
-
_html: (
|
54
|
-
<a href={ghCommitUrl(revision.origin, revision.commit)}>
|
55
|
-
{revision.commit}
|
56
|
-
</a>
|
57
|
-
),
|
58
|
-
};
|
59
|
-
}
|
60
|
-
if (packages) {
|
61
|
-
const names = Object.keys(packages).map((key) => {
|
62
|
-
return `${key} ${packages[key]}`;
|
63
|
-
});
|
64
|
-
|
65
|
-
if (names.length === 1) {
|
66
|
-
taskInformation["Inspect"] = names[0];
|
67
|
-
} else {
|
68
|
-
taskInformation["Inspect"] = names;
|
69
|
-
}
|
70
|
-
}
|
71
|
-
if (evaluation.tags) {
|
72
|
-
taskInformation["Tags"] = evaluation.tags.join(", ");
|
73
|
-
}
|
74
|
-
|
75
|
-
if (evaluation?.model && evaluation.model !== kModelNone) {
|
76
|
-
config["model"] = evaluation.model;
|
77
|
-
}
|
78
|
-
|
79
|
-
if (evaluation?.model_base_url) {
|
80
|
-
config["model_base_url"] = evaluation.model_base_url;
|
81
|
-
}
|
82
|
-
|
83
|
-
if (evaluation?.sandbox) {
|
84
|
-
if (Array.isArray(evaluation?.sandbox)) {
|
85
|
-
config["sandbox"] = evaluation.sandbox[0];
|
86
|
-
if (evaluation.sandbox[1]) {
|
87
|
-
config["sandbox_config"] = evaluation.sandbox[1];
|
88
|
-
}
|
89
|
-
} else {
|
90
|
-
config["sandbox"] = evaluation?.sandbox.type;
|
91
|
-
config["sandbox_config"] = evaluation?.sandbox.config;
|
92
|
-
}
|
93
|
-
}
|
94
26
|
|
95
27
|
const taskColumns: {
|
96
28
|
title: string;
|
@@ -148,117 +80,12 @@ export const PlanDetailView: FC<PlanDetailViewProps> = ({
|
|
148
80
|
}
|
149
81
|
}
|
150
82
|
|
151
|
-
// Compute the column style for the remaining (either 1 or 2 columns wide)
|
152
|
-
const metadataColumns: {
|
153
|
-
title: string;
|
154
|
-
className: string;
|
155
|
-
contents: ReactNode;
|
156
|
-
}[] = [];
|
157
|
-
const cols = colCount(
|
158
|
-
metadataColumns,
|
159
|
-
task_args,
|
160
|
-
model_args,
|
161
|
-
config,
|
162
|
-
metadata,
|
163
|
-
);
|
164
|
-
|
165
|
-
metadataColumns.push({
|
166
|
-
title: "Task Information",
|
167
|
-
className: cols === 1 ? styles.oneCol : styles.twoCol,
|
168
|
-
contents: (
|
169
|
-
<MetaDataView
|
170
|
-
key={`plan-md-task`}
|
171
|
-
className={"text-size-small"}
|
172
|
-
entries={taskInformation}
|
173
|
-
tableOptions="sm"
|
174
|
-
/>
|
175
|
-
),
|
176
|
-
});
|
177
|
-
|
178
|
-
if (task_args && Object.keys(task_args).length > 0) {
|
179
|
-
metadataColumns.push({
|
180
|
-
title: "Task Args",
|
181
|
-
className: cols === 1 ? styles.oneCol : styles.twoCol,
|
182
|
-
contents: (
|
183
|
-
<MetaDataView
|
184
|
-
key={`plan-md-task-args`}
|
185
|
-
className={"text-size-small"}
|
186
|
-
entries={task_args as Record<string, unknown>}
|
187
|
-
tableOptions="sm"
|
188
|
-
/>
|
189
|
-
),
|
190
|
-
});
|
191
|
-
}
|
192
|
-
if (model_args && Object.keys(model_args).length > 0) {
|
193
|
-
metadataColumns.push({
|
194
|
-
title: "Model Args",
|
195
|
-
className: cols === 1 ? styles.oneCol : styles.twoCol,
|
196
|
-
contents: (
|
197
|
-
<MetaDataView
|
198
|
-
key={`plan-md-model-args`}
|
199
|
-
className={"text-size-small"}
|
200
|
-
entries={model_args as Record<string, unknown>}
|
201
|
-
tableOptions="sm"
|
202
|
-
/>
|
203
|
-
),
|
204
|
-
});
|
205
|
-
}
|
206
|
-
|
207
|
-
if (config && Object.keys(config).length > 0) {
|
208
|
-
metadataColumns.push({
|
209
|
-
title: "Configuration",
|
210
|
-
className: cols === 1 ? styles.oneCol : styles.twoCol,
|
211
|
-
contents: (
|
212
|
-
<MetaDataView
|
213
|
-
key={`plan-md-config`}
|
214
|
-
className={"text-size-small"}
|
215
|
-
entries={config}
|
216
|
-
tableOptions="sm"
|
217
|
-
/>
|
218
|
-
),
|
219
|
-
});
|
220
|
-
}
|
221
|
-
|
222
|
-
if (generate_config && Object.keys(generate_config).length > 0) {
|
223
|
-
const generate_record: Record<string, unknown> = Object.fromEntries(
|
224
|
-
Object.entries(generate_config),
|
225
|
-
);
|
226
|
-
|
227
|
-
metadataColumns.push({
|
228
|
-
title: "Generate Config",
|
229
|
-
className: cols === 1 ? styles.oneCol : styles.twoCol,
|
230
|
-
contents: (
|
231
|
-
<MetaDataView
|
232
|
-
key={`plan-md-generate-config`}
|
233
|
-
className={"text-size-small"}
|
234
|
-
entries={generate_record}
|
235
|
-
tableOptions="sm"
|
236
|
-
/>
|
237
|
-
),
|
238
|
-
});
|
239
|
-
}
|
240
|
-
|
241
|
-
if (metadata && Object.keys(metadata).length > 0) {
|
242
|
-
metadataColumns.push({
|
243
|
-
title: "Metadata",
|
244
|
-
className: cols === 1 ? styles.oneCol : styles.twoCol,
|
245
|
-
contents: (
|
246
|
-
<MetaDataView
|
247
|
-
key={`plan-md-metadata`}
|
248
|
-
className={"text-size-small"}
|
249
|
-
entries={metadata}
|
250
|
-
tableOptions="sm"
|
251
|
-
/>
|
252
|
-
),
|
253
|
-
});
|
254
|
-
}
|
255
|
-
|
256
83
|
return (
|
257
84
|
<div className={styles.container}>
|
258
85
|
<div
|
259
86
|
className={styles.grid}
|
260
87
|
style={{
|
261
|
-
gridTemplateColumns: `repeat(${taskColumns.length},
|
88
|
+
gridTemplateColumns: `repeat(${taskColumns.length}, fit-content(50%))`,
|
262
89
|
}}
|
263
90
|
>
|
264
91
|
{taskColumns.map((col) => {
|
@@ -273,34 +100,10 @@ export const PlanDetailView: FC<PlanDetailViewProps> = ({
|
|
273
100
|
);
|
274
101
|
})}
|
275
102
|
</div>
|
276
|
-
|
277
|
-
<div className={clsx(styles.row)}>
|
278
|
-
{metadataColumns.map((col) => {
|
279
|
-
return (
|
280
|
-
<PlanColumn
|
281
|
-
title={col.title}
|
282
|
-
className={col.className}
|
283
|
-
key={`plan-col-${col.title}`}
|
284
|
-
>
|
285
|
-
{col.contents}
|
286
|
-
</PlanColumn>
|
287
|
-
);
|
288
|
-
})}
|
289
|
-
</div>
|
290
103
|
</div>
|
291
104
|
);
|
292
105
|
};
|
293
106
|
|
294
|
-
const colCount = (...other: unknown[]) => {
|
295
|
-
let count = 0;
|
296
|
-
for (const o in other) {
|
297
|
-
if (o && Object.keys(o).length > 0) {
|
298
|
-
count++;
|
299
|
-
}
|
300
|
-
}
|
301
|
-
return count;
|
302
|
-
};
|
303
|
-
|
304
107
|
interface PlanColumnProps {
|
305
108
|
title: string;
|
306
109
|
className: string | string[];
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import { Value2 } from "../../../../@types/log";
|
2
2
|
import { kScoreTypeNumeric } from "../../../../constants";
|
3
3
|
import { formatDecimalNoTrailingZeroes } from "../../../../utils/format";
|
4
|
+
import { compareWithNan } from "../../../../utils/numeric";
|
4
5
|
import { ScoreDescriptor } from "../types";
|
5
6
|
|
6
7
|
export const numericScoreDescriptor = (values: Value2[]): ScoreDescriptor => {
|
@@ -14,7 +15,7 @@ export const numericScoreDescriptor = (values: Value2[]): ScoreDescriptor => {
|
|
14
15
|
max: Math.max(...onlyNumeric),
|
15
16
|
compare: (a, b) => {
|
16
17
|
if (typeof a.value === "number" && typeof b.value === "number") {
|
17
|
-
return a.value
|
18
|
+
return compareWithNan(a.value, b.value);
|
18
19
|
} else {
|
19
20
|
console.warn("Comparing non-numerics using a numeric score descriptor");
|
20
21
|
return 0;
|
@@ -6,6 +6,7 @@ import styles from "./ModelUsagePanel.module.css";
|
|
6
6
|
|
7
7
|
interface ModelUsageProps {
|
8
8
|
usage: ModelUsage1;
|
9
|
+
className?: string | string[];
|
9
10
|
}
|
10
11
|
|
11
12
|
interface ModelUsageRow {
|
@@ -19,7 +20,7 @@ interface ModelUsageRow {
|
|
19
20
|
/**
|
20
21
|
* Renders the ModelUsagePanel component.
|
21
22
|
*/
|
22
|
-
export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
|
23
|
+
export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage, className }) => {
|
23
24
|
if (!usage) {
|
24
25
|
return null;
|
25
26
|
}
|
@@ -84,7 +85,7 @@ export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
|
|
84
85
|
});
|
85
86
|
|
86
87
|
return (
|
87
|
-
<div className={clsx("text-size-small", styles.wrapper)}>
|
88
|
+
<div className={clsx("text-size-small", styles.wrapper, className)}>
|
88
89
|
{rows.map((row, idx) => {
|
89
90
|
if (row.label === "---") {
|
90
91
|
return (
|
@@ -79,10 +79,10 @@ export const TokenRow: FC<TokenRowProps> = ({ model, usage }) => {
|
|
79
79
|
return (
|
80
80
|
<tr>
|
81
81
|
<td>
|
82
|
-
<div className={styles.model}>{model}</div>
|
82
|
+
<div className={clsx(styles.model, styles.cellContents)}>{model}</div>
|
83
83
|
</td>
|
84
84
|
<td>
|
85
|
-
<ModelUsagePanel usage={usage} />
|
85
|
+
<ModelUsagePanel usage={usage} className={clsx(styles.cellContents)} />
|
86
86
|
</td>
|
87
87
|
</tr>
|
88
88
|
);
|
@@ -3,13 +3,18 @@
|
|
3
3
|
padding-bottom: 1em;
|
4
4
|
margin-left: 0.5em;
|
5
5
|
display: flex;
|
6
|
+
flex-wrap: wrap;
|
7
|
+
gap: 1em;
|
6
8
|
}
|
7
9
|
|
8
10
|
.col1 {
|
9
|
-
flex:
|
10
|
-
|
11
|
+
flex: 0 1 auto;
|
12
|
+
min-width: 200px;
|
13
|
+
width: fit-content;
|
11
14
|
}
|
12
15
|
|
13
16
|
.col2 {
|
14
|
-
flex: 1 1
|
17
|
+
flex: 1 1 auto;
|
18
|
+
min-width: 300px;
|
19
|
+
width: fit-content;
|
15
20
|
}
|
@@ -1,10 +1,5 @@
|
|
1
|
-
import clsx from "clsx";
|
2
1
|
import { EvalStats } from "../../@types/log";
|
3
|
-
import { FontSize } from "../../app/appearance/fonts";
|
4
|
-
import { ApplicationIcons } from "../../app/appearance/icons";
|
5
|
-
import { MetaDataView } from "../../app/content/MetaDataView";
|
6
2
|
import { Card, CardBody, CardHeader } from "../../components/Card";
|
7
|
-
import { formatDuration } from "../../utils/format";
|
8
3
|
import { ModelTokenTable } from "./ModelTokenTable";
|
9
4
|
|
10
5
|
import { FC } from "react";
|
@@ -24,40 +19,11 @@ export const UsageCard: FC<UsageCardProps> = ({ stats }) => {
|
|
24
19
|
return null;
|
25
20
|
}
|
26
21
|
|
27
|
-
const totalDuration = formatDuration(
|
28
|
-
new Date(stats.started_at),
|
29
|
-
new Date(stats.completed_at),
|
30
|
-
);
|
31
|
-
const usageMetadataStyle = {
|
32
|
-
fontSize: FontSize.smaller,
|
33
|
-
};
|
34
|
-
|
35
22
|
return (
|
36
23
|
<Card>
|
37
|
-
<CardHeader
|
24
|
+
<CardHeader label="Usage" />
|
38
25
|
<CardBody id={kUsageCardBodyId}>
|
39
26
|
<div className={styles.wrapper}>
|
40
|
-
<div className={styles.col1}>
|
41
|
-
<div
|
42
|
-
className={clsx(
|
43
|
-
"text-size-smaller",
|
44
|
-
"text-style-label",
|
45
|
-
"text-style-secondary",
|
46
|
-
)}
|
47
|
-
>
|
48
|
-
Duration
|
49
|
-
</div>
|
50
|
-
<MetaDataView
|
51
|
-
entries={{
|
52
|
-
["Start"]: new Date(stats.started_at).toLocaleString(),
|
53
|
-
["End"]: new Date(stats.completed_at).toLocaleString(),
|
54
|
-
["Duration"]: totalDuration,
|
55
|
-
}}
|
56
|
-
tableOptions="borderless,sm"
|
57
|
-
style={usageMetadataStyle}
|
58
|
-
/>
|
59
|
-
</div>
|
60
|
-
|
61
27
|
<div className={styles.col2}>
|
62
28
|
<ModelTokenTable model_usage={stats.model_usage} />
|
63
29
|
</div>
|
@@ -5,6 +5,8 @@ export const kModelNone = "none/none";
|
|
5
5
|
export const kLogViewSamplesTabId = "samples";
|
6
6
|
export const kLogViewJsonTabId = "json";
|
7
7
|
export const kLogViewInfoTabId = "info";
|
8
|
+
export const kLogViewModelsTabId = "models";
|
9
|
+
export const kLogViewTaskTabId = "task";
|
8
10
|
|
9
11
|
// Sample tab constants
|
10
12
|
export const kSampleMessagesTabId = `messages`;
|
@@ -0,0 +1,17 @@
|
|
1
|
+
export function compareWithNan(a: number, b: number): number {
|
2
|
+
const aIsNaN = Number.isNaN(a);
|
3
|
+
const bIsNaN = Number.isNaN(b);
|
4
|
+
|
5
|
+
if (aIsNaN && bIsNaN) {
|
6
|
+
return 0;
|
7
|
+
}
|
8
|
+
|
9
|
+
if (aIsNaN) {
|
10
|
+
return 1;
|
11
|
+
}
|
12
|
+
if (bIsNaN) {
|
13
|
+
return -1;
|
14
|
+
}
|
15
|
+
|
16
|
+
return a - b;
|
17
|
+
}
|
inspect_ai/agent/_agent.py
CHANGED
@@ -27,13 +27,14 @@ from inspect_ai.model._chat_message import (
|
|
27
27
|
ChatMessageAssistant,
|
28
28
|
)
|
29
29
|
from inspect_ai.model._model_output import ChatCompletionChoice, ModelOutput
|
30
|
+
from inspect_ai.util._limited_conversation import ChatMessageList
|
30
31
|
|
31
32
|
|
32
33
|
class AgentState:
|
33
34
|
"""Agent state."""
|
34
35
|
|
35
36
|
def __init__(self, *, messages: list[ChatMessage]) -> None:
|
36
|
-
self._messages = messages
|
37
|
+
self._messages: list[ChatMessage] = ChatMessageList(messages)
|
37
38
|
self._output: ModelOutput | None = None
|
38
39
|
|
39
40
|
@property
|
@@ -43,8 +44,7 @@ class AgentState:
|
|
43
44
|
|
44
45
|
@messages.setter
|
45
46
|
def messages(self, messages: list[ChatMessage]) -> None:
|
46
|
-
|
47
|
-
self._messages = messages
|
47
|
+
self._messages = ChatMessageList(messages)
|
48
48
|
|
49
49
|
@property
|
50
50
|
def output(self) -> ModelOutput:
|
inspect_ai/agent/_as_solver.py
CHANGED
@@ -2,6 +2,8 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from typing import TYPE_CHECKING, Any
|
4
4
|
|
5
|
+
from inspect_ai.util._limit import Limit, apply_limits
|
6
|
+
|
5
7
|
if TYPE_CHECKING:
|
6
8
|
from inspect_ai.solver._solver import Solver
|
7
9
|
|
@@ -14,7 +16,7 @@ from inspect_ai.tool._tool_info import parse_tool_info
|
|
14
16
|
from ._agent import Agent, AgentState
|
15
17
|
|
16
18
|
|
17
|
-
def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
|
19
|
+
def as_solver(agent: Agent, limits: list[Limit] = [], **agent_kwargs: Any) -> Solver:
|
18
20
|
"""Convert an agent to a solver.
|
19
21
|
|
20
22
|
Note that agents used as solvers will only receive their first parameter
|
@@ -23,6 +25,8 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
|
|
23
25
|
|
24
26
|
Args:
|
25
27
|
agent: Agent to convert.
|
28
|
+
limits: List of limits to apply to the agent. Should a limit
|
29
|
+
be exceeded, the Sample ends and proceeds to scoring.
|
26
30
|
**agent_kwargs: Arguments to curry to Agent function (required
|
27
31
|
if the agent has parameters without default values).
|
28
32
|
|
@@ -52,17 +56,21 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
|
|
52
56
|
@solver(name=agent_name)
|
53
57
|
def agent_to_solver() -> Solver:
|
54
58
|
async def solve(state: TaskState, generate: Generate) -> TaskState:
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
state.
|
59
|
+
agent_state = AgentState(messages=state.messages)
|
60
|
+
|
61
|
+
try:
|
62
|
+
# run the agent with limits
|
63
|
+
with apply_limits(limits):
|
64
|
+
agent_state = await agent(agent_state, **agent_kwargs)
|
65
|
+
# if an exception occurs, we still want to update the TaskState with the
|
66
|
+
# AgentState's messages + output so that it appears in the log and is scored
|
67
|
+
finally:
|
68
|
+
# update messages
|
69
|
+
state.messages = agent_state.messages
|
70
|
+
|
71
|
+
# update output if its not empty
|
72
|
+
if agent_state.output:
|
73
|
+
state.output = agent_state.output
|
66
74
|
|
67
75
|
return state
|
68
76
|
|
inspect_ai/agent/_as_tool.py
CHANGED
@@ -10,12 +10,18 @@ from inspect_ai.tool._tool import Tool, ToolResult, tool
|
|
10
10
|
from inspect_ai.tool._tool_def import ToolDef, validate_tool_parameters
|
11
11
|
from inspect_ai.tool._tool_info import ToolInfo, parse_tool_info
|
12
12
|
from inspect_ai.tool._tool_params import ToolParam
|
13
|
+
from inspect_ai.util._limit import Limit, apply_limits
|
13
14
|
|
14
15
|
from ._agent import AGENT_DESCRIPTION, Agent, AgentState
|
15
16
|
|
16
17
|
|
17
18
|
@tool
|
18
|
-
def as_tool(
|
19
|
+
def as_tool(
|
20
|
+
agent: Agent,
|
21
|
+
description: str | None = None,
|
22
|
+
limits: list[Limit] = [],
|
23
|
+
**agent_kwargs: Any,
|
24
|
+
) -> Tool:
|
19
25
|
"""Convert an agent to a tool.
|
20
26
|
|
21
27
|
By default the model will see all of the agent's arguments as
|
@@ -27,6 +33,9 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
|
|
27
33
|
Args:
|
28
34
|
agent: Agent to convert.
|
29
35
|
description: Tool description (defaults to agent description)
|
36
|
+
limits: List of limits to apply to the agent. Should a limit
|
37
|
+
be exceeded, the tool call ends and returns an error
|
38
|
+
explaining that a limit was exceeded.
|
30
39
|
**agent_kwargs: Arguments to curry to Agent function (arguments
|
31
40
|
provided here will not be presented to the model as part
|
32
41
|
of the tool interface).
|
@@ -41,9 +50,12 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
|
|
41
50
|
)
|
42
51
|
|
43
52
|
async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
|
44
|
-
# prepare state
|
53
|
+
# prepare state
|
45
54
|
state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
|
46
|
-
|
55
|
+
|
56
|
+
# run the agent with limits
|
57
|
+
with apply_limits(limits):
|
58
|
+
state = await agent(state, *args, **(agent_kwargs | kwargs))
|
47
59
|
|
48
60
|
# find assistant message to read content from (prefer output)
|
49
61
|
if not state.output.empty:
|
inspect_ai/agent/_handoff.py
CHANGED
@@ -9,6 +9,7 @@ from inspect_ai._util.registry import (
|
|
9
9
|
from inspect_ai.tool._tool import Tool, ToolResult, ToolSource
|
10
10
|
from inspect_ai.tool._tool_def import ToolDef
|
11
11
|
from inspect_ai.tool._tool_description import ToolDescription, set_tool_description
|
12
|
+
from inspect_ai.util._limit import Limit
|
12
13
|
|
13
14
|
from ._agent import Agent
|
14
15
|
from ._as_tool import agent_tool_info
|
@@ -21,6 +22,7 @@ def handoff(
|
|
21
22
|
input_filter: MessageFilter | None = None,
|
22
23
|
output_filter: MessageFilter | None = None,
|
23
24
|
tool_name: str | None = None,
|
25
|
+
limits: list[Limit] = [],
|
24
26
|
**agent_kwargs: Any,
|
25
27
|
) -> Tool:
|
26
28
|
"""Create a tool that enables models to handoff to agents.
|
@@ -35,6 +37,9 @@ def handoff(
|
|
35
37
|
Use the built-in `last_message` filter to return only the last message
|
36
38
|
or alternatively specify a custom `MessageFilter` function.
|
37
39
|
tool_name: Alternate tool name (defaults to `transfer_to_{agent_name}`)
|
40
|
+
limits: List of limits to apply to the agent. Should a limit be exceeded,
|
41
|
+
the agent stops and a user message is appended explaining that a limit was
|
42
|
+
exceeded.
|
38
43
|
**agent_kwargs: Arguments to curry to `Agent` function (arguments provided here
|
39
44
|
will not be presented to the model as part of the tool interface).
|
40
45
|
|
@@ -52,7 +57,7 @@ def handoff(
|
|
52
57
|
tool_info = agent_tool_info(agent, description, **agent_kwargs)
|
53
58
|
|
54
59
|
# AgentTool calls will be intercepted by execute_tools
|
55
|
-
agent_tool = AgentTool(agent, input_filter, output_filter, **agent_kwargs)
|
60
|
+
agent_tool = AgentTool(agent, input_filter, output_filter, limits, **agent_kwargs)
|
56
61
|
tool_name = tool_name or f"transfer_to_{tool_info.name}"
|
57
62
|
set_registry_info(agent_tool, RegistryInfo(type="tool", name=tool_name))
|
58
63
|
set_tool_description(
|
@@ -72,11 +77,13 @@ class AgentTool(Tool):
|
|
72
77
|
agent: Agent,
|
73
78
|
input_filter: MessageFilter | None = None,
|
74
79
|
output_filter: MessageFilter | None = None,
|
80
|
+
limits: list[Limit] = [],
|
75
81
|
**kwargs: Any,
|
76
82
|
):
|
77
83
|
self.agent = agent
|
78
84
|
self.input_filter = input_filter
|
79
85
|
self.output_filter = output_filter
|
86
|
+
self.limits = limits
|
80
87
|
self.kwargs = kwargs
|
81
88
|
|
82
89
|
@property
|
inspect_ai/agent/_run.py
CHANGED
@@ -2,12 +2,16 @@ from copy import copy
|
|
2
2
|
from typing import Any
|
3
3
|
|
4
4
|
from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
|
5
|
+
from inspect_ai.util._limit import Limit, apply_limits
|
5
6
|
|
6
7
|
from ._agent import Agent, AgentState
|
7
8
|
|
8
9
|
|
9
10
|
async def run(
|
10
|
-
agent: Agent,
|
11
|
+
agent: Agent,
|
12
|
+
input: str | list[ChatMessage] | AgentState,
|
13
|
+
limits: list[Limit] = [],
|
14
|
+
**agent_kwargs: Any,
|
11
15
|
) -> AgentState:
|
12
16
|
"""Run an agent.
|
13
17
|
|
@@ -17,6 +21,9 @@ async def run(
|
|
17
21
|
Args:
|
18
22
|
agent: Agent to run.
|
19
23
|
input: Agent input (string, list of messages, or an `AgentState`).
|
24
|
+
limits: List of limits to apply to the agent. Should a limit be
|
25
|
+
exceeded, a LimitExceededError is raised which the caller may
|
26
|
+
handle as appropriate.
|
20
27
|
**agent_kwargs: Additional arguments to pass to agent.
|
21
28
|
|
22
29
|
Returns:
|
@@ -43,5 +50,6 @@ async def run(
|
|
43
50
|
# create state
|
44
51
|
state = AgentState(messages=input_messages)
|
45
52
|
|
46
|
-
# run the agent
|
47
|
-
|
53
|
+
# run the agent with limits
|
54
|
+
with apply_limits(limits):
|
55
|
+
return await agent(state, **agent_kwargs)
|
inspect_ai/log/__init__.py
CHANGED
@@ -9,6 +9,7 @@ from ._file import (
|
|
9
9
|
read_eval_log,
|
10
10
|
read_eval_log_async,
|
11
11
|
read_eval_log_sample,
|
12
|
+
read_eval_log_sample_summaries,
|
12
13
|
read_eval_log_samples,
|
13
14
|
write_eval_log,
|
14
15
|
write_eval_log_async,
|
@@ -28,6 +29,7 @@ from ._log import (
|
|
28
29
|
EvalSampleLimit,
|
29
30
|
EvalSampleReductions,
|
30
31
|
EvalSampleScore,
|
32
|
+
EvalSampleSummary,
|
31
33
|
EvalScore,
|
32
34
|
EvalSpec,
|
33
35
|
EvalStats,
|
@@ -70,6 +72,7 @@ __all__ = [
|
|
70
72
|
"EvalSampleLimit",
|
71
73
|
"EvalSampleScore",
|
72
74
|
"EvalSampleReductions",
|
75
|
+
"EvalSampleSummary",
|
73
76
|
"EvalScore",
|
74
77
|
"EvalSpec",
|
75
78
|
"EvalStats",
|
@@ -100,6 +103,7 @@ __all__ = [
|
|
100
103
|
"read_eval_log_async",
|
101
104
|
"read_eval_log_sample",
|
102
105
|
"read_eval_log_samples",
|
106
|
+
"read_eval_log_sample_summaries",
|
103
107
|
"condense_sample",
|
104
108
|
"resolve_sample_attachments",
|
105
109
|
"write_eval_log",
|