inspect-ai 0.3.88__py3-none-any.whl → 0.3.90__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +16 -0
- inspect_ai/_cli/score.py +1 -12
- inspect_ai/_cli/util.py +4 -2
- inspect_ai/_display/core/footer.py +2 -2
- inspect_ai/_display/plain/display.py +2 -2
- inspect_ai/_eval/context.py +7 -1
- inspect_ai/_eval/eval.py +51 -27
- inspect_ai/_eval/evalset.py +27 -10
- inspect_ai/_eval/loader.py +7 -8
- inspect_ai/_eval/run.py +23 -31
- inspect_ai/_eval/score.py +18 -1
- inspect_ai/_eval/task/log.py +5 -13
- inspect_ai/_eval/task/resolved.py +1 -0
- inspect_ai/_eval/task/run.py +231 -256
- inspect_ai/_eval/task/task.py +25 -2
- inspect_ai/_eval/task/util.py +1 -8
- inspect_ai/_util/constants.py +1 -0
- inspect_ai/_util/json.py +8 -3
- inspect_ai/_util/registry.py +30 -13
- inspect_ai/_view/www/App.css +5 -0
- inspect_ai/_view/www/dist/assets/index.css +71 -36
- inspect_ai/_view/www/dist/assets/index.js +573 -475
- inspect_ai/_view/www/log-schema.json +66 -0
- inspect_ai/_view/www/src/metadata/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/metadata/MetaDataView.tsx +13 -8
- inspect_ai/_view/www/src/metadata/RenderedContent.tsx +3 -0
- inspect_ai/_view/www/src/plan/ModelCard.module.css +16 -0
- inspect_ai/_view/www/src/plan/ModelCard.tsx +93 -0
- inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +2 -2
- inspect_ai/_view/www/src/samples/chat/tools/ToolInput.module.css +2 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.tsx +5 -1
- inspect_ai/_view/www/src/samples/transcript/StepEventView.tsx +12 -6
- inspect_ai/_view/www/src/samples/transcript/TranscriptView.module.css +0 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +6 -29
- inspect_ai/_view/www/src/types/log.d.ts +24 -6
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.module.css +16 -0
- inspect_ai/_view/www/src/workspace/navbar/ModelRolesView.tsx +43 -0
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.module.css +1 -1
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +5 -0
- inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +2 -0
- inspect_ai/agent/_agent.py +12 -0
- inspect_ai/agent/_as_tool.py +1 -1
- inspect_ai/agent/_bridge/bridge.py +9 -2
- inspect_ai/agent/_react.py +142 -74
- inspect_ai/agent/_run.py +13 -2
- inspect_ai/agent/_types.py +6 -0
- inspect_ai/approval/_apply.py +6 -7
- inspect_ai/approval/_approver.py +3 -3
- inspect_ai/approval/_auto.py +2 -2
- inspect_ai/approval/_call.py +20 -4
- inspect_ai/approval/_human/approver.py +3 -3
- inspect_ai/approval/_human/manager.py +2 -2
- inspect_ai/approval/_human/panel.py +3 -3
- inspect_ai/approval/_policy.py +3 -3
- inspect_ai/log/__init__.py +2 -0
- inspect_ai/log/_log.py +23 -2
- inspect_ai/log/_model.py +58 -0
- inspect_ai/log/_recorders/file.py +14 -3
- inspect_ai/log/_transcript.py +3 -0
- inspect_ai/model/__init__.py +2 -0
- inspect_ai/model/_call_tools.py +4 -1
- inspect_ai/model/_model.py +49 -3
- inspect_ai/model/_openai.py +151 -21
- inspect_ai/model/_providers/anthropic.py +20 -12
- inspect_ai/model/_providers/bedrock.py +3 -3
- inspect_ai/model/_providers/cloudflare.py +29 -108
- inspect_ai/model/_providers/google.py +21 -10
- inspect_ai/model/_providers/grok.py +23 -17
- inspect_ai/model/_providers/groq.py +61 -37
- inspect_ai/model/_providers/llama_cpp_python.py +8 -9
- inspect_ai/model/_providers/mistral.py +8 -3
- inspect_ai/model/_providers/ollama.py +8 -9
- inspect_ai/model/_providers/openai.py +53 -157
- inspect_ai/model/_providers/openai_compatible.py +195 -0
- inspect_ai/model/_providers/openrouter.py +4 -15
- inspect_ai/model/_providers/providers.py +11 -0
- inspect_ai/model/_providers/together.py +25 -23
- inspect_ai/model/_trim.py +83 -0
- inspect_ai/solver/_plan.py +5 -3
- inspect_ai/tool/_tool_def.py +8 -2
- inspect_ai/util/__init__.py +3 -0
- inspect_ai/util/_concurrency.py +15 -2
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/RECORD +88 -83
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/WHEEL +1 -1
- inspect_ai/_eval/task/rundir.py +0 -78
- inspect_ai/_view/www/node_modules/flatted/python/flatted.py +0 -149
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.88.dist-info → inspect_ai-0.3.90.dist-info}/top_level.txt +0 -0
@@ -1342,6 +1342,43 @@
|
|
1342
1342
|
"type": "object",
|
1343
1343
|
"additionalProperties": false
|
1344
1344
|
},
|
1345
|
+
"EvalModelConfig": {
|
1346
|
+
"description": "Model config.",
|
1347
|
+
"properties": {
|
1348
|
+
"model": {
|
1349
|
+
"title": "Model",
|
1350
|
+
"type": "string"
|
1351
|
+
},
|
1352
|
+
"config": {
|
1353
|
+
"$ref": "#/$defs/GenerateConfig"
|
1354
|
+
},
|
1355
|
+
"base_url": {
|
1356
|
+
"anyOf": [
|
1357
|
+
{
|
1358
|
+
"type": "string"
|
1359
|
+
},
|
1360
|
+
{
|
1361
|
+
"type": "null"
|
1362
|
+
}
|
1363
|
+
],
|
1364
|
+
"default": null,
|
1365
|
+
"title": "Base Url"
|
1366
|
+
},
|
1367
|
+
"args": {
|
1368
|
+
"title": "Args",
|
1369
|
+
"type": "object"
|
1370
|
+
}
|
1371
|
+
},
|
1372
|
+
"required": [
|
1373
|
+
"model",
|
1374
|
+
"config",
|
1375
|
+
"base_url",
|
1376
|
+
"args"
|
1377
|
+
],
|
1378
|
+
"title": "EvalModelConfig",
|
1379
|
+
"type": "object",
|
1380
|
+
"additionalProperties": false
|
1381
|
+
},
|
1345
1382
|
"EvalPlan": {
|
1346
1383
|
"description": "Plan (solvers) used in evaluation.",
|
1347
1384
|
"properties": {
|
@@ -2269,6 +2306,21 @@
|
|
2269
2306
|
"title": "Model Args",
|
2270
2307
|
"type": "object"
|
2271
2308
|
},
|
2309
|
+
"model_roles": {
|
2310
|
+
"anyOf": [
|
2311
|
+
{
|
2312
|
+
"additionalProperties": {
|
2313
|
+
"$ref": "#/$defs/EvalModelConfig"
|
2314
|
+
},
|
2315
|
+
"type": "object"
|
2316
|
+
},
|
2317
|
+
{
|
2318
|
+
"type": "null"
|
2319
|
+
}
|
2320
|
+
],
|
2321
|
+
"default": null,
|
2322
|
+
"title": "Model Roles"
|
2323
|
+
},
|
2272
2324
|
"config": {
|
2273
2325
|
"$ref": "#/$defs/EvalConfig"
|
2274
2326
|
},
|
@@ -2361,6 +2413,7 @@
|
|
2361
2413
|
"model_generate_config",
|
2362
2414
|
"model_base_url",
|
2363
2415
|
"model_args",
|
2416
|
+
"model_roles",
|
2364
2417
|
"config",
|
2365
2418
|
"revision",
|
2366
2419
|
"packages",
|
@@ -3310,6 +3363,18 @@
|
|
3310
3363
|
"title": "Model",
|
3311
3364
|
"type": "string"
|
3312
3365
|
},
|
3366
|
+
"role": {
|
3367
|
+
"anyOf": [
|
3368
|
+
{
|
3369
|
+
"type": "string"
|
3370
|
+
},
|
3371
|
+
{
|
3372
|
+
"type": "null"
|
3373
|
+
}
|
3374
|
+
],
|
3375
|
+
"default": null,
|
3376
|
+
"title": "Role"
|
3377
|
+
},
|
3313
3378
|
"input": {
|
3314
3379
|
"items": {
|
3315
3380
|
"anyOf": [
|
@@ -3430,6 +3495,7 @@
|
|
3430
3495
|
"pending",
|
3431
3496
|
"event",
|
3432
3497
|
"model",
|
3498
|
+
"role",
|
3433
3499
|
"input",
|
3434
3500
|
"tools",
|
3435
3501
|
"tool_choice",
|
@@ -6,7 +6,7 @@ import { RenderedContent } from "./RenderedContent";
|
|
6
6
|
interface MetadataViewProps {
|
7
7
|
id?: string;
|
8
8
|
style?: CSSProperties;
|
9
|
-
entries: Record<string, unknown>;
|
9
|
+
entries: Record<string, unknown> | Array<{ name: string; value: unknown }>;
|
10
10
|
tableOptions?: string;
|
11
11
|
compact?: boolean;
|
12
12
|
className?: string | string[];
|
@@ -66,11 +66,6 @@ export const MetaDataView: FC<MetadataViewProps> = ({
|
|
66
66
|
)}
|
67
67
|
style={style}
|
68
68
|
>
|
69
|
-
<thead>
|
70
|
-
<tr>
|
71
|
-
<th colSpan={2} className={"th"}></th>
|
72
|
-
</tr>
|
73
|
-
</thead>
|
74
69
|
<tbody>{entryEls}</tbody>
|
75
70
|
</table>
|
76
71
|
);
|
@@ -80,11 +75,21 @@ export const MetaDataView: FC<MetadataViewProps> = ({
|
|
80
75
|
// or an array of record with name/value on way in
|
81
76
|
// but coerce to array of records for order
|
82
77
|
const toNameValues = (
|
83
|
-
entries?:
|
78
|
+
entries?:
|
79
|
+
| Array<{ name: string; value: unknown }>
|
80
|
+
| Record<string, unknown>
|
81
|
+
| Array<unknown>,
|
84
82
|
): Array<{ name: string; value: unknown }> | undefined => {
|
85
83
|
if (entries) {
|
86
84
|
if (Array.isArray(entries)) {
|
87
|
-
|
85
|
+
// filter arrays that don't contain the expected name value pairs
|
86
|
+
const filtered = entries.filter((entry) => {
|
87
|
+
if (entry && typeof entry === "object") {
|
88
|
+
return "name" in entry && "value" in entry;
|
89
|
+
}
|
90
|
+
return false;
|
91
|
+
});
|
92
|
+
return filtered as Array<{ name: string; value: unknown }>;
|
88
93
|
} else {
|
89
94
|
return Object.entries(entries || {}).map(([key, value]) => {
|
90
95
|
return { name: key, value };
|
@@ -147,6 +147,9 @@ const contentRenderers: Record<string, ContentRenderer> = {
|
|
147
147
|
canRender: (entry) => {
|
148
148
|
const isArray = Array.isArray(entry.value);
|
149
149
|
if (isArray) {
|
150
|
+
if (entry.value.length === 0 || entry.value.length === 1) {
|
151
|
+
return true;
|
152
|
+
}
|
150
153
|
const types = new Set(
|
151
154
|
entry.value
|
152
155
|
.filter((e: unknown) => e !== null)
|
@@ -0,0 +1,16 @@
|
|
1
|
+
.container {
|
2
|
+
display: grid;
|
3
|
+
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
4
|
+
row-gap: 2em;
|
5
|
+
column-gap: 1em;
|
6
|
+
}
|
7
|
+
|
8
|
+
.modelInfo {
|
9
|
+
display: grid;
|
10
|
+
grid-template-columns: max-content auto;
|
11
|
+
column-gap: 1em;
|
12
|
+
}
|
13
|
+
|
14
|
+
.role {
|
15
|
+
grid-column: -1/1;
|
16
|
+
}
|
@@ -0,0 +1,93 @@
|
|
1
|
+
import { FC } from "react";
|
2
|
+
import { ApplicationIcons } from "../appearance/icons";
|
3
|
+
import { Card, CardBody, CardHeader } from "../components/Card";
|
4
|
+
import { EvalModelConfig, EvalSpec } from "../types/log";
|
5
|
+
|
6
|
+
import clsx from "clsx";
|
7
|
+
import { MetaDataGrid } from "../metadata/MetaDataGrid";
|
8
|
+
import styles from "./ModelCard.module.css";
|
9
|
+
|
10
|
+
interface ModelCardProps {
|
11
|
+
evalSpec?: EvalSpec;
|
12
|
+
}
|
13
|
+
|
14
|
+
/**
|
15
|
+
* Renders the plan card
|
16
|
+
*/
|
17
|
+
export const ModelCard: FC<ModelCardProps> = ({ evalSpec }) => {
|
18
|
+
if (!evalSpec) {
|
19
|
+
return undefined;
|
20
|
+
}
|
21
|
+
|
22
|
+
const modelsInfo: Record<string, EvalModelConfig> = {
|
23
|
+
eval: {
|
24
|
+
model: evalSpec.model,
|
25
|
+
base_url: evalSpec.model_base_url,
|
26
|
+
config: evalSpec.model_generate_config,
|
27
|
+
args: evalSpec.model_args,
|
28
|
+
},
|
29
|
+
...evalSpec.model_roles,
|
30
|
+
};
|
31
|
+
|
32
|
+
const noneEl = <span className="text-style-secondary">None</span>;
|
33
|
+
|
34
|
+
return (
|
35
|
+
<Card>
|
36
|
+
<CardHeader icon={ApplicationIcons.model} label="Models" />
|
37
|
+
<CardBody id={"task-model-card-body"}>
|
38
|
+
<div className={styles.container}>
|
39
|
+
{Object.keys(modelsInfo || {}).map((modelKey) => {
|
40
|
+
const modelInfo = modelsInfo[modelKey];
|
41
|
+
return (
|
42
|
+
<div
|
43
|
+
key={modelKey}
|
44
|
+
className={clsx(styles.modelInfo, "text-size-small")}
|
45
|
+
>
|
46
|
+
<div
|
47
|
+
className={clsx(
|
48
|
+
styles.role,
|
49
|
+
"text-style-label",
|
50
|
+
"text-style-secondary",
|
51
|
+
)}
|
52
|
+
>
|
53
|
+
{modelKey}
|
54
|
+
</div>
|
55
|
+
|
56
|
+
<div className={clsx("text-style-label")}>Model</div>
|
57
|
+
<div>{modelInfo.model}</div>
|
58
|
+
|
59
|
+
<div className={clsx("text-style-label")}>Base Url</div>
|
60
|
+
<div className="text-size-small">
|
61
|
+
{modelInfo.base_url || noneEl}
|
62
|
+
</div>
|
63
|
+
<div className={clsx("text-style-label")}>Configuration</div>
|
64
|
+
<div className="text-size-small">
|
65
|
+
{modelInfo.config &&
|
66
|
+
Object.keys(modelInfo.config).length > 0 ? (
|
67
|
+
<MetaDataGrid
|
68
|
+
entries={
|
69
|
+
modelInfo.config as any as Record<string, unknown>
|
70
|
+
}
|
71
|
+
/>
|
72
|
+
) : (
|
73
|
+
noneEl
|
74
|
+
)}
|
75
|
+
</div>
|
76
|
+
<div className={clsx("text-style-label")}>Args</div>
|
77
|
+
<div className="text-size-small">
|
78
|
+
{Object.keys(modelInfo.args).length > 0 ? (
|
79
|
+
<MetaDataGrid
|
80
|
+
entries={modelInfo.args as any as Record<string, unknown>}
|
81
|
+
/>
|
82
|
+
) : (
|
83
|
+
noneEl
|
84
|
+
)}
|
85
|
+
</div>
|
86
|
+
</div>
|
87
|
+
);
|
88
|
+
})}
|
89
|
+
</div>
|
90
|
+
</CardBody>
|
91
|
+
</Card>
|
92
|
+
);
|
93
|
+
};
|
@@ -26,7 +26,7 @@ export const ChatMessage: FC<ChatMessageProps> = ({
|
|
26
26
|
indented,
|
27
27
|
toolCallStyle,
|
28
28
|
}) => {
|
29
|
-
const collapse = message.role === "system";
|
29
|
+
const collapse = message.role === "system" || message.role === "user";
|
30
30
|
return (
|
31
31
|
<div
|
32
32
|
className={clsx(
|
@@ -46,7 +46,7 @@ export const ChatMessage: FC<ChatMessageProps> = ({
|
|
46
46
|
indented ? styles.indented : undefined,
|
47
47
|
)}
|
48
48
|
>
|
49
|
-
<ExpandablePanel id={`${id}-message`} collapse={collapse} lines={
|
49
|
+
<ExpandablePanel id={`${id}-message`} collapse={collapse} lines={15}>
|
50
50
|
<MessageContents
|
51
51
|
id={`${id}-contents`}
|
52
52
|
key={`${id}-contents`}
|
@@ -63,11 +63,15 @@ export const ModelEventView: FC<ModelEventViewProps> = ({
|
|
63
63
|
}
|
64
64
|
}
|
65
65
|
|
66
|
+
const panelTitle = event.role
|
67
|
+
? `Model Call (${event.role}): ${event.model}`
|
68
|
+
: `Model Call: ${event.model}`;
|
69
|
+
|
66
70
|
return (
|
67
71
|
<EventPanel
|
68
72
|
id={id}
|
69
73
|
className={className}
|
70
|
-
title={formatTitle(
|
74
|
+
title={formatTitle(panelTitle, totalUsage, callTime)}
|
71
75
|
subTitle={formatTiming(event.timestamp, event.working_start)}
|
72
76
|
icon={ApplicationIcons.model}
|
73
77
|
>
|
@@ -132,6 +132,18 @@ const stepDescriptor = (
|
|
132
132
|
return {
|
133
133
|
...rootStepDescriptor,
|
134
134
|
};
|
135
|
+
} else if (event.event === "step") {
|
136
|
+
if (event.name === "init") {
|
137
|
+
return {
|
138
|
+
...rootStepDescriptor,
|
139
|
+
name: "Init",
|
140
|
+
collapse: true,
|
141
|
+
};
|
142
|
+
} else {
|
143
|
+
return {
|
144
|
+
...rootStepDescriptor,
|
145
|
+
};
|
146
|
+
}
|
135
147
|
} else {
|
136
148
|
switch (event.name) {
|
137
149
|
case "sample_init":
|
@@ -140,12 +152,6 @@ const stepDescriptor = (
|
|
140
152
|
name: "Sample Init",
|
141
153
|
collapse: true,
|
142
154
|
};
|
143
|
-
case "init":
|
144
|
-
return {
|
145
|
-
...rootStepDescriptor,
|
146
|
-
name: "Init",
|
147
|
-
collapse: true,
|
148
|
-
};
|
149
155
|
default:
|
150
156
|
return {
|
151
157
|
endSpace: false,
|
@@ -276,44 +276,21 @@ function setPath(
|
|
276
276
|
value: unknown,
|
277
277
|
): void {
|
278
278
|
const keys = parsePath(path);
|
279
|
-
let current: Record<string, unknown>
|
279
|
+
let current: Record<string, unknown> = target;
|
280
280
|
|
281
281
|
for (let i = 0; i < keys.length - 1; i++) {
|
282
282
|
const key = keys[i];
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
current[numericIndex] = isArrayIndex(keys[i + 1]) ? [] : {};
|
287
|
-
current = current[numericIndex] as
|
288
|
-
| Record<string, unknown>
|
289
|
-
| Array<unknown>;
|
290
|
-
} else {
|
291
|
-
if (!(key in current)) {
|
292
|
-
// If the next key is a number, create an array, otherwise an object
|
293
|
-
current[key] = isArrayIndex(keys[i + 1]) ? [] : {};
|
294
|
-
}
|
295
|
-
current = current[key] as Record<string, unknown> | Array<unknown>;
|
283
|
+
if (!(key in current)) {
|
284
|
+
// If the next key is a number, create an array, otherwise an object
|
285
|
+
current[key] = isArrayIndex(keys[i + 1]) ? [] : {};
|
296
286
|
}
|
287
|
+
current = current[key] as Record<string, unknown>;
|
297
288
|
}
|
298
289
|
|
299
290
|
const lastKey = keys[keys.length - 1];
|
300
|
-
|
301
|
-
const numericIndex = getIndex(lastKey);
|
302
|
-
current[numericIndex] = value;
|
303
|
-
} else {
|
304
|
-
current[lastKey] = value;
|
305
|
-
}
|
291
|
+
current[lastKey] = value;
|
306
292
|
}
|
307
293
|
|
308
|
-
const getIndex = (key: string): number => {
|
309
|
-
const numericIndex = isArrayIndex(key) ? parseInt(key) : undefined;
|
310
|
-
if (numericIndex === undefined) {
|
311
|
-
throw new Error(`The key ${key} isn't a valid Array index!`);
|
312
|
-
}
|
313
|
-
|
314
|
-
return numericIndex;
|
315
|
-
};
|
316
|
-
|
317
294
|
/**
|
318
295
|
* Places structure in an object (without placing values)
|
319
296
|
*/
|
@@ -65,6 +65,11 @@ export type Required = string[] | null;
|
|
65
65
|
export type Description1 = string | null;
|
66
66
|
export type Strict = boolean | null;
|
67
67
|
export type ModelBaseUrl = string | null;
|
68
|
+
export type ModelRoles = {
|
69
|
+
[k: string]: EvalModelConfig;
|
70
|
+
} | null;
|
71
|
+
export type Model1 = string;
|
72
|
+
export type BaseUrl = string | null;
|
68
73
|
export type Limit = number | [unknown, unknown] | null;
|
69
74
|
export type SampleId = string | number | (string | number)[] | null;
|
70
75
|
export type Epochs = number | null;
|
@@ -211,7 +216,7 @@ export type Title = string | null;
|
|
211
216
|
export type Format2 = "text" | "markdown";
|
212
217
|
export type Content3 = string;
|
213
218
|
export type Type8 = string | null;
|
214
|
-
export type
|
219
|
+
export type Model2 = string | null;
|
215
220
|
export type Id5 = string | null;
|
216
221
|
export type Content4 =
|
217
222
|
| string
|
@@ -247,7 +252,7 @@ export type Messages = (
|
|
247
252
|
| ChatMessageAssistant
|
248
253
|
| ChatMessageTool
|
249
254
|
)[];
|
250
|
-
export type
|
255
|
+
export type Model3 = string;
|
251
256
|
export type StopReason =
|
252
257
|
| "stop"
|
253
258
|
| "max_tokens"
|
@@ -346,7 +351,8 @@ export type Timestamp5 = string;
|
|
346
351
|
export type WorkingStart5 = number;
|
347
352
|
export type Pending5 = boolean | null;
|
348
353
|
export type Event5 = "model";
|
349
|
-
export type
|
354
|
+
export type Model4 = string;
|
355
|
+
export type Role4 = string | null;
|
350
356
|
export type Input3 = (
|
351
357
|
| ChatMessageSystem
|
352
358
|
| ChatMessageUser
|
@@ -580,6 +586,7 @@ export interface EvalSpec {
|
|
580
586
|
model_generate_config: GenerateConfig;
|
581
587
|
model_base_url: ModelBaseUrl;
|
582
588
|
model_args: ModelArgs;
|
589
|
+
model_roles: ModelRoles;
|
583
590
|
config: EvalConfig;
|
584
591
|
revision: EvalRevision | null;
|
585
592
|
packages: Packages;
|
@@ -666,6 +673,16 @@ export interface Default {
|
|
666
673
|
[k: string]: unknown;
|
667
674
|
}
|
668
675
|
export interface ModelArgs {}
|
676
|
+
/**
|
677
|
+
* Model config.
|
678
|
+
*/
|
679
|
+
export interface EvalModelConfig {
|
680
|
+
model: Model1;
|
681
|
+
config: GenerateConfig;
|
682
|
+
base_url: BaseUrl;
|
683
|
+
args: Args;
|
684
|
+
}
|
685
|
+
export interface Args {}
|
669
686
|
/**
|
670
687
|
* Configuration used for evaluation.
|
671
688
|
*/
|
@@ -948,7 +965,7 @@ export interface ChatMessageAssistant {
|
|
948
965
|
internal: unknown;
|
949
966
|
role: Role2;
|
950
967
|
tool_calls: ToolCalls;
|
951
|
-
model:
|
968
|
+
model: Model2;
|
952
969
|
}
|
953
970
|
export interface ToolCall {
|
954
971
|
id: Id4;
|
@@ -989,7 +1006,7 @@ export interface ToolCallError {
|
|
989
1006
|
* Output from model generation.
|
990
1007
|
*/
|
991
1008
|
export interface ModelOutput {
|
992
|
-
model:
|
1009
|
+
model: Model3;
|
993
1010
|
choices: Choices1;
|
994
1011
|
usage: ModelUsage1 | null;
|
995
1012
|
time: Time;
|
@@ -1133,7 +1150,8 @@ export interface ModelEvent {
|
|
1133
1150
|
working_start: WorkingStart5;
|
1134
1151
|
pending: Pending5;
|
1135
1152
|
event: Event5;
|
1136
|
-
model:
|
1153
|
+
model: Model4;
|
1154
|
+
role: Role4;
|
1137
1155
|
input: Input3;
|
1138
1156
|
tools: Tools1;
|
1139
1157
|
tool_choice: ToolChoice;
|
@@ -0,0 +1,16 @@
|
|
1
|
+
.container {
|
2
|
+
display: flex;
|
3
|
+
flex-direction: row;
|
4
|
+
flex-wrap: wrap;
|
5
|
+
gap: 0;
|
6
|
+
margin-top: -0.2rem;
|
7
|
+
margin-bottom: 0.2rem;
|
8
|
+
}
|
9
|
+
|
10
|
+
.grid {
|
11
|
+
display: grid;
|
12
|
+
grid-template-rows: repeat(auto-fill, minmax(10px, 1fr));
|
13
|
+
grid-template-columns: 1fr;
|
14
|
+
gap: 0.1em;
|
15
|
+
padding-right: 1em;
|
16
|
+
}
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import { FC } from "react";
|
2
|
+
import { ModelRoles } from "../../types/log";
|
3
|
+
|
4
|
+
import clsx from "clsx";
|
5
|
+
import styles from "./ModelRolesView.module.css";
|
6
|
+
|
7
|
+
interface ModelRolesViewProps {
|
8
|
+
roles: ModelRoles;
|
9
|
+
}
|
10
|
+
|
11
|
+
/**
|
12
|
+
* Renders the Navbar
|
13
|
+
*/
|
14
|
+
export const ModelRolesView: FC<ModelRolesViewProps> = ({ roles }) => {
|
15
|
+
roles = roles || {};
|
16
|
+
|
17
|
+
// Render as a single line if there is only a single
|
18
|
+
// model role
|
19
|
+
const singleLine = Object.keys(roles).length !== 1;
|
20
|
+
|
21
|
+
// Render a layout of model roles
|
22
|
+
const modelEls = Object.keys(roles).map((key) => {
|
23
|
+
const role = key;
|
24
|
+
const roleData = roles[role];
|
25
|
+
const model = roleData.model;
|
26
|
+
return (
|
27
|
+
<div
|
28
|
+
className={clsx(
|
29
|
+
singleLine ? styles.grid : undefined,
|
30
|
+
"text-style-secondary",
|
31
|
+
"text-size-smallest",
|
32
|
+
)}
|
33
|
+
key={key}
|
34
|
+
>
|
35
|
+
<span className={clsx("text-style-label")}>{role}:</span>
|
36
|
+
<span>{model}</span>
|
37
|
+
</div>
|
38
|
+
);
|
39
|
+
});
|
40
|
+
return modelEls.length > 0 ? (
|
41
|
+
<div className={styles.container}>{modelEls}</div>
|
42
|
+
) : undefined;
|
43
|
+
};
|
@@ -7,6 +7,7 @@ import { kModelNone } from "../../constants";
|
|
7
7
|
import { useStore } from "../../state/store";
|
8
8
|
import { EvalResults, EvalSpec, Status } from "../../types/log";
|
9
9
|
import { filename } from "../../utils/path";
|
10
|
+
import { ModelRolesView } from "./ModelRolesView";
|
10
11
|
import styles from "./PrimaryBar.module.css";
|
11
12
|
import {
|
12
13
|
displayScorersFromRunningMetrics,
|
@@ -100,6 +101,10 @@ export const PrimaryBar: FC<PrimaryBarProps> = ({
|
|
100
101
|
""
|
101
102
|
)}
|
102
103
|
</div>
|
104
|
+
{evalSpec?.model_roles ? (
|
105
|
+
<ModelRolesView roles={evalSpec.model_roles} />
|
106
|
+
) : undefined}
|
107
|
+
|
103
108
|
<div className={clsx("text-size-small", styles.secondaryContainer)}>
|
104
109
|
<div className={clsx("navbar-secondary-text", "text-truncate")}>
|
105
110
|
{logFileName}
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import { FC } from "react";
|
2
2
|
import { SampleSummary } from "../../api/types";
|
3
3
|
import { MessageBand } from "../../components/MessageBand";
|
4
|
+
import { ModelCard } from "../../plan/ModelCard";
|
4
5
|
import { PlanCard } from "../../plan/PlanCard";
|
5
6
|
import {
|
6
7
|
EvalError,
|
@@ -55,6 +56,7 @@ export const InfoTab: FC<PlanTabProps> = ({
|
|
55
56
|
evalPlan={evalPlan}
|
56
57
|
scores={evalResults?.scores}
|
57
58
|
/>
|
59
|
+
{evalSpec ? <ModelCard evalSpec={evalSpec} /> : undefined}
|
58
60
|
{evalStatus !== "started" ? <UsageCard stats={evalStats} /> : undefined}
|
59
61
|
{evalStatus === "error" && evalError ? (
|
60
62
|
<TaskErrorCard error={evalError} />
|
inspect_ai/agent/_agent.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from copy import copy, deepcopy
|
2
2
|
from functools import wraps
|
3
|
+
from inspect import signature
|
3
4
|
from typing import (
|
4
5
|
Any,
|
5
6
|
Callable,
|
@@ -7,6 +8,7 @@ from typing import (
|
|
7
8
|
Protocol,
|
8
9
|
TypeGuard,
|
9
10
|
cast,
|
11
|
+
get_type_hints,
|
10
12
|
overload,
|
11
13
|
runtime_checkable,
|
12
14
|
)
|
@@ -189,6 +191,16 @@ def agent(
|
|
189
191
|
)
|
190
192
|
return agent
|
191
193
|
|
194
|
+
# If a user's code runs "from __future__ import annotations", all type annotations are stored as strings,
|
195
|
+
# which can break introspection-based mechanisms (like inspecting a function’s signature).
|
196
|
+
# The following two lines resolve these string annotations using the original function's globals,
|
197
|
+
# ensuring that any forward references (e.g., "Agent") are evaluated to their actual types,
|
198
|
+
# and then reassign the original function's signature to the wrapper.
|
199
|
+
agent_wrapper.__annotations__ = get_type_hints(
|
200
|
+
agent_wrapper, agent_type.__globals__
|
201
|
+
)
|
202
|
+
agent_wrapper.__signature__ = signature(agent_type) # type: ignore[attr-defined]
|
203
|
+
|
192
204
|
# register
|
193
205
|
return agent_register(cast(Callable[P, Agent], agent_wrapper), agent_name)
|
194
206
|
|
inspect_ai/agent/_as_tool.py
CHANGED
@@ -42,7 +42,7 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
|
|
42
42
|
|
43
43
|
async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
|
44
44
|
# prepare state and call agent
|
45
|
-
state = AgentState(messages=[ChatMessageUser(content=input)])
|
45
|
+
state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
|
46
46
|
state = await agent(state, *args, **(agent_kwargs | kwargs))
|
47
47
|
|
48
48
|
# find assistant message to read content from (prefer output)
|