inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -0
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +23 -27
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/local_server.py +398 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +173 -159
- inspect_ai/_view/www/dist/assets/index.js +1417 -1142
- inspect_ai/_view/www/log-schema.json +379 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +93 -14
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +22 -12
- inspect_ai/agent/_as_tool.py +20 -6
- inspect_ai/agent/_handoff.py +12 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +16 -3
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +14 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +14 -25
- inspect_ai/log/_transcript.py +84 -36
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +72 -44
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +66 -88
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +247 -0
- inspect_ai/model/_providers/vllm.py +211 -400
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +5 -22
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +8 -5
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +16 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,12 @@
|
|
1
1
|
// This is a special name that signals a group of sandbox events.
|
2
2
|
|
3
|
-
import {
|
3
|
+
import {
|
4
|
+
Events,
|
5
|
+
SpanBeginEvent,
|
6
|
+
SpanEndEvent,
|
7
|
+
StepEvent,
|
8
|
+
} from "../../../../@types/log";
|
9
|
+
import { hasSpans } from "./utils";
|
4
10
|
|
5
11
|
// It will be caught elsewhere and rendered with a pretty name
|
6
12
|
export const kSandboxSignalName = "53787D8A-D3FC-426D-B383-9F880B70E4AA";
|
@@ -54,39 +60,55 @@ const processPendingEvents = (events: Events, filter: boolean): Events => {
|
|
54
60
|
};
|
55
61
|
|
56
62
|
const collapseSampleInit = (events: Events): Events => {
|
57
|
-
//
|
63
|
+
// Don't perform sample init logic if spans are present
|
64
|
+
const hasSpans = events.some((e) => {
|
65
|
+
return e.event === "span_begin" || e.event === "span_end";
|
66
|
+
});
|
67
|
+
if (hasSpans) {
|
68
|
+
return events;
|
69
|
+
}
|
70
|
+
|
71
|
+
// Don't synthesize a sample init step if one already exists
|
58
72
|
const hasInitStep =
|
59
73
|
events.findIndex((e) => {
|
60
74
|
return e.event === "step" && e.name === "init";
|
61
75
|
}) !== -1;
|
76
|
+
if (hasInitStep) {
|
77
|
+
return events;
|
78
|
+
}
|
62
79
|
|
80
|
+
// Find a sample init event
|
63
81
|
const initEventIndex = events.findIndex((e) => {
|
64
82
|
return e.event === "sample_init";
|
65
83
|
});
|
66
84
|
const initEvent = events[initEventIndex];
|
85
|
+
if (!initEvent) {
|
86
|
+
return events;
|
87
|
+
}
|
67
88
|
|
89
|
+
// Splice in sample init step if needed
|
68
90
|
const fixedUp = [...events];
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
}
|
91
|
+
fixedUp.splice(initEventIndex, 0, {
|
92
|
+
timestamp: initEvent.timestamp,
|
93
|
+
event: "step",
|
94
|
+
action: "begin",
|
95
|
+
type: null,
|
96
|
+
name: "sample_init",
|
97
|
+
pending: false,
|
98
|
+
working_start: 0,
|
99
|
+
span_id: initEvent.span_id,
|
100
|
+
});
|
101
|
+
|
102
|
+
fixedUp.splice(initEventIndex + 2, 0, {
|
103
|
+
timestamp: initEvent.timestamp,
|
104
|
+
event: "step",
|
105
|
+
action: "end",
|
106
|
+
type: null,
|
107
|
+
name: "sample_init",
|
108
|
+
pending: false,
|
109
|
+
working_start: 0,
|
110
|
+
span_id: initEvent.span_id,
|
111
|
+
});
|
90
112
|
return fixedUp;
|
91
113
|
};
|
92
114
|
|
@@ -94,12 +116,22 @@ const groupSandboxEvents = (events: Events): Events => {
|
|
94
116
|
const result: Events = [];
|
95
117
|
const pendingSandboxEvents: Events = [];
|
96
118
|
|
119
|
+
const useSpans = hasSpans(events);
|
120
|
+
|
97
121
|
const pushPendingSandboxEvents = () => {
|
98
122
|
const timestamp =
|
99
123
|
pendingSandboxEvents[pendingSandboxEvents.length - 1].timestamp;
|
100
|
-
|
124
|
+
if (useSpans) {
|
125
|
+
result.push(createSpanBegin(kSandboxSignalName, timestamp, null));
|
126
|
+
} else {
|
127
|
+
result.push(createStepEvent(kSandboxSignalName, timestamp, "begin"));
|
128
|
+
}
|
101
129
|
result.push(...pendingSandboxEvents);
|
102
|
-
|
130
|
+
if (useSpans) {
|
131
|
+
result.push(createSpanEnd(kSandboxSignalName, timestamp));
|
132
|
+
} else {
|
133
|
+
result.push(createStepEvent(kSandboxSignalName, timestamp, "end"));
|
134
|
+
}
|
103
135
|
pendingSandboxEvents.length = 0;
|
104
136
|
};
|
105
137
|
|
@@ -139,4 +171,34 @@ const createStepEvent = (
|
|
139
171
|
name,
|
140
172
|
pending: false,
|
141
173
|
working_start: 0,
|
174
|
+
span_id: null,
|
142
175
|
});
|
176
|
+
|
177
|
+
const createSpanBegin = (
|
178
|
+
name: string,
|
179
|
+
timestamp: string,
|
180
|
+
parent_id: string | null,
|
181
|
+
): SpanBeginEvent => {
|
182
|
+
return {
|
183
|
+
name,
|
184
|
+
id: `${name}-begin`,
|
185
|
+
span_id: name,
|
186
|
+
parent_id,
|
187
|
+
timestamp,
|
188
|
+
event: "span_begin",
|
189
|
+
type: null,
|
190
|
+
pending: false,
|
191
|
+
working_start: 0,
|
192
|
+
};
|
193
|
+
};
|
194
|
+
|
195
|
+
const createSpanEnd = (name: string, timestamp: string): SpanEndEvent => {
|
196
|
+
return {
|
197
|
+
id: `${name}-end`,
|
198
|
+
timestamp,
|
199
|
+
event: "span_end",
|
200
|
+
pending: false,
|
201
|
+
working_start: 0,
|
202
|
+
span_id: name,
|
203
|
+
};
|
204
|
+
};
|
@@ -1,14 +1,28 @@
|
|
1
1
|
import { Events } from "../../../../@types/log";
|
2
2
|
import { EventNode, EventType } from "../types";
|
3
|
+
import {
|
4
|
+
ACTION_BEGIN,
|
5
|
+
ET_SPAN_BEGIN,
|
6
|
+
ET_SPAN_END,
|
7
|
+
ET_STEP,
|
8
|
+
hasSpans,
|
9
|
+
} from "./utils";
|
10
|
+
|
11
|
+
type TreeifyFunction = (
|
12
|
+
event: EventType,
|
13
|
+
addNode: (event: EventType) => EventNode,
|
14
|
+
pushStack: (node: EventNode) => void,
|
15
|
+
popStack: () => void,
|
16
|
+
) => void;
|
3
17
|
|
4
|
-
/**
|
5
|
-
* Gathers events into a hierarchy of EventNodes.
|
6
|
-
*/
|
7
18
|
export function treeifyEvents(events: Events, depth: number): EventNode[] {
|
19
|
+
const useSpans = hasSpans(events);
|
20
|
+
const treeFn = useSpans ? treeifyFnSpan : treeifyFnStep;
|
21
|
+
|
8
22
|
const rootNodes: EventNode[] = [];
|
9
23
|
const stack: EventNode[] = [];
|
10
24
|
|
11
|
-
const
|
25
|
+
const addNode = (event: EventType): EventNode => {
|
12
26
|
const node = new EventNode(event, stack.length + depth);
|
13
27
|
if (stack.length > 0) {
|
14
28
|
const parentNode = stack[stack.length - 1];
|
@@ -19,21 +33,219 @@ export function treeifyEvents(events: Events, depth: number): EventNode[] {
|
|
19
33
|
return node;
|
20
34
|
};
|
21
35
|
|
36
|
+
const pushStack = (node: EventNode): void => {
|
37
|
+
stack.push(node);
|
38
|
+
};
|
39
|
+
|
40
|
+
const popStack = (): void => {
|
41
|
+
if (stack.length > 0) {
|
42
|
+
stack.pop();
|
43
|
+
}
|
44
|
+
};
|
45
|
+
|
22
46
|
events.forEach((event) => {
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
47
|
+
treeFn(event, addNode, pushStack, popStack);
|
48
|
+
});
|
49
|
+
|
50
|
+
if (useSpans) {
|
51
|
+
return transformTree(rootNodes);
|
52
|
+
} else {
|
53
|
+
return rootNodes;
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
const treeifyFnStep: TreeifyFunction = (
|
58
|
+
event: EventType,
|
59
|
+
addNode: (event: EventType) => EventNode,
|
60
|
+
pushStack: (node: EventNode) => void,
|
61
|
+
popStack: () => void,
|
62
|
+
): void => {
|
63
|
+
switch (event.event) {
|
64
|
+
case ET_STEP:
|
65
|
+
if (event.action === ACTION_BEGIN) {
|
66
|
+
// Starting a new step
|
67
|
+
const node = addNode(event);
|
68
|
+
pushStack(node);
|
69
|
+
} else {
|
70
|
+
// An ending step
|
71
|
+
popStack();
|
31
72
|
}
|
32
|
-
|
73
|
+
break;
|
74
|
+
case ET_SPAN_BEGIN: {
|
75
|
+
// These shouldn't be here, but throw them away
|
76
|
+
break;
|
77
|
+
}
|
78
|
+
case ET_SPAN_END: {
|
79
|
+
// These shouldn't be here, but throw them away
|
80
|
+
break;
|
81
|
+
}
|
82
|
+
default:
|
33
83
|
// An event
|
34
|
-
|
84
|
+
addNode(event);
|
85
|
+
break;
|
86
|
+
}
|
87
|
+
};
|
88
|
+
|
89
|
+
const treeifyFnSpan: TreeifyFunction = (
|
90
|
+
event: EventType,
|
91
|
+
addNode: (event: EventType) => EventNode,
|
92
|
+
pushStack: (node: EventNode) => void,
|
93
|
+
popStack: () => void,
|
94
|
+
): void => {
|
95
|
+
switch (event.event) {
|
96
|
+
case ET_STEP:
|
97
|
+
// strip steps
|
98
|
+
break;
|
99
|
+
case ET_SPAN_BEGIN: {
|
100
|
+
const node = addNode(event);
|
101
|
+
pushStack(node);
|
102
|
+
break;
|
35
103
|
}
|
36
|
-
|
104
|
+
case ET_SPAN_END: {
|
105
|
+
popStack();
|
106
|
+
break;
|
107
|
+
}
|
108
|
+
default:
|
109
|
+
// An event
|
110
|
+
addNode(event);
|
111
|
+
break;
|
112
|
+
}
|
113
|
+
};
|
37
114
|
|
38
|
-
|
39
|
-
|
115
|
+
type TreeNodeTransformer = {
|
116
|
+
name: string;
|
117
|
+
matches: (node: EventNode) => boolean;
|
118
|
+
process: (node: EventNode) => EventNode;
|
119
|
+
};
|
120
|
+
|
121
|
+
const treeNodeTransformers: TreeNodeTransformer[] = [
|
122
|
+
{
|
123
|
+
name: "unwrap_tools",
|
124
|
+
matches: (node) =>
|
125
|
+
node.event.event === "span_begin" && node.event.type === "tool",
|
126
|
+
process: (node) => elevateChildNode(node, "tool") || node,
|
127
|
+
},
|
128
|
+
{
|
129
|
+
name: "unwrap_subtasks",
|
130
|
+
matches: (node) =>
|
131
|
+
node.event.event === "span_begin" && node.event.type === "subtask",
|
132
|
+
process: (node) => elevateChildNode(node, "subtask") || node,
|
133
|
+
},
|
134
|
+
{
|
135
|
+
name: "unwrap_agent_solver",
|
136
|
+
matches: (node) =>
|
137
|
+
node.event.event === "span_begin" &&
|
138
|
+
node.event["type"] === "solver" &&
|
139
|
+
node.children.length === 2 &&
|
140
|
+
node.children[0].event.event === "span_begin" &&
|
141
|
+
node.children[0].event.type === "agent" &&
|
142
|
+
node.children[1].event.event === "state",
|
143
|
+
|
144
|
+
process: (node) => skipFirstChildNode(node),
|
145
|
+
},
|
146
|
+
{
|
147
|
+
name: "unwrap_agent_solver w/store",
|
148
|
+
matches: (node) =>
|
149
|
+
node.event.event === "span_begin" &&
|
150
|
+
node.event["type"] === "solver" &&
|
151
|
+
node.children.length === 3 &&
|
152
|
+
node.children[0].event.event === "span_begin" &&
|
153
|
+
node.children[0].event.type === "agent" &&
|
154
|
+
node.children[1].event.event === "state" &&
|
155
|
+
node.children[2].event.event === "store",
|
156
|
+
process: (node) => skipFirstChildNode(node),
|
157
|
+
},
|
158
|
+
{
|
159
|
+
name: "unwrap_handoff",
|
160
|
+
matches: (node) =>
|
161
|
+
node.event.event === "span_begin" &&
|
162
|
+
node.event["type"] === "handoff" &&
|
163
|
+
node.children.length === 2 &&
|
164
|
+
node.children[0].event.event === "tool" &&
|
165
|
+
node.children[1].event.event === "store" &&
|
166
|
+
node.children[0].children.length === 2 &&
|
167
|
+
node.children[0].children[0].event.event === "span_begin" &&
|
168
|
+
node.children[0].children[0].event.type === "agent",
|
169
|
+
process: (node) => skipThisNode(node),
|
170
|
+
},
|
171
|
+
];
|
172
|
+
|
173
|
+
const transformTree = (roots: EventNode[]): EventNode[] => {
|
174
|
+
const visitNode = (node: EventNode): EventNode => {
|
175
|
+
let processedNode = node;
|
176
|
+
|
177
|
+
// Visit children (depth first)
|
178
|
+
processedNode.children = processedNode.children.map(visitNode);
|
179
|
+
|
180
|
+
// Apply any visitors to this node
|
181
|
+
for (const transformer of treeNodeTransformers) {
|
182
|
+
if (transformer.matches(processedNode)) {
|
183
|
+
processedNode = transformer.process(processedNode);
|
184
|
+
// Only apply the first matching transformer
|
185
|
+
break;
|
186
|
+
}
|
187
|
+
}
|
188
|
+
return processedNode;
|
189
|
+
};
|
190
|
+
|
191
|
+
return roots.map(visitNode);
|
192
|
+
};
|
193
|
+
|
194
|
+
/**
|
195
|
+
* Process a span node by elevating a specific child node type and moving its siblings as children
|
196
|
+
* @template T - Type of the event (either ToolEvent or SubtaskEvent)
|
197
|
+
*/
|
198
|
+
const elevateChildNode = (
|
199
|
+
node: EventNode,
|
200
|
+
childEventType: "tool" | "subtask",
|
201
|
+
): EventNode | null => {
|
202
|
+
// Find the specific event child
|
203
|
+
const targetIndex = node.children.findIndex(
|
204
|
+
(child) => child.event.event === childEventType,
|
205
|
+
);
|
206
|
+
|
207
|
+
if (targetIndex === -1) {
|
208
|
+
console.log(
|
209
|
+
`No ${childEventType} event found in a span, this is very unexpected.`,
|
210
|
+
);
|
211
|
+
return null;
|
212
|
+
}
|
213
|
+
|
214
|
+
// Get the target node and set its depth
|
215
|
+
const targetNode = { ...node.children[targetIndex] };
|
216
|
+
const remainingChildren = node.children.filter((_, i) => i !== targetIndex);
|
217
|
+
|
218
|
+
// Process the remaining children
|
219
|
+
targetNode.depth = node.depth;
|
220
|
+
targetNode.children = reduceDepth(remainingChildren);
|
221
|
+
|
222
|
+
// No need to update the event itself (events have been deprecated
|
223
|
+
// and more importantly we drive children / transcripts using the tree structure itself
|
224
|
+
// and nodes rather than the event.events itself)
|
225
|
+
return targetNode;
|
226
|
+
};
|
227
|
+
|
228
|
+
const skipFirstChildNode = (node: EventNode): EventNode => {
|
229
|
+
const agentSpan = node.children.splice(0, 1)[0];
|
230
|
+
node.children.unshift(...reduceDepth(agentSpan.children));
|
231
|
+
return node;
|
232
|
+
};
|
233
|
+
|
234
|
+
const skipThisNode = (node: EventNode): EventNode => {
|
235
|
+
const newNode = { ...node.children[0] };
|
236
|
+
newNode.depth = node.depth;
|
237
|
+
newNode.children = reduceDepth(newNode.children[0].children, 2);
|
238
|
+
return newNode;
|
239
|
+
};
|
240
|
+
|
241
|
+
// Reduce the depth of the children by 1
|
242
|
+
// This is used when we hoist a child node to the parent
|
243
|
+
const reduceDepth = (nodes: EventNode[], depth: number = 1): EventNode[] => {
|
244
|
+
return nodes.map((node) => {
|
245
|
+
if (node.children.length > 0) {
|
246
|
+
node.children = reduceDepth(node.children, 1);
|
247
|
+
}
|
248
|
+
node.depth = node.depth - depth;
|
249
|
+
return node;
|
250
|
+
});
|
251
|
+
};
|
@@ -0,0 +1,11 @@
|
|
1
|
+
import { Events } from "../../../../@types/log";
|
2
|
+
|
3
|
+
export const ET_STEP = "step";
|
4
|
+
export const ACTION_BEGIN = "begin";
|
5
|
+
|
6
|
+
export const ET_SPAN_BEGIN = "span_begin";
|
7
|
+
export const ET_SPAN_END = "span_end";
|
8
|
+
|
9
|
+
export const hasSpans = (events: Events): boolean => {
|
10
|
+
return events.some((event) => event.event === ET_SPAN_BEGIN);
|
11
|
+
};
|
@@ -10,6 +10,8 @@ import {
|
|
10
10
|
SampleLimitEvent,
|
11
11
|
SandboxEvent,
|
12
12
|
ScoreEvent,
|
13
|
+
SpanBeginEvent,
|
14
|
+
SpanEndEvent,
|
13
15
|
StateEvent,
|
14
16
|
StepEvent,
|
15
17
|
StoreEvent,
|
@@ -39,7 +41,9 @@ export type EventType =
|
|
39
41
|
| InputEvent
|
40
42
|
| ErrorEvent
|
41
43
|
| ApprovalEvent
|
42
|
-
| SandboxEvent
|
44
|
+
| SandboxEvent
|
45
|
+
| SpanBeginEvent
|
46
|
+
| SpanEndEvent;
|
43
47
|
|
44
48
|
export class EventNode {
|
45
49
|
event: EventType;
|
@@ -6,6 +6,7 @@ import styles from "./ModelUsagePanel.module.css";
|
|
6
6
|
|
7
7
|
interface ModelUsageProps {
|
8
8
|
usage: ModelUsage1;
|
9
|
+
className?: string | string[];
|
9
10
|
}
|
10
11
|
|
11
12
|
interface ModelUsageRow {
|
@@ -19,7 +20,7 @@ interface ModelUsageRow {
|
|
19
20
|
/**
|
20
21
|
* Renders the ModelUsagePanel component.
|
21
22
|
*/
|
22
|
-
export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
|
23
|
+
export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage, className }) => {
|
23
24
|
if (!usage) {
|
24
25
|
return null;
|
25
26
|
}
|
@@ -84,7 +85,7 @@ export const ModelUsagePanel: FC<ModelUsageProps> = ({ usage }) => {
|
|
84
85
|
});
|
85
86
|
|
86
87
|
return (
|
87
|
-
<div className={clsx("text-size-small", styles.wrapper)}>
|
88
|
+
<div className={clsx("text-size-small", styles.wrapper, className)}>
|
88
89
|
{rows.map((row, idx) => {
|
89
90
|
if (row.label === "---") {
|
90
91
|
return (
|
@@ -79,10 +79,10 @@ export const TokenRow: FC<TokenRowProps> = ({ model, usage }) => {
|
|
79
79
|
return (
|
80
80
|
<tr>
|
81
81
|
<td>
|
82
|
-
<div className={styles.model}>{model}</div>
|
82
|
+
<div className={clsx(styles.model, styles.cellContents)}>{model}</div>
|
83
83
|
</td>
|
84
84
|
<td>
|
85
|
-
<ModelUsagePanel usage={usage} />
|
85
|
+
<ModelUsagePanel usage={usage} className={clsx(styles.cellContents)} />
|
86
86
|
</td>
|
87
87
|
</tr>
|
88
88
|
);
|
@@ -3,13 +3,18 @@
|
|
3
3
|
padding-bottom: 1em;
|
4
4
|
margin-left: 0.5em;
|
5
5
|
display: flex;
|
6
|
+
flex-wrap: wrap;
|
7
|
+
gap: 1em;
|
6
8
|
}
|
7
9
|
|
8
10
|
.col1 {
|
9
|
-
flex:
|
10
|
-
|
11
|
+
flex: 0 1 auto;
|
12
|
+
min-width: 200px;
|
13
|
+
width: fit-content;
|
11
14
|
}
|
12
15
|
|
13
16
|
.col2 {
|
14
|
-
flex: 1 1
|
17
|
+
flex: 1 1 auto;
|
18
|
+
min-width: 300px;
|
19
|
+
width: fit-content;
|
15
20
|
}
|
@@ -1,10 +1,5 @@
|
|
1
|
-
import clsx from "clsx";
|
2
1
|
import { EvalStats } from "../../@types/log";
|
3
|
-
import { FontSize } from "../../app/appearance/fonts";
|
4
|
-
import { ApplicationIcons } from "../../app/appearance/icons";
|
5
|
-
import { MetaDataView } from "../../app/content/MetaDataView";
|
6
2
|
import { Card, CardBody, CardHeader } from "../../components/Card";
|
7
|
-
import { formatDuration } from "../../utils/format";
|
8
3
|
import { ModelTokenTable } from "./ModelTokenTable";
|
9
4
|
|
10
5
|
import { FC } from "react";
|
@@ -24,40 +19,11 @@ export const UsageCard: FC<UsageCardProps> = ({ stats }) => {
|
|
24
19
|
return null;
|
25
20
|
}
|
26
21
|
|
27
|
-
const totalDuration = formatDuration(
|
28
|
-
new Date(stats.started_at),
|
29
|
-
new Date(stats.completed_at),
|
30
|
-
);
|
31
|
-
const usageMetadataStyle = {
|
32
|
-
fontSize: FontSize.smaller,
|
33
|
-
};
|
34
|
-
|
35
22
|
return (
|
36
23
|
<Card>
|
37
|
-
<CardHeader
|
24
|
+
<CardHeader label="Usage" />
|
38
25
|
<CardBody id={kUsageCardBodyId}>
|
39
26
|
<div className={styles.wrapper}>
|
40
|
-
<div className={styles.col1}>
|
41
|
-
<div
|
42
|
-
className={clsx(
|
43
|
-
"text-size-smaller",
|
44
|
-
"text-style-label",
|
45
|
-
"text-style-secondary",
|
46
|
-
)}
|
47
|
-
>
|
48
|
-
Duration
|
49
|
-
</div>
|
50
|
-
<MetaDataView
|
51
|
-
entries={{
|
52
|
-
["Start"]: new Date(stats.started_at).toLocaleString(),
|
53
|
-
["End"]: new Date(stats.completed_at).toLocaleString(),
|
54
|
-
["Duration"]: totalDuration,
|
55
|
-
}}
|
56
|
-
tableOptions="borderless,sm"
|
57
|
-
style={usageMetadataStyle}
|
58
|
-
/>
|
59
|
-
</div>
|
60
|
-
|
61
27
|
<div className={styles.col2}>
|
62
28
|
<ModelTokenTable model_usage={stats.model_usage} />
|
63
29
|
</div>
|
@@ -5,6 +5,8 @@ export const kModelNone = "none/none";
|
|
5
5
|
export const kLogViewSamplesTabId = "samples";
|
6
6
|
export const kLogViewJsonTabId = "json";
|
7
7
|
export const kLogViewInfoTabId = "info";
|
8
|
+
export const kLogViewModelsTabId = "models";
|
9
|
+
export const kLogViewTaskTabId = "task";
|
8
10
|
|
9
11
|
// Sample tab constants
|
10
12
|
export const kSampleMessagesTabId = `messages`;
|
@@ -0,0 +1,17 @@
|
|
1
|
+
export function compareWithNan(a: number, b: number): number {
|
2
|
+
const aIsNaN = Number.isNaN(a);
|
3
|
+
const bIsNaN = Number.isNaN(b);
|
4
|
+
|
5
|
+
if (aIsNaN && bIsNaN) {
|
6
|
+
return 0;
|
7
|
+
}
|
8
|
+
|
9
|
+
if (aIsNaN) {
|
10
|
+
return 1;
|
11
|
+
}
|
12
|
+
if (bIsNaN) {
|
13
|
+
return -1;
|
14
|
+
}
|
15
|
+
|
16
|
+
return a - b;
|
17
|
+
}
|
inspect_ai/agent/_agent.py
CHANGED
@@ -27,13 +27,14 @@ from inspect_ai.model._chat_message import (
|
|
27
27
|
ChatMessageAssistant,
|
28
28
|
)
|
29
29
|
from inspect_ai.model._model_output import ChatCompletionChoice, ModelOutput
|
30
|
+
from inspect_ai.util._limited_conversation import ChatMessageList
|
30
31
|
|
31
32
|
|
32
33
|
class AgentState:
|
33
34
|
"""Agent state."""
|
34
35
|
|
35
36
|
def __init__(self, *, messages: list[ChatMessage]) -> None:
|
36
|
-
self._messages = messages
|
37
|
+
self._messages: list[ChatMessage] = ChatMessageList(messages)
|
37
38
|
self._output: ModelOutput | None = None
|
38
39
|
|
39
40
|
@property
|
@@ -43,8 +44,7 @@ class AgentState:
|
|
43
44
|
|
44
45
|
@messages.setter
|
45
46
|
def messages(self, messages: list[ChatMessage]) -> None:
|
46
|
-
|
47
|
-
self._messages = messages
|
47
|
+
self._messages = ChatMessageList(messages)
|
48
48
|
|
49
49
|
@property
|
50
50
|
def output(self) -> ModelOutput:
|