inspect-ai 0.3.93__py3-none-any.whl → 0.3.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/task/run.py +10 -7
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/local_server.py +51 -21
- inspect_ai/_view/www/dist/assets/index.css +14 -13
- inspect_ai/_view/www/dist/assets/index.js +400 -84
- inspect_ai/_view/www/log-schema.json +375 -0
- inspect_ai/_view/www/src/@types/log.d.ts +90 -12
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/agent/_as_solver.py +3 -1
- inspect_ai/agent/_as_tool.py +6 -4
- inspect_ai/agent/_handoff.py +5 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +6 -1
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +10 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_samples.py +14 -17
- inspect_ai/log/_transcript.py +77 -35
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/model/_call_tools.py +42 -34
- inspect_ai/model/_model.py +45 -40
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/sglang.py +8 -2
- inspect_ai/model/_providers/vllm.py +6 -2
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +5 -22
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_mcp/_mcp.py +6 -5
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/util/__init__.py +4 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +56 -51
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,12 @@
|
|
1
1
|
// This is a special name that signals a group of sandbox events.
|
2
2
|
|
3
|
-
import {
|
3
|
+
import {
|
4
|
+
Events,
|
5
|
+
SpanBeginEvent,
|
6
|
+
SpanEndEvent,
|
7
|
+
StepEvent,
|
8
|
+
} from "../../../../@types/log";
|
9
|
+
import { hasSpans } from "./utils";
|
4
10
|
|
5
11
|
// It will be caught elsewhere and rendered with a pretty name
|
6
12
|
export const kSandboxSignalName = "53787D8A-D3FC-426D-B383-9F880B70E4AA";
|
@@ -54,39 +60,55 @@ const processPendingEvents = (events: Events, filter: boolean): Events => {
|
|
54
60
|
};
|
55
61
|
|
56
62
|
const collapseSampleInit = (events: Events): Events => {
|
57
|
-
//
|
63
|
+
// Don't perform sample init logic if spans are present
|
64
|
+
const hasSpans = events.some((e) => {
|
65
|
+
return e.event === "span_begin" || e.event === "span_end";
|
66
|
+
});
|
67
|
+
if (hasSpans) {
|
68
|
+
return events;
|
69
|
+
}
|
70
|
+
|
71
|
+
// Don't synthesize a sample init step if one already exists
|
58
72
|
const hasInitStep =
|
59
73
|
events.findIndex((e) => {
|
60
74
|
return e.event === "step" && e.name === "init";
|
61
75
|
}) !== -1;
|
76
|
+
if (hasInitStep) {
|
77
|
+
return events;
|
78
|
+
}
|
62
79
|
|
80
|
+
// Find a sample init event
|
63
81
|
const initEventIndex = events.findIndex((e) => {
|
64
82
|
return e.event === "sample_init";
|
65
83
|
});
|
66
84
|
const initEvent = events[initEventIndex];
|
85
|
+
if (!initEvent) {
|
86
|
+
return events;
|
87
|
+
}
|
67
88
|
|
89
|
+
// Splice in sample init step if needed
|
68
90
|
const fixedUp = [...events];
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
}
|
91
|
+
fixedUp.splice(initEventIndex, 0, {
|
92
|
+
timestamp: initEvent.timestamp,
|
93
|
+
event: "step",
|
94
|
+
action: "begin",
|
95
|
+
type: null,
|
96
|
+
name: "sample_init",
|
97
|
+
pending: false,
|
98
|
+
working_start: 0,
|
99
|
+
span_id: initEvent.span_id,
|
100
|
+
});
|
101
|
+
|
102
|
+
fixedUp.splice(initEventIndex + 2, 0, {
|
103
|
+
timestamp: initEvent.timestamp,
|
104
|
+
event: "step",
|
105
|
+
action: "end",
|
106
|
+
type: null,
|
107
|
+
name: "sample_init",
|
108
|
+
pending: false,
|
109
|
+
working_start: 0,
|
110
|
+
span_id: initEvent.span_id,
|
111
|
+
});
|
90
112
|
return fixedUp;
|
91
113
|
};
|
92
114
|
|
@@ -94,12 +116,22 @@ const groupSandboxEvents = (events: Events): Events => {
|
|
94
116
|
const result: Events = [];
|
95
117
|
const pendingSandboxEvents: Events = [];
|
96
118
|
|
119
|
+
const useSpans = hasSpans(events);
|
120
|
+
|
97
121
|
const pushPendingSandboxEvents = () => {
|
98
122
|
const timestamp =
|
99
123
|
pendingSandboxEvents[pendingSandboxEvents.length - 1].timestamp;
|
100
|
-
|
124
|
+
if (useSpans) {
|
125
|
+
result.push(createSpanBegin(kSandboxSignalName, timestamp, null));
|
126
|
+
} else {
|
127
|
+
result.push(createStepEvent(kSandboxSignalName, timestamp, "begin"));
|
128
|
+
}
|
101
129
|
result.push(...pendingSandboxEvents);
|
102
|
-
|
130
|
+
if (useSpans) {
|
131
|
+
result.push(createSpanEnd(kSandboxSignalName, timestamp));
|
132
|
+
} else {
|
133
|
+
result.push(createStepEvent(kSandboxSignalName, timestamp, "end"));
|
134
|
+
}
|
103
135
|
pendingSandboxEvents.length = 0;
|
104
136
|
};
|
105
137
|
|
@@ -139,4 +171,34 @@ const createStepEvent = (
|
|
139
171
|
name,
|
140
172
|
pending: false,
|
141
173
|
working_start: 0,
|
174
|
+
span_id: null,
|
142
175
|
});
|
176
|
+
|
177
|
+
const createSpanBegin = (
|
178
|
+
name: string,
|
179
|
+
timestamp: string,
|
180
|
+
parent_id: string | null,
|
181
|
+
): SpanBeginEvent => {
|
182
|
+
return {
|
183
|
+
name,
|
184
|
+
id: `${name}-begin`,
|
185
|
+
span_id: name,
|
186
|
+
parent_id,
|
187
|
+
timestamp,
|
188
|
+
event: "span_begin",
|
189
|
+
type: null,
|
190
|
+
pending: false,
|
191
|
+
working_start: 0,
|
192
|
+
};
|
193
|
+
};
|
194
|
+
|
195
|
+
const createSpanEnd = (name: string, timestamp: string): SpanEndEvent => {
|
196
|
+
return {
|
197
|
+
id: `${name}-end`,
|
198
|
+
timestamp,
|
199
|
+
event: "span_end",
|
200
|
+
pending: false,
|
201
|
+
working_start: 0,
|
202
|
+
span_id: name,
|
203
|
+
};
|
204
|
+
};
|
@@ -1,14 +1,28 @@
|
|
1
1
|
import { Events } from "../../../../@types/log";
|
2
2
|
import { EventNode, EventType } from "../types";
|
3
|
+
import {
|
4
|
+
ACTION_BEGIN,
|
5
|
+
ET_SPAN_BEGIN,
|
6
|
+
ET_SPAN_END,
|
7
|
+
ET_STEP,
|
8
|
+
hasSpans,
|
9
|
+
} from "./utils";
|
10
|
+
|
11
|
+
type TreeifyFunction = (
|
12
|
+
event: EventType,
|
13
|
+
addNode: (event: EventType) => EventNode,
|
14
|
+
pushStack: (node: EventNode) => void,
|
15
|
+
popStack: () => void,
|
16
|
+
) => void;
|
3
17
|
|
4
|
-
/**
|
5
|
-
* Gathers events into a hierarchy of EventNodes.
|
6
|
-
*/
|
7
18
|
export function treeifyEvents(events: Events, depth: number): EventNode[] {
|
19
|
+
const useSpans = hasSpans(events);
|
20
|
+
const treeFn = useSpans ? treeifyFnSpan : treeifyFnStep;
|
21
|
+
|
8
22
|
const rootNodes: EventNode[] = [];
|
9
23
|
const stack: EventNode[] = [];
|
10
24
|
|
11
|
-
const
|
25
|
+
const addNode = (event: EventType): EventNode => {
|
12
26
|
const node = new EventNode(event, stack.length + depth);
|
13
27
|
if (stack.length > 0) {
|
14
28
|
const parentNode = stack[stack.length - 1];
|
@@ -19,21 +33,219 @@ export function treeifyEvents(events: Events, depth: number): EventNode[] {
|
|
19
33
|
return node;
|
20
34
|
};
|
21
35
|
|
36
|
+
const pushStack = (node: EventNode): void => {
|
37
|
+
stack.push(node);
|
38
|
+
};
|
39
|
+
|
40
|
+
const popStack = (): void => {
|
41
|
+
if (stack.length > 0) {
|
42
|
+
stack.pop();
|
43
|
+
}
|
44
|
+
};
|
45
|
+
|
22
46
|
events.forEach((event) => {
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
47
|
+
treeFn(event, addNode, pushStack, popStack);
|
48
|
+
});
|
49
|
+
|
50
|
+
if (useSpans) {
|
51
|
+
return transformTree(rootNodes);
|
52
|
+
} else {
|
53
|
+
return rootNodes;
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
const treeifyFnStep: TreeifyFunction = (
|
58
|
+
event: EventType,
|
59
|
+
addNode: (event: EventType) => EventNode,
|
60
|
+
pushStack: (node: EventNode) => void,
|
61
|
+
popStack: () => void,
|
62
|
+
): void => {
|
63
|
+
switch (event.event) {
|
64
|
+
case ET_STEP:
|
65
|
+
if (event.action === ACTION_BEGIN) {
|
66
|
+
// Starting a new step
|
67
|
+
const node = addNode(event);
|
68
|
+
pushStack(node);
|
69
|
+
} else {
|
70
|
+
// An ending step
|
71
|
+
popStack();
|
31
72
|
}
|
32
|
-
|
73
|
+
break;
|
74
|
+
case ET_SPAN_BEGIN: {
|
75
|
+
// These shouldn't be here, but throw them away
|
76
|
+
break;
|
77
|
+
}
|
78
|
+
case ET_SPAN_END: {
|
79
|
+
// These shouldn't be here, but throw them away
|
80
|
+
break;
|
81
|
+
}
|
82
|
+
default:
|
33
83
|
// An event
|
34
|
-
|
84
|
+
addNode(event);
|
85
|
+
break;
|
86
|
+
}
|
87
|
+
};
|
88
|
+
|
89
|
+
const treeifyFnSpan: TreeifyFunction = (
|
90
|
+
event: EventType,
|
91
|
+
addNode: (event: EventType) => EventNode,
|
92
|
+
pushStack: (node: EventNode) => void,
|
93
|
+
popStack: () => void,
|
94
|
+
): void => {
|
95
|
+
switch (event.event) {
|
96
|
+
case ET_STEP:
|
97
|
+
// strip steps
|
98
|
+
break;
|
99
|
+
case ET_SPAN_BEGIN: {
|
100
|
+
const node = addNode(event);
|
101
|
+
pushStack(node);
|
102
|
+
break;
|
35
103
|
}
|
36
|
-
|
104
|
+
case ET_SPAN_END: {
|
105
|
+
popStack();
|
106
|
+
break;
|
107
|
+
}
|
108
|
+
default:
|
109
|
+
// An event
|
110
|
+
addNode(event);
|
111
|
+
break;
|
112
|
+
}
|
113
|
+
};
|
37
114
|
|
38
|
-
|
39
|
-
|
115
|
+
type TreeNodeTransformer = {
|
116
|
+
name: string;
|
117
|
+
matches: (node: EventNode) => boolean;
|
118
|
+
process: (node: EventNode) => EventNode;
|
119
|
+
};
|
120
|
+
|
121
|
+
const treeNodeTransformers: TreeNodeTransformer[] = [
|
122
|
+
{
|
123
|
+
name: "unwrap_tools",
|
124
|
+
matches: (node) =>
|
125
|
+
node.event.event === "span_begin" && node.event.type === "tool",
|
126
|
+
process: (node) => elevateChildNode(node, "tool") || node,
|
127
|
+
},
|
128
|
+
{
|
129
|
+
name: "unwrap_subtasks",
|
130
|
+
matches: (node) =>
|
131
|
+
node.event.event === "span_begin" && node.event.type === "subtask",
|
132
|
+
process: (node) => elevateChildNode(node, "subtask") || node,
|
133
|
+
},
|
134
|
+
{
|
135
|
+
name: "unwrap_agent_solver",
|
136
|
+
matches: (node) =>
|
137
|
+
node.event.event === "span_begin" &&
|
138
|
+
node.event["type"] === "solver" &&
|
139
|
+
node.children.length === 2 &&
|
140
|
+
node.children[0].event.event === "span_begin" &&
|
141
|
+
node.children[0].event.type === "agent" &&
|
142
|
+
node.children[1].event.event === "state",
|
143
|
+
|
144
|
+
process: (node) => skipFirstChildNode(node),
|
145
|
+
},
|
146
|
+
{
|
147
|
+
name: "unwrap_agent_solver w/store",
|
148
|
+
matches: (node) =>
|
149
|
+
node.event.event === "span_begin" &&
|
150
|
+
node.event["type"] === "solver" &&
|
151
|
+
node.children.length === 3 &&
|
152
|
+
node.children[0].event.event === "span_begin" &&
|
153
|
+
node.children[0].event.type === "agent" &&
|
154
|
+
node.children[1].event.event === "state" &&
|
155
|
+
node.children[2].event.event === "store",
|
156
|
+
process: (node) => skipFirstChildNode(node),
|
157
|
+
},
|
158
|
+
{
|
159
|
+
name: "unwrap_handoff",
|
160
|
+
matches: (node) =>
|
161
|
+
node.event.event === "span_begin" &&
|
162
|
+
node.event["type"] === "handoff" &&
|
163
|
+
node.children.length === 2 &&
|
164
|
+
node.children[0].event.event === "tool" &&
|
165
|
+
node.children[1].event.event === "store" &&
|
166
|
+
node.children[0].children.length === 2 &&
|
167
|
+
node.children[0].children[0].event.event === "span_begin" &&
|
168
|
+
node.children[0].children[0].event.type === "agent",
|
169
|
+
process: (node) => skipThisNode(node),
|
170
|
+
},
|
171
|
+
];
|
172
|
+
|
173
|
+
const transformTree = (roots: EventNode[]): EventNode[] => {
|
174
|
+
const visitNode = (node: EventNode): EventNode => {
|
175
|
+
let processedNode = node;
|
176
|
+
|
177
|
+
// Visit children (depth first)
|
178
|
+
processedNode.children = processedNode.children.map(visitNode);
|
179
|
+
|
180
|
+
// Apply any visitors to this node
|
181
|
+
for (const transformer of treeNodeTransformers) {
|
182
|
+
if (transformer.matches(processedNode)) {
|
183
|
+
processedNode = transformer.process(processedNode);
|
184
|
+
// Only apply the first matching transformer
|
185
|
+
break;
|
186
|
+
}
|
187
|
+
}
|
188
|
+
return processedNode;
|
189
|
+
};
|
190
|
+
|
191
|
+
return roots.map(visitNode);
|
192
|
+
};
|
193
|
+
|
194
|
+
/**
|
195
|
+
* Process a span node by elevating a specific child node type and moving its siblings as children
|
196
|
+
* @template T - Type of the event (either ToolEvent or SubtaskEvent)
|
197
|
+
*/
|
198
|
+
const elevateChildNode = (
|
199
|
+
node: EventNode,
|
200
|
+
childEventType: "tool" | "subtask",
|
201
|
+
): EventNode | null => {
|
202
|
+
// Find the specific event child
|
203
|
+
const targetIndex = node.children.findIndex(
|
204
|
+
(child) => child.event.event === childEventType,
|
205
|
+
);
|
206
|
+
|
207
|
+
if (targetIndex === -1) {
|
208
|
+
console.log(
|
209
|
+
`No ${childEventType} event found in a span, this is very unexpected.`,
|
210
|
+
);
|
211
|
+
return null;
|
212
|
+
}
|
213
|
+
|
214
|
+
// Get the target node and set its depth
|
215
|
+
const targetNode = { ...node.children[targetIndex] };
|
216
|
+
const remainingChildren = node.children.filter((_, i) => i !== targetIndex);
|
217
|
+
|
218
|
+
// Process the remaining children
|
219
|
+
targetNode.depth = node.depth;
|
220
|
+
targetNode.children = reduceDepth(remainingChildren);
|
221
|
+
|
222
|
+
// No need to update the event itself (events have been deprecated
|
223
|
+
// and more importantly we drive children / transcripts using the tree structure itself
|
224
|
+
// and nodes rather than the event.events itself)
|
225
|
+
return targetNode;
|
226
|
+
};
|
227
|
+
|
228
|
+
const skipFirstChildNode = (node: EventNode): EventNode => {
|
229
|
+
const agentSpan = node.children.splice(0, 1)[0];
|
230
|
+
node.children.unshift(...reduceDepth(agentSpan.children));
|
231
|
+
return node;
|
232
|
+
};
|
233
|
+
|
234
|
+
const skipThisNode = (node: EventNode): EventNode => {
|
235
|
+
const newNode = { ...node.children[0] };
|
236
|
+
newNode.depth = node.depth;
|
237
|
+
newNode.children = reduceDepth(newNode.children[0].children, 2);
|
238
|
+
return newNode;
|
239
|
+
};
|
240
|
+
|
241
|
+
// Reduce the depth of the children by 1
|
242
|
+
// This is used when we hoist a child node to the parent
|
243
|
+
const reduceDepth = (nodes: EventNode[], depth: number = 1): EventNode[] => {
|
244
|
+
return nodes.map((node) => {
|
245
|
+
if (node.children.length > 0) {
|
246
|
+
node.children = reduceDepth(node.children, 1);
|
247
|
+
}
|
248
|
+
node.depth = node.depth - depth;
|
249
|
+
return node;
|
250
|
+
});
|
251
|
+
};
|
@@ -0,0 +1,11 @@
|
|
1
|
+
import { Events } from "../../../../@types/log";
|
2
|
+
|
3
|
+
export const ET_STEP = "step";
|
4
|
+
export const ACTION_BEGIN = "begin";
|
5
|
+
|
6
|
+
export const ET_SPAN_BEGIN = "span_begin";
|
7
|
+
export const ET_SPAN_END = "span_end";
|
8
|
+
|
9
|
+
export const hasSpans = (events: Events): boolean => {
|
10
|
+
return events.some((event) => event.event === ET_SPAN_BEGIN);
|
11
|
+
};
|
@@ -10,6 +10,8 @@ import {
|
|
10
10
|
SampleLimitEvent,
|
11
11
|
SandboxEvent,
|
12
12
|
ScoreEvent,
|
13
|
+
SpanBeginEvent,
|
14
|
+
SpanEndEvent,
|
13
15
|
StateEvent,
|
14
16
|
StepEvent,
|
15
17
|
StoreEvent,
|
@@ -39,7 +41,9 @@ export type EventType =
|
|
39
41
|
| InputEvent
|
40
42
|
| ErrorEvent
|
41
43
|
| ApprovalEvent
|
42
|
-
| SandboxEvent
|
44
|
+
| SandboxEvent
|
45
|
+
| SpanBeginEvent
|
46
|
+
| SpanEndEvent;
|
43
47
|
|
44
48
|
export class EventNode {
|
45
49
|
event: EventType;
|
inspect_ai/agent/_as_solver.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
from typing import TYPE_CHECKING, Any
|
4
4
|
|
5
5
|
from inspect_ai.util._limit import Limit, apply_limits
|
6
|
+
from inspect_ai.util._span import span
|
6
7
|
|
7
8
|
if TYPE_CHECKING:
|
8
9
|
from inspect_ai.solver._solver import Solver
|
@@ -61,7 +62,8 @@ def as_solver(agent: Agent, limits: list[Limit] = [], **agent_kwargs: Any) -> So
|
|
61
62
|
try:
|
62
63
|
# run the agent with limits
|
63
64
|
with apply_limits(limits):
|
64
|
-
|
65
|
+
async with span(name=agent_name, type="agent"):
|
66
|
+
agent_state = await agent(agent_state, **agent_kwargs)
|
65
67
|
# if an exception occurs, we still want to update the TaskState with the
|
66
68
|
# AgentState's messages + output so that it appears in the log and is scored
|
67
69
|
finally:
|
inspect_ai/agent/_as_tool.py
CHANGED
@@ -11,6 +11,7 @@ from inspect_ai.tool._tool_def import ToolDef, validate_tool_parameters
|
|
11
11
|
from inspect_ai.tool._tool_info import ToolInfo, parse_tool_info
|
12
12
|
from inspect_ai.tool._tool_params import ToolParam
|
13
13
|
from inspect_ai.util._limit import Limit, apply_limits
|
14
|
+
from inspect_ai.util._span import span
|
14
15
|
|
15
16
|
from ._agent import AGENT_DESCRIPTION, Agent, AgentState
|
16
17
|
|
@@ -49,13 +50,17 @@ def as_tool(
|
|
49
50
|
"Agent passed to as_tool was not created by an @agent decorated function"
|
50
51
|
)
|
51
52
|
|
53
|
+
# get tool_info
|
54
|
+
tool_info = agent_tool_info(agent, description, **agent_kwargs)
|
55
|
+
|
52
56
|
async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
|
53
57
|
# prepare state
|
54
58
|
state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
|
55
59
|
|
56
60
|
# run the agent with limits
|
57
61
|
with apply_limits(limits):
|
58
|
-
|
62
|
+
async with span(name=tool_info.name, type="agent"):
|
63
|
+
state = await agent(state, *args, **(agent_kwargs | kwargs))
|
59
64
|
|
60
65
|
# find assistant message to read content from (prefer output)
|
61
66
|
if not state.output.empty:
|
@@ -67,9 +72,6 @@ def as_tool(
|
|
67
72
|
else:
|
68
73
|
return ""
|
69
74
|
|
70
|
-
# get tool_info
|
71
|
-
tool_info = agent_tool_info(agent, description, **agent_kwargs)
|
72
|
-
|
73
75
|
# add "input" param
|
74
76
|
tool_info.parameters.properties = {
|
75
77
|
"input": ToolParam(type="string", description="Input message.")
|
inspect_ai/agent/_handoff.py
CHANGED
@@ -57,7 +57,9 @@ def handoff(
|
|
57
57
|
tool_info = agent_tool_info(agent, description, **agent_kwargs)
|
58
58
|
|
59
59
|
# AgentTool calls will be intercepted by execute_tools
|
60
|
-
agent_tool = AgentTool(
|
60
|
+
agent_tool = AgentTool(
|
61
|
+
agent, tool_info.name, input_filter, output_filter, limits, **agent_kwargs
|
62
|
+
)
|
61
63
|
tool_name = tool_name or f"transfer_to_{tool_info.name}"
|
62
64
|
set_registry_info(agent_tool, RegistryInfo(type="tool", name=tool_name))
|
63
65
|
set_tool_description(
|
@@ -75,12 +77,14 @@ class AgentTool(Tool):
|
|
75
77
|
def __init__(
|
76
78
|
self,
|
77
79
|
agent: Agent,
|
80
|
+
name: str,
|
78
81
|
input_filter: MessageFilter | None = None,
|
79
82
|
output_filter: MessageFilter | None = None,
|
80
83
|
limits: list[Limit] = [],
|
81
84
|
**kwargs: Any,
|
82
85
|
):
|
83
86
|
self.agent = agent
|
87
|
+
self.name = name
|
84
88
|
self.input_filter = input_filter
|
85
89
|
self.output_filter = output_filter
|
86
90
|
self.limits = limits
|
inspect_ai/agent/_react.py
CHANGED
@@ -195,9 +195,10 @@ def react(
|
|
195
195
|
answer = submission(messages)
|
196
196
|
if answer is not None:
|
197
197
|
# set the output to the answer for scoring
|
198
|
-
|
199
|
-
|
200
|
-
|
198
|
+
if submit.answer_only:
|
199
|
+
state.output.completion = answer
|
200
|
+
else:
|
201
|
+
state.output.completion = f"{state.output.completion}{submit.answer_delimiter}{answer}".strip()
|
201
202
|
|
202
203
|
# exit if we are at max_attempts
|
203
204
|
attempt_count += 1
|
inspect_ai/agent/_run.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
from copy import copy
|
2
2
|
from typing import Any
|
3
3
|
|
4
|
+
from inspect_ai._util.registry import registry_unqualified_name
|
4
5
|
from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
|
5
6
|
from inspect_ai.util._limit import Limit, apply_limits
|
7
|
+
from inspect_ai.util._span import span
|
6
8
|
|
7
9
|
from ._agent import Agent, AgentState
|
8
10
|
|
@@ -52,4 +54,7 @@ async def run(
|
|
52
54
|
|
53
55
|
# run the agent with limits
|
54
56
|
with apply_limits(limits):
|
55
|
-
|
57
|
+
# run the agent
|
58
|
+
agent_name = registry_unqualified_name(agent)
|
59
|
+
async with span(name=agent_name, type="agent"):
|
60
|
+
return await agent(state, **agent_kwargs)
|
inspect_ai/agent/_types.py
CHANGED
@@ -96,3 +96,12 @@ class AgentSubmit(NamedTuple):
|
|
96
96
|
|
97
97
|
The tool should return the `answer` provided to it for scoring.
|
98
98
|
"""
|
99
|
+
|
100
|
+
answer_only: bool = False
|
101
|
+
"""Set the completion to only the answer provided by the submit tool.
|
102
|
+
|
103
|
+
By default, the answer is appended (with `answer_delimiter`) to whatever
|
104
|
+
other content the model generated along with the call to `submit()`."""
|
105
|
+
|
106
|
+
answer_delimiter: str = "\n\n"
|
107
|
+
"""Delimiter used when appending submit tool answer to other content the model generated along with the call to `submit()`."""
|
inspect_ai/dataset/_dataset.py
CHANGED
@@ -16,6 +16,7 @@ from typing import (
|
|
16
16
|
from pydantic import BaseModel, Field, ValidationError
|
17
17
|
from typing_extensions import override
|
18
18
|
|
19
|
+
from inspect_ai._util.answer import answer_character, answer_index
|
19
20
|
from inspect_ai.model import ChatMessage
|
20
21
|
from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
|
21
22
|
from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
|
@@ -328,7 +329,9 @@ class MemoryDataset(Dataset):
|
|
328
329
|
shuffled_choices = [sample.choices[i] for i in positions]
|
329
330
|
|
330
331
|
# Map of original position / target letter
|
331
|
-
position_map = {
|
332
|
+
position_map = {
|
333
|
+
i: answer_character(new_i) for new_i, i in enumerate(positions)
|
334
|
+
}
|
332
335
|
|
333
336
|
# Update to the shuffled choices and target
|
334
337
|
sample.choices = shuffled_choices
|
@@ -338,9 +341,9 @@ class MemoryDataset(Dataset):
|
|
338
341
|
self, target: str | list[str], position_map: dict[int, str]
|
339
342
|
) -> str | list[str]:
|
340
343
|
if isinstance(target, list):
|
341
|
-
return [position_map[
|
344
|
+
return [position_map[answer_index(t)] for t in target]
|
342
345
|
else:
|
343
|
-
return position_map[
|
346
|
+
return position_map[answer_index(target)]
|
344
347
|
|
345
348
|
@override
|
346
349
|
def sort(
|
inspect_ai/log/__init__.py
CHANGED
@@ -48,6 +48,8 @@ from ._transcript import (
|
|
48
48
|
SampleLimitEvent,
|
49
49
|
SandboxEvent,
|
50
50
|
ScoreEvent,
|
51
|
+
SpanBeginEvent,
|
52
|
+
SpanEndEvent,
|
51
53
|
StateEvent,
|
52
54
|
StepEvent,
|
53
55
|
StoreEvent,
|
@@ -56,6 +58,7 @@ from ._transcript import (
|
|
56
58
|
Transcript,
|
57
59
|
transcript,
|
58
60
|
)
|
61
|
+
from ._tree import EventNode, EventTree, SpanNode, event_sequence, event_tree
|
59
62
|
|
60
63
|
__all__ = [
|
61
64
|
"EvalConfig",
|
@@ -92,6 +95,8 @@ __all__ = [
|
|
92
95
|
"SampleLimitEvent",
|
93
96
|
"SandboxEvent",
|
94
97
|
"ScoreEvent",
|
98
|
+
"SpanBeginEvent",
|
99
|
+
"SpanEndEvent",
|
95
100
|
"StateEvent",
|
96
101
|
"StepEvent",
|
97
102
|
"StoreEvent",
|
@@ -111,4 +116,9 @@ __all__ = [
|
|
111
116
|
"write_log_dir_manifest",
|
112
117
|
"retryable_eval_logs",
|
113
118
|
"bundle_log_dir",
|
119
|
+
"event_tree",
|
120
|
+
"event_sequence",
|
121
|
+
"EventTree",
|
122
|
+
"EventNode",
|
123
|
+
"SpanNode",
|
114
124
|
]
|
inspect_ai/log/_convert.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2
2
|
from typing import Literal
|
3
3
|
|
4
4
|
from inspect_ai._util.error import PrerequisiteError
|
5
|
-
from inspect_ai._util.file import
|
5
|
+
from inspect_ai._util.file import exists, filesystem
|
6
6
|
from inspect_ai.log._file import (
|
7
7
|
log_files_from_ls,
|
8
8
|
read_eval_log,
|
@@ -66,14 +66,9 @@ def convert_eval_logs(
|
|
66
66
|
"Output file {output_file} already exists (use --overwrite to overwrite existing files)"
|
67
67
|
)
|
68
68
|
|
69
|
-
#
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
# otherwise do a full read/write
|
74
|
-
else:
|
75
|
-
log = read_eval_log(input_file)
|
76
|
-
write_eval_log(log, output_file)
|
69
|
+
# do a full read/write (normalizes deprecated constructs and adds sample summaries)
|
70
|
+
log = read_eval_log(input_file)
|
71
|
+
write_eval_log(log, output_file)
|
77
72
|
|
78
73
|
if fs.info(path).type == "file":
|
79
74
|
convert_file(path)
|