inspect-ai 0.3.93__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. inspect_ai/_display/textual/widgets/samples.py +3 -3
  2. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  3. inspect_ai/_eval/task/run.py +10 -7
  4. inspect_ai/_util/answer.py +26 -0
  5. inspect_ai/_util/constants.py +0 -1
  6. inspect_ai/_util/local_server.py +51 -21
  7. inspect_ai/_view/www/dist/assets/index.css +14 -13
  8. inspect_ai/_view/www/dist/assets/index.js +400 -84
  9. inspect_ai/_view/www/log-schema.json +375 -0
  10. inspect_ai/_view/www/src/@types/log.d.ts +90 -12
  11. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  12. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  13. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  14. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  15. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  16. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  17. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  18. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  19. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  20. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  21. inspect_ai/agent/_as_solver.py +3 -1
  22. inspect_ai/agent/_as_tool.py +6 -4
  23. inspect_ai/agent/_handoff.py +5 -1
  24. inspect_ai/agent/_react.py +4 -3
  25. inspect_ai/agent/_run.py +6 -1
  26. inspect_ai/agent/_types.py +9 -0
  27. inspect_ai/dataset/_dataset.py +6 -3
  28. inspect_ai/log/__init__.py +10 -0
  29. inspect_ai/log/_convert.py +4 -9
  30. inspect_ai/log/_samples.py +14 -17
  31. inspect_ai/log/_transcript.py +77 -35
  32. inspect_ai/log/_tree.py +118 -0
  33. inspect_ai/model/_call_tools.py +42 -34
  34. inspect_ai/model/_model.py +45 -40
  35. inspect_ai/model/_providers/hf.py +27 -1
  36. inspect_ai/model/_providers/sglang.py +8 -2
  37. inspect_ai/model/_providers/vllm.py +6 -2
  38. inspect_ai/scorer/_choice.py +1 -2
  39. inspect_ai/solver/_chain.py +1 -1
  40. inspect_ai/solver/_fork.py +1 -1
  41. inspect_ai/solver/_multiple_choice.py +5 -22
  42. inspect_ai/solver/_plan.py +2 -2
  43. inspect_ai/solver/_transcript.py +6 -7
  44. inspect_ai/tool/_mcp/_mcp.py +6 -5
  45. inspect_ai/tool/_tools/_execute.py +4 -1
  46. inspect_ai/util/__init__.py +4 -0
  47. inspect_ai/util/_anyio.py +11 -0
  48. inspect_ai/util/_collect.py +50 -0
  49. inspect_ai/util/_span.py +58 -0
  50. inspect_ai/util/_subtask.py +27 -42
  51. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  52. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +56 -51
  53. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  54. inspect_ai/_display/core/group.py +0 -79
  55. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  56. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  57. {inspect_ai-0.3.93.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,12 @@
1
1
  // This is a special name that signals a group of sandbox events.
2
2
 
3
- import { Events, StepEvent } from "../../../../@types/log";
3
+ import {
4
+ Events,
5
+ SpanBeginEvent,
6
+ SpanEndEvent,
7
+ StepEvent,
8
+ } from "../../../../@types/log";
9
+ import { hasSpans } from "./utils";
4
10
 
5
11
  // It will be caught elsewhere and rendered with a pretty name
6
12
  export const kSandboxSignalName = "53787D8A-D3FC-426D-B383-9F880B70E4AA";
@@ -54,39 +60,55 @@ const processPendingEvents = (events: Events, filter: boolean): Events => {
54
60
  };
55
61
 
56
62
  const collapseSampleInit = (events: Events): Events => {
57
- // See if the events have an init step
63
+ // Don't perform sample init logic if spans are present
64
+ const hasSpans = events.some((e) => {
65
+ return e.event === "span_begin" || e.event === "span_end";
66
+ });
67
+ if (hasSpans) {
68
+ return events;
69
+ }
70
+
71
+ // Don't synthesize a sample init step if one already exists
58
72
  const hasInitStep =
59
73
  events.findIndex((e) => {
60
74
  return e.event === "step" && e.name === "init";
61
75
  }) !== -1;
76
+ if (hasInitStep) {
77
+ return events;
78
+ }
62
79
 
80
+ // Find a sample init event
63
81
  const initEventIndex = events.findIndex((e) => {
64
82
  return e.event === "sample_init";
65
83
  });
66
84
  const initEvent = events[initEventIndex];
85
+ if (!initEvent) {
86
+ return events;
87
+ }
67
88
 
89
+ // Splice in sample init step if needed
68
90
  const fixedUp = [...events];
69
- if (!hasInitStep && initEvent) {
70
- fixedUp.splice(initEventIndex, 0, {
71
- timestamp: initEvent.timestamp,
72
- event: "step",
73
- action: "begin",
74
- type: null,
75
- name: "sample_init",
76
- pending: false,
77
- working_start: 0,
78
- });
79
-
80
- fixedUp.splice(initEventIndex + 2, 0, {
81
- timestamp: initEvent.timestamp,
82
- event: "step",
83
- action: "end",
84
- type: null,
85
- name: "sample_init",
86
- pending: false,
87
- working_start: 0,
88
- });
89
- }
91
+ fixedUp.splice(initEventIndex, 0, {
92
+ timestamp: initEvent.timestamp,
93
+ event: "step",
94
+ action: "begin",
95
+ type: null,
96
+ name: "sample_init",
97
+ pending: false,
98
+ working_start: 0,
99
+ span_id: initEvent.span_id,
100
+ });
101
+
102
+ fixedUp.splice(initEventIndex + 2, 0, {
103
+ timestamp: initEvent.timestamp,
104
+ event: "step",
105
+ action: "end",
106
+ type: null,
107
+ name: "sample_init",
108
+ pending: false,
109
+ working_start: 0,
110
+ span_id: initEvent.span_id,
111
+ });
90
112
  return fixedUp;
91
113
  };
92
114
 
@@ -94,12 +116,22 @@ const groupSandboxEvents = (events: Events): Events => {
94
116
  const result: Events = [];
95
117
  const pendingSandboxEvents: Events = [];
96
118
 
119
+ const useSpans = hasSpans(events);
120
+
97
121
  const pushPendingSandboxEvents = () => {
98
122
  const timestamp =
99
123
  pendingSandboxEvents[pendingSandboxEvents.length - 1].timestamp;
100
- result.push(createStepEvent(kSandboxSignalName, timestamp, "begin"));
124
+ if (useSpans) {
125
+ result.push(createSpanBegin(kSandboxSignalName, timestamp, null));
126
+ } else {
127
+ result.push(createStepEvent(kSandboxSignalName, timestamp, "begin"));
128
+ }
101
129
  result.push(...pendingSandboxEvents);
102
- result.push(createStepEvent(kSandboxSignalName, timestamp, "end"));
130
+ if (useSpans) {
131
+ result.push(createSpanEnd(kSandboxSignalName, timestamp));
132
+ } else {
133
+ result.push(createStepEvent(kSandboxSignalName, timestamp, "end"));
134
+ }
103
135
  pendingSandboxEvents.length = 0;
104
136
  };
105
137
 
@@ -139,4 +171,34 @@ const createStepEvent = (
139
171
  name,
140
172
  pending: false,
141
173
  working_start: 0,
174
+ span_id: null,
142
175
  });
176
+
177
+ const createSpanBegin = (
178
+ name: string,
179
+ timestamp: string,
180
+ parent_id: string | null,
181
+ ): SpanBeginEvent => {
182
+ return {
183
+ name,
184
+ id: `${name}-begin`,
185
+ span_id: name,
186
+ parent_id,
187
+ timestamp,
188
+ event: "span_begin",
189
+ type: null,
190
+ pending: false,
191
+ working_start: 0,
192
+ };
193
+ };
194
+
195
+ const createSpanEnd = (name: string, timestamp: string): SpanEndEvent => {
196
+ return {
197
+ id: `${name}-end`,
198
+ timestamp,
199
+ event: "span_end",
200
+ pending: false,
201
+ working_start: 0,
202
+ span_id: name,
203
+ };
204
+ };
@@ -1,14 +1,28 @@
1
1
  import { Events } from "../../../../@types/log";
2
2
  import { EventNode, EventType } from "../types";
3
+ import {
4
+ ACTION_BEGIN,
5
+ ET_SPAN_BEGIN,
6
+ ET_SPAN_END,
7
+ ET_STEP,
8
+ hasSpans,
9
+ } from "./utils";
10
+
11
+ type TreeifyFunction = (
12
+ event: EventType,
13
+ addNode: (event: EventType) => EventNode,
14
+ pushStack: (node: EventNode) => void,
15
+ popStack: () => void,
16
+ ) => void;
3
17
 
4
- /**
5
- * Gathers events into a hierarchy of EventNodes.
6
- */
7
18
  export function treeifyEvents(events: Events, depth: number): EventNode[] {
19
+ const useSpans = hasSpans(events);
20
+ const treeFn = useSpans ? treeifyFnSpan : treeifyFnStep;
21
+
8
22
  const rootNodes: EventNode[] = [];
9
23
  const stack: EventNode[] = [];
10
24
 
11
- const pushNode = (event: EventType): EventNode => {
25
+ const addNode = (event: EventType): EventNode => {
12
26
  const node = new EventNode(event, stack.length + depth);
13
27
  if (stack.length > 0) {
14
28
  const parentNode = stack[stack.length - 1];
@@ -19,21 +33,219 @@ export function treeifyEvents(events: Events, depth: number): EventNode[] {
19
33
  return node;
20
34
  };
21
35
 
36
+ const pushStack = (node: EventNode): void => {
37
+ stack.push(node);
38
+ };
39
+
40
+ const popStack = (): void => {
41
+ if (stack.length > 0) {
42
+ stack.pop();
43
+ }
44
+ };
45
+
22
46
  events.forEach((event) => {
23
- if (event.event === "step" && event.action === "begin") {
24
- // Starting a new step
25
- const node = pushNode(event);
26
- stack.push(node);
27
- } else if (event.event === "step" && event.action === "end") {
28
- // An ending step
29
- if (stack.length > 0) {
30
- stack.pop();
47
+ treeFn(event, addNode, pushStack, popStack);
48
+ });
49
+
50
+ if (useSpans) {
51
+ return transformTree(rootNodes);
52
+ } else {
53
+ return rootNodes;
54
+ }
55
+ }
56
+
57
+ const treeifyFnStep: TreeifyFunction = (
58
+ event: EventType,
59
+ addNode: (event: EventType) => EventNode,
60
+ pushStack: (node: EventNode) => void,
61
+ popStack: () => void,
62
+ ): void => {
63
+ switch (event.event) {
64
+ case ET_STEP:
65
+ if (event.action === ACTION_BEGIN) {
66
+ // Starting a new step
67
+ const node = addNode(event);
68
+ pushStack(node);
69
+ } else {
70
+ // An ending step
71
+ popStack();
31
72
  }
32
- } else {
73
+ break;
74
+ case ET_SPAN_BEGIN: {
75
+ // These shouldn't be here, but throw them away
76
+ break;
77
+ }
78
+ case ET_SPAN_END: {
79
+ // These shouldn't be here, but throw them away
80
+ break;
81
+ }
82
+ default:
33
83
  // An event
34
- pushNode(event);
84
+ addNode(event);
85
+ break;
86
+ }
87
+ };
88
+
89
+ const treeifyFnSpan: TreeifyFunction = (
90
+ event: EventType,
91
+ addNode: (event: EventType) => EventNode,
92
+ pushStack: (node: EventNode) => void,
93
+ popStack: () => void,
94
+ ): void => {
95
+ switch (event.event) {
96
+ case ET_STEP:
97
+ // strip steps
98
+ break;
99
+ case ET_SPAN_BEGIN: {
100
+ const node = addNode(event);
101
+ pushStack(node);
102
+ break;
35
103
  }
36
- });
104
+ case ET_SPAN_END: {
105
+ popStack();
106
+ break;
107
+ }
108
+ default:
109
+ // An event
110
+ addNode(event);
111
+ break;
112
+ }
113
+ };
37
114
 
38
- return rootNodes;
39
- }
115
+ type TreeNodeTransformer = {
116
+ name: string;
117
+ matches: (node: EventNode) => boolean;
118
+ process: (node: EventNode) => EventNode;
119
+ };
120
+
121
+ const treeNodeTransformers: TreeNodeTransformer[] = [
122
+ {
123
+ name: "unwrap_tools",
124
+ matches: (node) =>
125
+ node.event.event === "span_begin" && node.event.type === "tool",
126
+ process: (node) => elevateChildNode(node, "tool") || node,
127
+ },
128
+ {
129
+ name: "unwrap_subtasks",
130
+ matches: (node) =>
131
+ node.event.event === "span_begin" && node.event.type === "subtask",
132
+ process: (node) => elevateChildNode(node, "subtask") || node,
133
+ },
134
+ {
135
+ name: "unwrap_agent_solver",
136
+ matches: (node) =>
137
+ node.event.event === "span_begin" &&
138
+ node.event["type"] === "solver" &&
139
+ node.children.length === 2 &&
140
+ node.children[0].event.event === "span_begin" &&
141
+ node.children[0].event.type === "agent" &&
142
+ node.children[1].event.event === "state",
143
+
144
+ process: (node) => skipFirstChildNode(node),
145
+ },
146
+ {
147
+ name: "unwrap_agent_solver w/store",
148
+ matches: (node) =>
149
+ node.event.event === "span_begin" &&
150
+ node.event["type"] === "solver" &&
151
+ node.children.length === 3 &&
152
+ node.children[0].event.event === "span_begin" &&
153
+ node.children[0].event.type === "agent" &&
154
+ node.children[1].event.event === "state" &&
155
+ node.children[2].event.event === "store",
156
+ process: (node) => skipFirstChildNode(node),
157
+ },
158
+ {
159
+ name: "unwrap_handoff",
160
+ matches: (node) =>
161
+ node.event.event === "span_begin" &&
162
+ node.event["type"] === "handoff" &&
163
+ node.children.length === 2 &&
164
+ node.children[0].event.event === "tool" &&
165
+ node.children[1].event.event === "store" &&
166
+ node.children[0].children.length === 2 &&
167
+ node.children[0].children[0].event.event === "span_begin" &&
168
+ node.children[0].children[0].event.type === "agent",
169
+ process: (node) => skipThisNode(node),
170
+ },
171
+ ];
172
+
173
+ const transformTree = (roots: EventNode[]): EventNode[] => {
174
+ const visitNode = (node: EventNode): EventNode => {
175
+ let processedNode = node;
176
+
177
+ // Visit children (depth first)
178
+ processedNode.children = processedNode.children.map(visitNode);
179
+
180
+ // Apply any visitors to this node
181
+ for (const transformer of treeNodeTransformers) {
182
+ if (transformer.matches(processedNode)) {
183
+ processedNode = transformer.process(processedNode);
184
+ // Only apply the first matching transformer
185
+ break;
186
+ }
187
+ }
188
+ return processedNode;
189
+ };
190
+
191
+ return roots.map(visitNode);
192
+ };
193
+
194
+ /**
195
+ * Process a span node by elevating a specific child node type and moving its siblings as children
196
+ * @param childEventType - The child event type to elevate (either "tool" or "subtask")
197
+ */
198
+ const elevateChildNode = (
199
+ node: EventNode,
200
+ childEventType: "tool" | "subtask",
201
+ ): EventNode | null => {
202
+ // Find the specific event child
203
+ const targetIndex = node.children.findIndex(
204
+ (child) => child.event.event === childEventType,
205
+ );
206
+
207
+ if (targetIndex === -1) {
208
+ console.log(
209
+ `No ${childEventType} event found in a span, this is very unexpected.`,
210
+ );
211
+ return null;
212
+ }
213
+
214
+ // Get the target node and set its depth
215
+ const targetNode = { ...node.children[targetIndex] };
216
+ const remainingChildren = node.children.filter((_, i) => i !== targetIndex);
217
+
218
+ // Process the remaining children
219
+ targetNode.depth = node.depth;
220
+ targetNode.children = reduceDepth(remainingChildren);
221
+
222
+ // No need to update the event itself (events have been deprecated
223
+ // and more importantly we drive children / transcripts using the tree structure itself
224
+ // and nodes rather than the event.events itself)
225
+ return targetNode;
226
+ };
227
+
228
+ const skipFirstChildNode = (node: EventNode): EventNode => {
229
+ const agentSpan = node.children.splice(0, 1)[0];
230
+ node.children.unshift(...reduceDepth(agentSpan.children));
231
+ return node;
232
+ };
233
+
234
+ const skipThisNode = (node: EventNode): EventNode => {
235
+ const newNode = { ...node.children[0] };
236
+ newNode.depth = node.depth;
237
+ newNode.children = reduceDepth(newNode.children[0].children, 2);
238
+ return newNode;
239
+ };
240
+
241
+ // Reduce the depth of the given nodes by `depth` (defaults to 1)
242
+ // This is used when we hoist a child node to the parent
243
+ const reduceDepth = (nodes: EventNode[], depth: number = 1): EventNode[] => {
244
+ return nodes.map((node) => {
245
+ if (node.children.length > 0) {
246
+ node.children = reduceDepth(node.children, 1);
247
+ }
248
+ node.depth = node.depth - depth;
249
+ return node;
250
+ });
251
+ };
@@ -0,0 +1,11 @@
1
+ import { Events } from "../../../../@types/log";
2
+
3
+ export const ET_STEP = "step";
4
+ export const ACTION_BEGIN = "begin";
5
+
6
+ export const ET_SPAN_BEGIN = "span_begin";
7
+ export const ET_SPAN_END = "span_end";
8
+
9
+ export const hasSpans = (events: Events): boolean => {
10
+ return events.some((event) => event.event === ET_SPAN_BEGIN);
11
+ };
@@ -10,6 +10,8 @@ import {
10
10
  SampleLimitEvent,
11
11
  SandboxEvent,
12
12
  ScoreEvent,
13
+ SpanBeginEvent,
14
+ SpanEndEvent,
13
15
  StateEvent,
14
16
  StepEvent,
15
17
  StoreEvent,
@@ -39,7 +41,9 @@ export type EventType =
39
41
  | InputEvent
40
42
  | ErrorEvent
41
43
  | ApprovalEvent
42
- | SandboxEvent;
44
+ | SandboxEvent
45
+ | SpanBeginEvent
46
+ | SpanEndEvent;
43
47
 
44
48
  export class EventNode {
45
49
  event: EventType;
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  from typing import TYPE_CHECKING, Any
4
4
 
5
5
  from inspect_ai.util._limit import Limit, apply_limits
6
+ from inspect_ai.util._span import span
6
7
 
7
8
  if TYPE_CHECKING:
8
9
  from inspect_ai.solver._solver import Solver
@@ -61,7 +62,8 @@ def as_solver(agent: Agent, limits: list[Limit] = [], **agent_kwargs: Any) -> So
61
62
  try:
62
63
  # run the agent with limits
63
64
  with apply_limits(limits):
64
- agent_state = await agent(agent_state, **agent_kwargs)
65
+ async with span(name=agent_name, type="agent"):
66
+ agent_state = await agent(agent_state, **agent_kwargs)
65
67
  # if an exception occurs, we still want to update the TaskState with the
66
68
  # AgentState's messages + output so that it appears in the log and is scored
67
69
  finally:
@@ -11,6 +11,7 @@ from inspect_ai.tool._tool_def import ToolDef, validate_tool_parameters
11
11
  from inspect_ai.tool._tool_info import ToolInfo, parse_tool_info
12
12
  from inspect_ai.tool._tool_params import ToolParam
13
13
  from inspect_ai.util._limit import Limit, apply_limits
14
+ from inspect_ai.util._span import span
14
15
 
15
16
  from ._agent import AGENT_DESCRIPTION, Agent, AgentState
16
17
 
@@ -49,13 +50,17 @@ def as_tool(
49
50
  "Agent passed to as_tool was not created by an @agent decorated function"
50
51
  )
51
52
 
53
+ # get tool_info
54
+ tool_info = agent_tool_info(agent, description, **agent_kwargs)
55
+
52
56
  async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
53
57
  # prepare state
54
58
  state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
55
59
 
56
60
  # run the agent with limits
57
61
  with apply_limits(limits):
58
- state = await agent(state, *args, **(agent_kwargs | kwargs))
62
+ async with span(name=tool_info.name, type="agent"):
63
+ state = await agent(state, *args, **(agent_kwargs | kwargs))
59
64
 
60
65
  # find assistant message to read content from (prefer output)
61
66
  if not state.output.empty:
@@ -67,9 +72,6 @@ def as_tool(
67
72
  else:
68
73
  return ""
69
74
 
70
- # get tool_info
71
- tool_info = agent_tool_info(agent, description, **agent_kwargs)
72
-
73
75
  # add "input" param
74
76
  tool_info.parameters.properties = {
75
77
  "input": ToolParam(type="string", description="Input message.")
@@ -57,7 +57,9 @@ def handoff(
57
57
  tool_info = agent_tool_info(agent, description, **agent_kwargs)
58
58
 
59
59
  # AgentTool calls will be intercepted by execute_tools
60
- agent_tool = AgentTool(agent, input_filter, output_filter, limits, **agent_kwargs)
60
+ agent_tool = AgentTool(
61
+ agent, tool_info.name, input_filter, output_filter, limits, **agent_kwargs
62
+ )
61
63
  tool_name = tool_name or f"transfer_to_{tool_info.name}"
62
64
  set_registry_info(agent_tool, RegistryInfo(type="tool", name=tool_name))
63
65
  set_tool_description(
@@ -75,12 +77,14 @@ class AgentTool(Tool):
75
77
  def __init__(
76
78
  self,
77
79
  agent: Agent,
80
+ name: str,
78
81
  input_filter: MessageFilter | None = None,
79
82
  output_filter: MessageFilter | None = None,
80
83
  limits: list[Limit] = [],
81
84
  **kwargs: Any,
82
85
  ):
83
86
  self.agent = agent
87
+ self.name = name
84
88
  self.input_filter = input_filter
85
89
  self.output_filter = output_filter
86
90
  self.limits = limits
@@ -195,9 +195,10 @@ def react(
195
195
  answer = submission(messages)
196
196
  if answer is not None:
197
197
  # set the output to the answer for scoring
198
- state.output.completion = (
199
- f"{state.output.completion}\n\n{answer}".strip()
200
- )
198
+ if submit.answer_only:
199
+ state.output.completion = answer
200
+ else:
201
+ state.output.completion = f"{state.output.completion}{submit.answer_delimiter}{answer}".strip()
201
202
 
202
203
  # exit if we are at max_attempts
203
204
  attempt_count += 1
inspect_ai/agent/_run.py CHANGED
@@ -1,8 +1,10 @@
1
1
  from copy import copy
2
2
  from typing import Any
3
3
 
4
+ from inspect_ai._util.registry import registry_unqualified_name
4
5
  from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
5
6
  from inspect_ai.util._limit import Limit, apply_limits
7
+ from inspect_ai.util._span import span
6
8
 
7
9
  from ._agent import Agent, AgentState
8
10
 
@@ -52,4 +54,7 @@ async def run(
52
54
 
53
55
  # run the agent with limits
54
56
  with apply_limits(limits):
55
- return await agent(state, **agent_kwargs)
57
+ # run the agent
58
+ agent_name = registry_unqualified_name(agent)
59
+ async with span(name=agent_name, type="agent"):
60
+ return await agent(state, **agent_kwargs)
@@ -96,3 +96,12 @@ class AgentSubmit(NamedTuple):
96
96
 
97
97
  The tool should return the `answer` provided to it for scoring.
98
98
  """
99
+
100
+ answer_only: bool = False
101
+ """Set the completion to only the answer provided by the submit tool.
102
+
103
+ By default, the answer is appended (with `answer_delimiter`) to whatever
104
+ other content the model generated along with the call to `submit()`."""
105
+
106
+ answer_delimiter: str = "\n\n"
107
+ """Delimiter used when appending submit tool answer to other content the model generated along with the call to `submit()`."""
@@ -16,6 +16,7 @@ from typing import (
16
16
  from pydantic import BaseModel, Field, ValidationError
17
17
  from typing_extensions import override
18
18
 
19
+ from inspect_ai._util.answer import answer_character, answer_index
19
20
  from inspect_ai.model import ChatMessage
20
21
  from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
21
22
  from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
@@ -328,7 +329,9 @@ class MemoryDataset(Dataset):
328
329
  shuffled_choices = [sample.choices[i] for i in positions]
329
330
 
330
331
  # Map of original position / target letter
331
- position_map = {i: chr(65 + new_i) for new_i, i in enumerate(positions)}
332
+ position_map = {
333
+ i: answer_character(new_i) for new_i, i in enumerate(positions)
334
+ }
332
335
 
333
336
  # Update to the shuffled choices and target
334
337
  sample.choices = shuffled_choices
@@ -338,9 +341,9 @@ class MemoryDataset(Dataset):
338
341
  self, target: str | list[str], position_map: dict[int, str]
339
342
  ) -> str | list[str]:
340
343
  if isinstance(target, list):
341
- return [position_map[ord(t) - 65] for t in target]
344
+ return [position_map[answer_index(t)] for t in target]
342
345
  else:
343
- return position_map[ord(target) - 65]
346
+ return position_map[answer_index(target)]
344
347
 
345
348
  @override
346
349
  def sort(
@@ -48,6 +48,8 @@ from ._transcript import (
48
48
  SampleLimitEvent,
49
49
  SandboxEvent,
50
50
  ScoreEvent,
51
+ SpanBeginEvent,
52
+ SpanEndEvent,
51
53
  StateEvent,
52
54
  StepEvent,
53
55
  StoreEvent,
@@ -56,6 +58,7 @@ from ._transcript import (
56
58
  Transcript,
57
59
  transcript,
58
60
  )
61
+ from ._tree import EventNode, EventTree, SpanNode, event_sequence, event_tree
59
62
 
60
63
  __all__ = [
61
64
  "EvalConfig",
@@ -92,6 +95,8 @@ __all__ = [
92
95
  "SampleLimitEvent",
93
96
  "SandboxEvent",
94
97
  "ScoreEvent",
98
+ "SpanBeginEvent",
99
+ "SpanEndEvent",
95
100
  "StateEvent",
96
101
  "StepEvent",
97
102
  "StoreEvent",
@@ -111,4 +116,9 @@ __all__ = [
111
116
  "write_log_dir_manifest",
112
117
  "retryable_eval_logs",
113
118
  "bundle_log_dir",
119
+ "event_tree",
120
+ "event_sequence",
121
+ "EventTree",
122
+ "EventNode",
123
+ "SpanNode",
114
124
  ]
@@ -2,7 +2,7 @@ import os
2
2
  from typing import Literal
3
3
 
4
4
  from inspect_ai._util.error import PrerequisiteError
5
- from inspect_ai._util.file import copy_file, exists, filesystem
5
+ from inspect_ai._util.file import exists, filesystem
6
6
  from inspect_ai.log._file import (
7
7
  log_files_from_ls,
8
8
  read_eval_log,
@@ -66,14 +66,9 @@ def convert_eval_logs(
66
66
  "Output file {output_file} already exists (use --overwrite to overwrite existing files)"
67
67
  )
68
68
 
69
- # if the input and output files have the same format just copy
70
- if input_file.endswith(f".{to}"):
71
- copy_file(input_file, output_file)
72
-
73
- # otherwise do a full read/write
74
- else:
75
- log = read_eval_log(input_file)
76
- write_eval_log(log, output_file)
69
+ # do a full read/write (normalizes deprecated constructs and adds sample summaries)
70
+ log = read_eval_log(input_file)
71
+ write_eval_log(log, output_file)
77
72
 
78
73
  if fs.info(path).type == "file":
79
74
  convert_file(path)