inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (149)
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_display/textual/widgets/samples.py +3 -3
  3. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  4. inspect_ai/_eval/eval.py +19 -2
  5. inspect_ai/_eval/evalset.py +4 -1
  6. inspect_ai/_eval/run.py +41 -0
  7. inspect_ai/_eval/task/generate.py +38 -44
  8. inspect_ai/_eval/task/log.py +26 -28
  9. inspect_ai/_eval/task/run.py +23 -27
  10. inspect_ai/_util/answer.py +26 -0
  11. inspect_ai/_util/constants.py +0 -1
  12. inspect_ai/_util/local_server.py +398 -0
  13. inspect_ai/_util/working.py +10 -4
  14. inspect_ai/_view/www/dist/assets/index.css +173 -159
  15. inspect_ai/_view/www/dist/assets/index.js +1417 -1142
  16. inspect_ai/_view/www/log-schema.json +379 -3
  17. inspect_ai/_view/www/package.json +1 -1
  18. inspect_ai/_view/www/src/@types/log.d.ts +93 -14
  19. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  20. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  21. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  22. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  23. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  24. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  25. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  26. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  27. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  28. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  29. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  30. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  31. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  32. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  33. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  34. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  35. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  36. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  37. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  39. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  40. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  41. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  42. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  43. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  44. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  45. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  46. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  47. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  48. inspect_ai/_view/www/src/components/Card.css +0 -1
  49. inspect_ai/_view/www/src/constants.ts +2 -0
  50. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  51. inspect_ai/agent/_agent.py +3 -3
  52. inspect_ai/agent/_as_solver.py +22 -12
  53. inspect_ai/agent/_as_tool.py +20 -6
  54. inspect_ai/agent/_handoff.py +12 -1
  55. inspect_ai/agent/_react.py +4 -3
  56. inspect_ai/agent/_run.py +16 -3
  57. inspect_ai/agent/_types.py +9 -0
  58. inspect_ai/dataset/_dataset.py +6 -3
  59. inspect_ai/log/__init__.py +14 -0
  60. inspect_ai/log/_convert.py +4 -9
  61. inspect_ai/log/_file.py +56 -0
  62. inspect_ai/log/_log.py +99 -0
  63. inspect_ai/log/_recorders/__init__.py +2 -0
  64. inspect_ai/log/_recorders/buffer/database.py +12 -11
  65. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  66. inspect_ai/log/_recorders/buffer/types.py +2 -2
  67. inspect_ai/log/_recorders/eval.py +20 -65
  68. inspect_ai/log/_recorders/file.py +28 -6
  69. inspect_ai/log/_recorders/recorder.py +7 -0
  70. inspect_ai/log/_recorders/types.py +1 -23
  71. inspect_ai/log/_samples.py +14 -25
  72. inspect_ai/log/_transcript.py +84 -36
  73. inspect_ai/log/_tree.py +118 -0
  74. inspect_ai/log/_util.py +52 -0
  75. inspect_ai/model/__init__.py +5 -1
  76. inspect_ai/model/_call_tools.py +72 -44
  77. inspect_ai/model/_generate_config.py +14 -8
  78. inspect_ai/model/_model.py +66 -88
  79. inspect_ai/model/_model_output.py +25 -0
  80. inspect_ai/model/_openai.py +2 -0
  81. inspect_ai/model/_providers/anthropic.py +13 -23
  82. inspect_ai/model/_providers/hf.py +27 -1
  83. inspect_ai/model/_providers/openai_o1.py +8 -2
  84. inspect_ai/model/_providers/providers.py +18 -4
  85. inspect_ai/model/_providers/sglang.py +247 -0
  86. inspect_ai/model/_providers/vllm.py +211 -400
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/__init__.py +7 -2
  89. inspect_ai/solver/_basic_agent.py +3 -10
  90. inspect_ai/solver/_chain.py +1 -1
  91. inspect_ai/solver/_fork.py +1 -1
  92. inspect_ai/solver/_multiple_choice.py +5 -22
  93. inspect_ai/solver/_plan.py +2 -2
  94. inspect_ai/solver/_task_state.py +26 -88
  95. inspect_ai/solver/_transcript.py +6 -7
  96. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  97. inspect_ai/tool/_mcp/_mcp.py +8 -5
  98. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  99. inspect_ai/tool/_mcp/server.py +3 -1
  100. inspect_ai/tool/_tool_call.py +4 -1
  101. inspect_ai/tool/_tool_support_helpers.py +51 -12
  102. inspect_ai/tool/_tools/_bash_session.py +190 -68
  103. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  104. inspect_ai/tool/_tools/_execute.py +4 -1
  105. inspect_ai/tool/_tools/_text_editor.py +4 -3
  106. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  107. inspect_ai/util/__init__.py +16 -0
  108. inspect_ai/util/_anyio.py +11 -0
  109. inspect_ai/util/_collect.py +50 -0
  110. inspect_ai/util/_limit.py +393 -0
  111. inspect_ai/util/_limited_conversation.py +57 -0
  112. inspect_ai/util/_span.py +58 -0
  113. inspect_ai/util/_subtask.py +27 -42
  114. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  117. inspect_ai/_display/core/group.py +0 -79
  118. inspect_ai/solver/_limit.py +0 -39
  119. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  120. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  121. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  122. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  123. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  124. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  125. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  126. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  127. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  128. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  129. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  130. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  131. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  132. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  133. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  134. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  135. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  136. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  137. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  138. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  139. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  140. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  141. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  142. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  143. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  144. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  145. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  146. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  147. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  148. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  149. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
inspect_ai/agent/_as_solver.py CHANGED
@@ -2,6 +2,9 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
+from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._span import span
+
 if TYPE_CHECKING:
     from inspect_ai.solver._solver import Solver
 
@@ -14,7 +17,7 @@ from inspect_ai.tool._tool_info import parse_tool_info
 from ._agent import Agent, AgentState
 
 
-def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
+def as_solver(agent: Agent, limits: list[Limit] = [], **agent_kwargs: Any) -> Solver:
     """Convert an agent to a solver.
 
     Note that agents used as solvers will only receive their first parameter
@@ -23,6 +26,8 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
 
     Args:
         agent: Agent to convert.
+        limits: List of limits to apply to the agent. Should a limit
+            be exceeded, the Sample ends and proceeds to scoring.
        **agent_kwargs: Arguments to curry to Agent function (required
            if the agent has parameters without default values).
 
@@ -52,17 +57,22 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
     @solver(name=agent_name)
     def agent_to_solver() -> Solver:
         async def solve(state: TaskState, generate: Generate) -> TaskState:
-            # run agent
-            agent_state = await agent(
-                AgentState(messages=state.messages), **agent_kwargs
-            )
-
-            # update messages
-            state.messages = agent_state.messages
-
-            # update output if its not empty
-            if agent_state.output:
-                state.output = agent_state.output
+            agent_state = AgentState(messages=state.messages)
+
+            try:
+                # run the agent with limits
+                with apply_limits(limits):
+                    async with span(name=agent_name, type="agent"):
+                        agent_state = await agent(agent_state, **agent_kwargs)
+            # if an exception occurs, we still want to update the TaskState with the
+            # AgentState's messages + output so that it appears in the log and is scored
+            finally:
+                # update messages
+                state.messages = agent_state.messages
+
+                # update output if its not empty
+                if agent_state.output:
+                    state.output = agent_state.output
 
             return state
 
inspect_ai/agent/_as_tool.py CHANGED
@@ -10,12 +10,19 @@ from inspect_ai.tool._tool import Tool, ToolResult, tool
 from inspect_ai.tool._tool_def import ToolDef, validate_tool_parameters
 from inspect_ai.tool._tool_info import ToolInfo, parse_tool_info
 from inspect_ai.tool._tool_params import ToolParam
+from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._span import span
 
 from ._agent import AGENT_DESCRIPTION, Agent, AgentState
 
 
 @tool
-def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -> Tool:
+def as_tool(
+    agent: Agent,
+    description: str | None = None,
+    limits: list[Limit] = [],
+    **agent_kwargs: Any,
+) -> Tool:
     """Convert an agent to a tool.
 
@@ -27,6 +34,9 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
     Args:
         agent: Agent to convert.
         description: Tool description (defaults to agent description)
+        limits: List of limits to apply to the agent. Should a limit
+            be exceeded, the tool call ends and returns an error
+            explaining that a limit was exceeded.
        **agent_kwargs: Arguments to curry to Agent function (arguments
            provided here will not be presented to the model as part
            of the tool interface).
@@ -40,10 +50,17 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
            "Agent passed to as_tool was not created by an @agent decorated function"
        )
 
+    # get tool_info
+    tool_info = agent_tool_info(agent, description, **agent_kwargs)
+
     async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
-        # prepare state and call agent
+        # prepare state
         state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
-        state = await agent(state, *args, **(agent_kwargs | kwargs))
+
+        # run the agent with limits
+        with apply_limits(limits):
+            async with span(name=tool_info.name, type="agent"):
+                state = await agent(state, *args, **(agent_kwargs | kwargs))
 
         # find assistant message to read content from (prefer output)
         if not state.output.empty:
@@ -55,9 +72,6 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
         else:
             return ""
 
-    # get tool_info
-    tool_info = agent_tool_info(agent, description, **agent_kwargs)
-
     # add "input" param
     tool_info.parameters.properties = {
         "input": ToolParam(type="string", description="Input message.")
inspect_ai/agent/_handoff.py CHANGED
@@ -9,6 +9,7 @@ from inspect_ai._util.registry import (
 from inspect_ai.tool._tool import Tool, ToolResult, ToolSource
 from inspect_ai.tool._tool_def import ToolDef
 from inspect_ai.tool._tool_description import ToolDescription, set_tool_description
+from inspect_ai.util._limit import Limit
 
 from ._agent import Agent
 from ._as_tool import agent_tool_info
@@ -21,6 +22,7 @@ def handoff(
     input_filter: MessageFilter | None = None,
     output_filter: MessageFilter | None = None,
     tool_name: str | None = None,
+    limits: list[Limit] = [],
     **agent_kwargs: Any,
 ) -> Tool:
     """Create a tool that enables models to handoff to agents.
@@ -35,6 +37,9 @@ def handoff(
            Use the built-in `last_message` filter to return only the last message
            or alternatively specify a custom `MessageFilter` function.
        tool_name: Alternate tool name (defaults to `transfer_to_{agent_name}`)
+       limits: List of limits to apply to the agent. Should a limit be exceeded,
+           the agent stops and a user message is appended explaining that a limit was
+           exceeded.
        **agent_kwargs: Arguments to curry to `Agent` function (arguments provided here
            will not be presented to the model as part of the tool interface).
 
@@ -52,7 +57,9 @@ def handoff(
     tool_info = agent_tool_info(agent, description, **agent_kwargs)
 
     # AgentTool calls will be intercepted by execute_tools
-    agent_tool = AgentTool(agent, input_filter, output_filter, **agent_kwargs)
+    agent_tool = AgentTool(
+        agent, tool_info.name, input_filter, output_filter, limits, **agent_kwargs
+    )
     tool_name = tool_name or f"transfer_to_{tool_info.name}"
     set_registry_info(agent_tool, RegistryInfo(type="tool", name=tool_name))
     set_tool_description(
@@ -70,13 +77,17 @@ class AgentTool(Tool):
     def __init__(
         self,
         agent: Agent,
+        name: str,
         input_filter: MessageFilter | None = None,
         output_filter: MessageFilter | None = None,
+        limits: list[Limit] = [],
         **kwargs: Any,
     ):
         self.agent = agent
+        self.name = name
         self.input_filter = input_filter
         self.output_filter = output_filter
+        self.limits = limits
         self.kwargs = kwargs
 
     @property
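A sketch of a handoff with limits (assuming `last_message`, the filter named in the docstring above, and `token_limit` are importable as shown):

```python
from inspect_ai.agent import handoff, last_message, react
from inspect_ai.util import token_limit  # assumed export

critic = react(
    name="critic",
    description="Critiques draft answers",
    prompt="You critique draft answers.",
)

# produces a `transfer_to_critic` tool; on limit, a user message is appended
critic_tool = handoff(
    critic, output_filter=last_message, limits=[token_limit(16 * 1024)]
)
```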
inspect_ai/agent/_react.py CHANGED
@@ -195,9 +195,10 @@ def react(
                 answer = submission(messages)
                 if answer is not None:
                     # set the output to the answer for scoring
-                    state.output.completion = (
-                        f"{state.output.completion}\n\n{answer}".strip()
-                    )
+                    if submit.answer_only:
+                        state.output.completion = answer
+                    else:
+                        state.output.completion = f"{state.output.completion}{submit.answer_delimiter}{answer}".strip()
 
                 # exit if we are at max_attempts
                 attempt_count += 1
inspect_ai/agent/_run.py CHANGED
@@ -1,13 +1,19 @@
 from copy import copy
 from typing import Any
 
+from inspect_ai._util.registry import registry_unqualified_name
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._span import span
 
 from ._agent import Agent, AgentState
 
 
 async def run(
-    agent: Agent, input: str | list[ChatMessage] | AgentState, **agent_kwargs: Any
+    agent: Agent,
+    input: str | list[ChatMessage] | AgentState,
+    limits: list[Limit] = [],
+    **agent_kwargs: Any,
 ) -> AgentState:
     """Run an agent.
 
@@ -17,6 +23,9 @@ async def run(
     Args:
         agent: Agent to run.
         input: Agent input (string, list of messages, or an `AgentState`).
+        limits: List of limits to apply to the agent. Should a limit be
+            exceeded, a LimitExceededError is raised which the caller may
+            handle as appropriate.
        **agent_kwargs: Additional arguments to pass to agent.
 
     Returns:
@@ -43,5 +52,9 @@ async def run(
     # create state
     state = AgentState(messages=input_messages)
 
-    # run the agent
-    return await agent(state, **agent_kwargs)
+    # run the agent with limits
+    with apply_limits(limits):
+        # run the agent
+        agent_name = registry_unqualified_name(agent)
+        async with span(name=agent_name, type="agent"):
+            return await agent(state, **agent_kwargs)
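Since `run()` lets the error propagate, callers can handle an exceeded limit directly. A sketch (assuming `LimitExceededError` and `token_limit` are exported from `inspect_ai.util`, consistent with the removal of `inspect_ai/solver/_limit.py` in this release):

```python
from inspect_ai.agent import Agent, run
from inspect_ai.util import LimitExceededError, token_limit  # assumed exports

async def ask(agent: Agent, question: str) -> str:
    try:
        state = await run(agent, question, limits=[token_limit(32 * 1024)])
        return state.output.completion
    except LimitExceededError as ex:
        # the caller may handle an exceeded limit as appropriate
        return f"stopped early: {ex}"
```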
inspect_ai/agent/_types.py CHANGED
@@ -96,3 +96,12 @@ class AgentSubmit(NamedTuple):
 
     The tool should return the `answer` provided to it for scoring.
     """
+
+    answer_only: bool = False
+    """Set the completion to only the answer provided by the submit tool.
+
+    By default, the answer is appended (with `answer_delimiter`) to whatever
+    other content the model generated along with the call to `submit()`."""
+
+    answer_delimiter: str = "\n\n"
+    """Delimter used when appending submit tool answer to other content the model generated along with the call to `submit()`."""
inspect_ai/dataset/_dataset.py CHANGED
@@ -16,6 +16,7 @@ from typing import (
 from pydantic import BaseModel, Field, ValidationError
 from typing_extensions import override
 
+from inspect_ai._util.answer import answer_character, answer_index
 from inspect_ai.model import ChatMessage
 from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
 from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
@@ -328,7 +329,9 @@ class MemoryDataset(Dataset):
         shuffled_choices = [sample.choices[i] for i in positions]
 
         # Map of original position / target letter
-        position_map = {i: chr(65 + new_i) for new_i, i in enumerate(positions)}
+        position_map = {
+            i: answer_character(new_i) for new_i, i in enumerate(positions)
+        }
 
         # Update to the shuffled choices and target
         sample.choices = shuffled_choices
@@ -338,9 +341,9 @@ class MemoryDataset(Dataset):
         self, target: str | list[str], position_map: dict[int, str]
     ) -> str | list[str]:
         if isinstance(target, list):
-            return [position_map[ord(t) - 65] for t in target]
+            return [position_map[answer_index(t)] for t in target]
         else:
-            return position_map[ord(target) - 65]
+            return position_map[answer_index(target)]
 
     @override
     def sort(
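The new helpers in `inspect_ai/_util/answer.py` centralize the letter/index arithmetic that was previously inlined. For A–Z choices, the behavior implied by the replaced code is (a sketch of the implied contract, not the actual module):

```python
# implied by the chr(65 + i) / ord(t) - 65 code these helpers replace
def answer_character(index: int) -> str:
    return chr(ord("A") + index)  # 0 -> "A", 1 -> "B", ...

def answer_index(character: str) -> int:
    return ord(character) - ord("A")  # "A" -> 0, "B" -> 1, ...
```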
inspect_ai/log/__init__.py CHANGED
@@ -9,6 +9,7 @@ from ._file import (
     read_eval_log,
     read_eval_log_async,
     read_eval_log_sample,
+    read_eval_log_sample_summaries,
     read_eval_log_samples,
     write_eval_log,
     write_eval_log_async,
@@ -28,6 +29,7 @@ from ._log import (
     EvalSampleLimit,
     EvalSampleReductions,
     EvalSampleScore,
+    EvalSampleSummary,
     EvalScore,
     EvalSpec,
     EvalStats,
@@ -46,6 +48,8 @@ from ._transcript import (
     SampleLimitEvent,
     SandboxEvent,
     ScoreEvent,
+    SpanBeginEvent,
+    SpanEndEvent,
     StateEvent,
     StepEvent,
     StoreEvent,
@@ -54,6 +58,7 @@ from ._transcript import (
     Transcript,
     transcript,
 )
+from ._tree import EventNode, EventTree, SpanNode, event_sequence, event_tree
 
 __all__ = [
     "EvalConfig",
@@ -70,6 +75,7 @@ __all__ = [
     "EvalSampleLimit",
     "EvalSampleScore",
     "EvalSampleReductions",
+    "EvalSampleSummary",
     "EvalScore",
     "EvalSpec",
     "EvalStats",
@@ -89,6 +95,8 @@ __all__ = [
     "SampleLimitEvent",
     "SandboxEvent",
     "ScoreEvent",
+    "SpanBeginEvent",
+    "SpanEndEvent",
     "StateEvent",
     "StepEvent",
     "StoreEvent",
@@ -100,6 +108,7 @@ __all__ = [
     "read_eval_log_async",
     "read_eval_log_sample",
     "read_eval_log_samples",
+    "read_eval_log_sample_summaries",
     "condense_sample",
     "resolve_sample_attachments",
     "write_eval_log",
@@ -107,4 +116,9 @@ __all__ = [
     "write_log_dir_manifest",
     "retryable_eval_logs",
     "bundle_log_dir",
+    "event_tree",
+    "event_sequence",
+    "EventTree",
+    "EventNode",
+    "SpanNode",
 ]
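The newly exported `event_tree()` presumably organizes a sample's flat event list into a tree of spans (per the new `_tree.py`). A hedged sketch of how it might be called:

```python
from inspect_ai.log import event_tree, read_eval_log

log = read_eval_log("logs/my-task.eval")  # placeholder path
if log.samples:
    # assumed signature: event_tree(events) -> EventTree
    tree = event_tree(log.samples[0].events)
```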
inspect_ai/log/_convert.py CHANGED
@@ -2,7 +2,7 @@ import os
 from typing import Literal
 
 from inspect_ai._util.error import PrerequisiteError
-from inspect_ai._util.file import copy_file, exists, filesystem
+from inspect_ai._util.file import exists, filesystem
 from inspect_ai.log._file import (
     log_files_from_ls,
     read_eval_log,
@@ -66,14 +66,9 @@ def convert_eval_logs(
                 "Output file {output_file} already exists (use --overwrite to overwrite existing files)"
             )
 
-        # if the input and output files have the same format just copy
-        if input_file.endswith(f".{to}"):
-            copy_file(input_file, output_file)
-
-        # otherwise do a full read/write
-        else:
-            log = read_eval_log(input_file)
-            write_eval_log(log, output_file)
+        # do a full read/write (normalized deprecated constructs and adds sample summaries)
+        log = read_eval_log(input_file)
+        write_eval_log(log, output_file)
 
     if fs.info(path).type == "file":
         convert_file(path)
inspect_ai/log/_file.py CHANGED
@@ -16,6 +16,7 @@ from inspect_ai._util.file import (
 )
 from inspect_ai._util.json import jsonable_python
 from inspect_ai.log._condense import resolve_sample_attachments
+from inspect_ai.log._log import EvalSampleSummary
 
 from ._log import EvalLog, EvalSample
 from ._recorders import recorder_type_for_format, recorder_type_for_location
@@ -393,6 +394,61 @@ async def read_eval_log_sample_async(
     return sample
 
 
+def read_eval_log_sample_summaries(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+        log_file (str | FileInfo): Log file to read.
+        format (Literal["eval", "json", "auto"]): Read from format
+            (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+        Sample summaries for eval log.
+    """
+    # don't mix trio and asyncio
+    if current_async_backend() == "trio":
+        raise RuntimeError(
+            "read_eval_log_sample_summaries cannot be called from a trio async context (please use read_eval_log_sample_summaries_asymc instead)"
+        )
+
+    # will use s3fs and is not called from main inspect solver/scorer/tool/sandbox
+    # flow, so force the use of asyncio
+    return run_coroutine(read_eval_log_sample_summaries_async(log_file, format))
+
+
+async def read_eval_log_sample_summaries_async(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+        log_file (str | FileInfo): Log file to read.
+        format (Literal["eval", "json", "auto"]): Read from format
+            (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+        Sample summaries for eval log.
+    """
+    # resolve to file path
+    log_file = (
+        log_file
+        if isinstance(log_file, str)
+        else log_file.as_posix()
+        if isinstance(log_file, Path)
+        else log_file.name
+    )
+
+    if format == "auto":
+        recorder_type = recorder_type_for_location(log_file)
+    else:
+        recorder_type = recorder_type_for_format(format)
+    return await recorder_type.read_log_sample_summaries(log_file)
+
+
 def read_eval_log_samples(
     log_file: str | Path | EvalLogInfo,
     all_samples_required: bool = True,
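Calling the new function is straightforward, and avoids loading full sample transcripts:

```python
from inspect_ai.log import read_eval_log_sample_summaries

# loads only per-sample summaries, not messages or events
# ("logs/my-task.eval" is a placeholder path)
summaries = read_eval_log_sample_summaries("logs/my-task.eval")
for s in summaries:
    print(s.id, s.epoch, s.scores, s.total_time)
```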
inspect_ai/log/_log.py CHANGED
@@ -30,6 +30,7 @@ from inspect_ai.util._store import Store
 from inspect_ai.util._store_model import SMT
 
 from ._transcript import Event
+from ._util import text_input_only, thin_metadata
 
 logger = getLogger(__name__)
 
@@ -42,6 +43,7 @@ class EvalConfigDefaults(TypedDict):
     fail_on_error: bool
     sandbox_cleanup: bool
     log_samples: bool
+    log_realtime: bool
     log_images: bool
     score_display: bool
 
@@ -53,6 +55,7 @@ def eval_config_defaults() -> EvalConfigDefaults:
         "fail_on_error": True,
         "sandbox_cleanup": True,
         "log_samples": True,
+        "log_realtime": True,
         "log_images": True,
         "score_display": True,
     }
@@ -120,6 +123,9 @@ class EvalConfig(BaseModel):
     log_samples: bool | None = Field(default=None)
     """Log detailed information on each sample."""
 
+    log_realtime: bool | None = Field(default=None)
+    """Log events in realtime (enables live viewing of samples in inspect view)."""
+
     log_images: bool | None = Field(default=None)
     """Log base64 encoded versions of images."""
 
@@ -161,6 +167,70 @@ class EvalSampleLimit(BaseModel):
     """The limit value"""
 
 
+class EvalSampleSummary(BaseModel):
+    """Summary information (including scoring) for a sample."""
+
+    id: int | str
+    """Unique id for sample."""
+
+    epoch: int
+    """Epoch number for sample."""
+
+    input: str | list[ChatMessage]
+    """Sample input (text inputs only)."""
+
+    target: str | list[str]
+    """Sample target value(s)"""
+
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    """Sample metadata (scalar types only, strings truncated to 1k)."""
+
+    scores: dict[str, Score] | None = Field(default=None)
+    """Scores for sample (score values only, no answers, explanations, or metadata)."""
+
+    model_usage: dict[str, ModelUsage] = Field(default_factory=dict)
+    """Model token usage for sample."""
+
+    total_time: float | None = Field(default=None)
+    """Total time that the sample was running."""
+
+    working_time: float | None = Field(default=None)
+    """Time spent working (model generation, sandbox calls, etc.)"""
+
+    uuid: str | None = Field(default=None)
+    """Globally unique identifier for sample run (exists for samples created in Inspect >= 0.3.70)"""
+
+    error: str | None = Field(default=None)
+    """Error that halted sample."""
+
+    limit: str | None = Field(default=None)
+    """Limit that halted the sample"""
+
+    retries: int | None = Field(default=None)
+    """Number of retries for the sample."""
+
+    completed: bool = Field(default=False)
+    """Is the sample complete."""
+
+    @model_validator(mode="after")
+    def thin_data(self) -> "EvalSampleSummary":
+        # thin input
+        self.input = text_input_only(self.input)
+
+        # thin metadata
+        self.metadata = thin_metadata(self.metadata)
+
+        # thin score explanations and metadata
+        if self.scores is not None:
+            self.scores = {
+                key: Score(value=score.value) for key, score in self.scores.items()
+            }
+        return self
+
+    # allow field model_usage
+    model_config = ConfigDict(protected_namespaces=())
+
+
 class EvalSample(BaseModel):
     """Sample from evaluation task."""
 
@@ -271,6 +341,35 @@ class EvalSample(BaseModel):
     limit: EvalSampleLimit | None = Field(default=None)
     """The limit that halted the sample"""
 
+    def summary(self) -> EvalSampleSummary:
+        """Summary of sample.
+
+        The summary excludes potentially large fields like messages, output,
+        events, store, and metadata so that it is always fast to load.
+
+        If there are images, audio, or video in the input, they are
+        replaced with a placeholder.
+
+        Returns:
+            Summary of sample.
+        """
+        return EvalSampleSummary(
+            id=self.id,
+            epoch=self.epoch,
+            input=self.input,
+            target=self.target,
+            metadata=self.metadata,
+            scores=self.scores,
+            model_usage=self.model_usage,
+            total_time=self.total_time,
+            working_time=self.working_time,
+            uuid=self.uuid,
+            error=self.error.message if self.error is not None else None,
+            limit=f"{self.limit.type}" if self.limit is not None else None,
+            retries=len(self.error_retries) if self.error_retries is not None else None,
+            completed=True,
+        )
+
     # deprecated properties
 
     @property
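With a full sample in hand, `summary()` produces the thinned view directly:

```python
from inspect_ai.log import read_eval_log

log = read_eval_log("logs/my-task.eval")  # placeholder path
if log.samples:
    summary = log.samples[0].summary()  # text-only input, score values only
    print(summary.model_usage, summary.working_time)
```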
@@ -1,3 +1,4 @@
1
+ from .._log import EvalSampleSummary
1
2
  from .create import (
2
3
  create_recorder_for_format,
3
4
  create_recorder_for_location,
@@ -7,6 +8,7 @@ from .create import (
7
8
  from .recorder import Recorder
8
9
 
9
10
  __all__ = [
11
+ "EvalSampleSummary",
10
12
  "Recorder",
11
13
  "create_recorder_for_format",
12
14
  "create_recorder_for_location",
inspect_ai/log/_recorders/buffer/database.py CHANGED
@@ -26,7 +26,8 @@ from ..._condense import (
     walk_input,
     walk_json_dict,
 )
-from ..types import SampleEvent, SampleSummary
+from ..._log import EvalSampleSummary
+from ..types import SampleEvent
 from .filestore import (
     Manifest,
     SampleBufferFilestore,
@@ -141,7 +142,7 @@ class SampleBufferDatabase(SampleBuffer):
         )
         self._sync_time = time.monotonic()
 
-    def start_sample(self, sample: SampleSummary) -> None:
+    def start_sample(self, sample: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             sample = self._consense_sample(conn, sample)
             conn.execute(
@@ -177,7 +178,7 @@ class SampleBufferDatabase(SampleBuffer):
         # Insert all rows
         conn.execute(sql, values)
 
-    def complete_sample(self, summary: SampleSummary) -> None:
+    def complete_sample(self, summary: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             summary = self._consense_sample(conn, summary)
             conn.execute(
@@ -307,9 +308,9 @@ class SampleBufferDatabase(SampleBuffer):
         conn.execute("PRAGMA foreign_keys = ON")
 
         # concurrency setup
-        conn.execute("PRAGMA journal_mode=WAL")
+        conn.execute("PRAGMA journal_mode=MEMORY")
         conn.execute("PRAGMA busy_timeout=10000")
-        conn.execute("PRAGMA synchronous=NORMAL")
+        conn.execute("PRAGMA synchronous=OFF")
 
         # do work
        yield conn
@@ -359,7 +360,7 @@ class SampleBufferDatabase(SampleBuffer):
 
     def _get_samples(
         self, conn: Connection, resolve_attachments: bool = False
-    ) -> Iterator[SampleSummary]:
+    ) -> Iterator[EvalSampleSummary]:
         cursor = conn.execute(
             """
             SELECT s.data as sample_data
@@ -369,7 +370,7 @@ class SampleBufferDatabase(SampleBuffer):
         )
 
         for row in cursor:
-            summary = SampleSummary.model_validate_json(row["sample_data"])
+            summary = EvalSampleSummary.model_validate_json(row["sample_data"])
             if resolve_attachments:
                 summary = self._resolve_sample_attachments(conn, summary)
             yield summary
@@ -437,8 +438,8 @@ class SampleBufferDatabase(SampleBuffer):
         )
 
     def _consense_sample(
-        self, conn: Connection, sample: SampleSummary
-    ) -> SampleSummary:
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         # alias attachments
         attachments: dict[str, str] = {}
         sample = sample.model_copy(
@@ -456,8 +457,8 @@ class SampleBufferDatabase(SampleBuffer):
         return sample
 
     def _resolve_sample_attachments(
-        self, conn: Connection, sample: SampleSummary
-    ) -> SampleSummary:
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         return sample.model_copy(
             update={
                 "input": walk_input(