inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -0
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +23 -27
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/local_server.py +398 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +173 -159
- inspect_ai/_view/www/dist/assets/index.js +1417 -1142
- inspect_ai/_view/www/log-schema.json +379 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +93 -14
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +22 -12
- inspect_ai/agent/_as_tool.py +20 -6
- inspect_ai/agent/_handoff.py +12 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +16 -3
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +14 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +14 -25
- inspect_ai/log/_transcript.py +84 -36
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +72 -44
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +66 -88
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +247 -0
- inspect_ai/model/_providers/vllm.py +211 -400
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +5 -22
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +8 -5
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +16 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
inspect_ai/agent/_as_solver.py
CHANGED
@@ -2,6 +2,9 @@ from __future__ import annotations

 from typing import TYPE_CHECKING, Any

+from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._span import span
+
 if TYPE_CHECKING:
     from inspect_ai.solver._solver import Solver

@@ -14,7 +17,7 @@ from inspect_ai.tool._tool_info import parse_tool_info
 from ._agent import Agent, AgentState


-def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
+def as_solver(agent: Agent, limits: list[Limit] = [], **agent_kwargs: Any) -> Solver:
     """Convert an agent to a solver.

     Note that agents used as solvers will only receive their first parameter
@@ -23,6 +26,8 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:

     Args:
         agent: Agent to convert.
+        limits: List of limits to apply to the agent. Should a limit
+            be exceeded, the Sample ends and proceeds to scoring.
        **agent_kwargs: Arguments to curry to Agent function (required
            if the agent has parameters without default values).

@@ -52,17 +57,22 @@ def as_solver(agent: Agent, **agent_kwargs: Any) -> Solver:
     @solver(name=agent_name)
     def agent_to_solver() -> Solver:
         async def solve(state: TaskState, generate: Generate) -> TaskState:
-
-
-
-
-
-
-
-
-            #
-
-
+            agent_state = AgentState(messages=state.messages)
+
+            try:
+                # run the agent with limits
+                with apply_limits(limits):
+                    async with span(name=agent_name, type="agent"):
+                        agent_state = await agent(agent_state, **agent_kwargs)
+            # if an exception occurs, we still want to update the TaskState with the
+            # AgentState's messages + output so that it appears in the log and is scored
+            finally:
+                # update messages
+                state.messages = agent_state.messages
+
+                # update output if its not empty
+                if agent_state.output:
+                    state.output = agent_state.output

             return state

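Taken together, this lets a converted agent run under resource limits inside an ordinary task. A minimal sketch of the new `limits` parameter in use (hypothetical task; `token_limit` and `message_limit` are assumed to be the limit constructors exported alongside the new `inspect_ai/util/_limit.py` module):

from inspect_ai import Task, task
from inspect_ai.agent import as_solver, react
from inspect_ai.dataset import Sample
from inspect_ai.util import message_limit, token_limit  # assumed exports of _limit.py


@task
def limited_agent_task() -> Task:
    # if either limit is exceeded, the sample ends and proceeds to scoring
    return Task(
        dataset=[Sample(input="What is 2 + 2?", target="4")],
        solver=as_solver(react(), limits=[token_limit(50_000), message_limit(30)]),
    )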
inspect_ai/agent/_as_tool.py
CHANGED
@@ -10,12 +10,19 @@ from inspect_ai.tool._tool import Tool, ToolResult, tool
 from inspect_ai.tool._tool_def import ToolDef, validate_tool_parameters
 from inspect_ai.tool._tool_info import ToolInfo, parse_tool_info
 from inspect_ai.tool._tool_params import ToolParam
+from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._span import span

 from ._agent import AGENT_DESCRIPTION, Agent, AgentState


 @tool
-def as_tool(
+def as_tool(
+    agent: Agent,
+    description: str | None = None,
+    limits: list[Limit] = [],
+    **agent_kwargs: Any,
+) -> Tool:
     """Convert an agent to a tool.

     By default the model will see all of the agent's arguments as
@@ -27,6 +34,9 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
     Args:
         agent: Agent to convert.
         description: Tool description (defaults to agent description)
+        limits: List of limits to apply to the agent. Should a limit
+            be exceeded, the tool call ends and returns an error
+            explaining that a limit was exceeded.
        **agent_kwargs: Arguments to curry to Agent function (arguments
            provided here will not be presented to the model as part
            of the tool interface).
@@ -40,10 +50,17 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
         "Agent passed to as_tool was not created by an @agent decorated function"
     )

+    # get tool_info
+    tool_info = agent_tool_info(agent, description, **agent_kwargs)
+
     async def execute(input: str, *args: Any, **kwargs: Any) -> ToolResult:
-        # prepare state
+        # prepare state
         state = AgentState(messages=[ChatMessageUser(content=input, source="input")])
-
+
+        # run the agent with limits
+        with apply_limits(limits):
+            async with span(name=tool_info.name, type="agent"):
+                state = await agent(state, *args, **(agent_kwargs | kwargs))

         # find assistant message to read content from (prefer output)
         if not state.output.empty:
@@ -55,9 +72,6 @@ def as_tool(agent: Agent, description: str | None = None, **agent_kwargs: Any) -
         else:
             return ""

-    # get tool_info
-    tool_info = agent_tool_info(agent, description, **agent_kwargs)
-
     # add "input" param
     tool_info.parameters.properties = {
         "input": ToolParam(type="string", description="Input message.")
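The same pattern applies when an agent is exposed as a tool, except that an exceeded limit surfaces as a tool error rather than ending the sample. A sketch, under the same assumptions about the limit constructors:

from inspect_ai.agent import as_tool, react
from inspect_ai.solver import generate, use_tools
from inspect_ai.util import token_limit  # assumed export

# if the limit trips mid-run, the tool call returns an error message
# explaining that a limit was exceeded (the sample itself continues)
researcher = as_tool(
    react(name="researcher", description="Research assistant."),
    limits=[token_limit(20_000)],
)

solver = [use_tools(researcher), generate()]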
inspect_ai/agent/_handoff.py
CHANGED
@@ -9,6 +9,7 @@ from inspect_ai._util.registry import (
 from inspect_ai.tool._tool import Tool, ToolResult, ToolSource
 from inspect_ai.tool._tool_def import ToolDef
 from inspect_ai.tool._tool_description import ToolDescription, set_tool_description
+from inspect_ai.util._limit import Limit

 from ._agent import Agent
 from ._as_tool import agent_tool_info
@@ -21,6 +22,7 @@ def handoff(
     input_filter: MessageFilter | None = None,
     output_filter: MessageFilter | None = None,
     tool_name: str | None = None,
+    limits: list[Limit] = [],
     **agent_kwargs: Any,
 ) -> Tool:
     """Create a tool that enables models to handoff to agents.
@@ -35,6 +37,9 @@ def handoff(
            Use the built-in `last_message` filter to return only the last message
            or alternatively specify a custom `MessageFilter` function.
        tool_name: Alternate tool name (defaults to `transfer_to_{agent_name}`)
+        limits: List of limits to apply to the agent. Should a limit be exceeded,
+            the agent stops and a user message is appended explaining that a limit was
+            exceeded.
        **agent_kwargs: Arguments to curry to `Agent` function (arguments provided here
            will not be presented to the model as part of the tool interface).

@@ -52,7 +57,9 @@ def handoff(
     tool_info = agent_tool_info(agent, description, **agent_kwargs)

     # AgentTool calls will be intercepted by execute_tools
-    agent_tool = AgentTool(
+    agent_tool = AgentTool(
+        agent, tool_info.name, input_filter, output_filter, limits, **agent_kwargs
+    )
     tool_name = tool_name or f"transfer_to_{tool_info.name}"
     set_registry_info(agent_tool, RegistryInfo(type="tool", name=tool_name))
     set_tool_description(
@@ -70,13 +77,17 @@ class AgentTool(Tool):
     def __init__(
         self,
         agent: Agent,
+        name: str,
         input_filter: MessageFilter | None = None,
         output_filter: MessageFilter | None = None,
+        limits: list[Limit] = [],
         **kwargs: Any,
     ):
         self.agent = agent
+        self.name = name
         self.input_filter = input_filter
         self.output_filter = output_filter
+        self.limits = limits
         self.kwargs = kwargs

     @property
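A sketch of a multi-agent handoff with a per-agent budget (same assumptions as above):

from inspect_ai.agent import handoff, react
from inspect_ai.solver import generate, use_tools
from inspect_ai.util import message_limit  # assumed export

web_surfer = react(name="web_surfer", description="Web research specialist.")

# the supervising model sees a transfer_to_web_surfer tool; if the limit
# trips, the handoff stops and a user message explains what happened
solver = [
    use_tools(handoff(web_surfer, limits=[message_limit(20)])),
    generate(),
]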
inspect_ai/agent/_react.py
CHANGED
@@ -195,9 +195,10 @@ def react(
                 answer = submission(messages)
                 if answer is not None:
                     # set the output to the answer for scoring
-
-
-
+                    if submit.answer_only:
+                        state.output.completion = answer
+                    else:
+                        state.output.completion = f"{state.output.completion}{submit.answer_delimiter}{answer}".strip()

                 # exit if we are at max_attempts
                 attempt_count += 1
inspect_ai/agent/_run.py
CHANGED
@@ -1,13 +1,19 @@
 from copy import copy
 from typing import Any

+from inspect_ai._util.registry import registry_unqualified_name
 from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser
+from inspect_ai.util._limit import Limit, apply_limits
+from inspect_ai.util._span import span

 from ._agent import Agent, AgentState


 async def run(
-    agent: Agent,
+    agent: Agent,
+    input: str | list[ChatMessage] | AgentState,
+    limits: list[Limit] = [],
+    **agent_kwargs: Any,
 ) -> AgentState:
     """Run an agent.

@@ -17,6 +23,9 @@ async def run(
     Args:
        agent: Agent to run.
        input: Agent input (string, list of messages, or an `AgentState`).
+        limits: List of limits to apply to the agent. Should a limit be
+            exceeded, a LimitExceededError is raised which the caller may
+            handle as appropriate.
        **agent_kwargs: Additional arguments to pass to agent.

     Returns:
@@ -43,5 +52,9 @@ async def run(
     # create state
     state = AgentState(messages=input_messages)

-    # run the agent
-
+    # run the agent with limits
+    with apply_limits(limits):
+        # run the agent
+        agent_name = registry_unqualified_name(agent)
+        async with span(name=agent_name, type="agent"):
+            return await agent(state, **agent_kwargs)
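Unlike `as_solver()` and `as_tool()`, `run()` propagates the violation to its caller. A sketch (per the docstring above a `LimitExceededError` is raised; it is assumed here to be importable from `inspect_ai.util`):

from inspect_ai.agent import react, run
from inspect_ai.util import LimitExceededError, token_limit  # assumed exports


async def run_with_budget() -> str:
    try:
        state = await run(
            react(),
            input="Summarize the plot of Hamlet.",
            limits=[token_limit(10_000)],
        )
        return state.output.completion
    except LimitExceededError:
        # the caller decides what exceeding the budget means
        return "agent stopped: limit exceeded"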
inspect_ai/agent/_types.py
CHANGED
@@ -96,3 +96,12 @@ class AgentSubmit(NamedTuple):

     The tool should return the `answer` provided to it for scoring.
     """
+
+    answer_only: bool = False
+    """Set the completion to only the answer provided by the submit tool.
+
+    By default, the answer is appended (with `answer_delimiter`) to whatever
+    other content the model generated along with the call to `submit()`."""
+
+    answer_delimiter: str = "\n\n"
+    """Delimter used when appending submit tool answer to other content the model generated along with the call to `submit()`."""
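These fields drive the completion assembly shown in the `_react.py` hunk above. A sketch (assuming `AgentSubmit`'s other fields keep their defaults):

from inspect_ai.agent import AgentSubmit, react

# default: completion is "<other generated content>\n\n<answer>"
default_agent = react()

# answer_only: completion is just the submit() answer, which helps scorers
# that match on exact output (no surrounding commentary from the model)
answer_only_agent = react(submit=AgentSubmit(answer_only=True))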
inspect_ai/dataset/_dataset.py
CHANGED
@@ -16,6 +16,7 @@ from typing import (
 from pydantic import BaseModel, Field, ValidationError
 from typing_extensions import override

+from inspect_ai._util.answer import answer_character, answer_index
 from inspect_ai.model import ChatMessage
 from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType
 from inspect_ai.util._sandbox.environment import resolve_sandbox_environment
@@ -328,7 +329,9 @@ class MemoryDataset(Dataset):
         shuffled_choices = [sample.choices[i] for i in positions]

         # Map of original position / target letter
-        position_map = {
+        position_map = {
+            i: answer_character(new_i) for new_i, i in enumerate(positions)
+        }

         # Update to the shuffled choices and target
         sample.choices = shuffled_choices
@@ -338,9 +341,9 @@ class MemoryDataset(Dataset):
         self, target: str | list[str], position_map: dict[int, str]
     ) -> str | list[str]:
         if isinstance(target, list):
-            return [position_map[
+            return [position_map[answer_index(t)] for t in target]
         else:
-            return position_map[
+            return position_map[answer_index(target)]

     @override
     def sort(
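The shuffling now round-trips choice letters through the new `inspect_ai/_util/answer.py` helpers (see the file list above). Their implementation isn't shown in this diff; the presumed contract, inferred from the call sites, is a zero-based index/letter mapping:

# presumed behavior of the new helpers, inferred from their call sites here
# (the real module likely also validates input and handles longer choice lists)
def answer_character(index: int) -> str:
    return chr(ord("A") + index)


def answer_index(char: str) -> int:
    return ord(char.upper()) - ord("A")


assert answer_character(0) == "A"
assert answer_index("C") == 2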
inspect_ai/log/__init__.py
CHANGED
@@ -9,6 +9,7 @@ from ._file import (
     read_eval_log,
     read_eval_log_async,
     read_eval_log_sample,
+    read_eval_log_sample_summaries,
     read_eval_log_samples,
     write_eval_log,
     write_eval_log_async,
@@ -28,6 +29,7 @@ from ._log import (
     EvalSampleLimit,
     EvalSampleReductions,
     EvalSampleScore,
+    EvalSampleSummary,
     EvalScore,
     EvalSpec,
     EvalStats,
@@ -46,6 +48,8 @@ from ._transcript import (
     SampleLimitEvent,
     SandboxEvent,
     ScoreEvent,
+    SpanBeginEvent,
+    SpanEndEvent,
     StateEvent,
     StepEvent,
     StoreEvent,
@@ -54,6 +58,7 @@ from ._transcript import (
     Transcript,
     transcript,
 )
+from ._tree import EventNode, EventTree, SpanNode, event_sequence, event_tree

 __all__ = [
     "EvalConfig",
@@ -70,6 +75,7 @@ __all__ = [
     "EvalSampleLimit",
     "EvalSampleScore",
     "EvalSampleReductions",
+    "EvalSampleSummary",
     "EvalScore",
     "EvalSpec",
     "EvalStats",
@@ -89,6 +95,8 @@ __all__ = [
     "SampleLimitEvent",
     "SandboxEvent",
     "ScoreEvent",
+    "SpanBeginEvent",
+    "SpanEndEvent",
     "StateEvent",
     "StepEvent",
     "StoreEvent",
@@ -100,6 +108,7 @@ __all__ = [
     "read_eval_log_async",
     "read_eval_log_sample",
     "read_eval_log_samples",
+    "read_eval_log_sample_summaries",
     "condense_sample",
     "resolve_sample_attachments",
     "write_eval_log",
@@ -107,4 +116,9 @@ __all__ = [
     "write_log_dir_manifest",
     "retryable_eval_logs",
     "bundle_log_dir",
+    "event_tree",
+    "event_sequence",
+    "EventTree",
+    "EventNode",
+    "SpanNode",
 ]
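The new `_tree.py` exports reconstruct the span hierarchy from a sample's flat event list. A sketch (the node attributes and return shape are assumptions based on the exported names):

from inspect_ai.log import SpanNode, event_tree, read_eval_log

log = read_eval_log("logs/my-eval.eval")  # hypothetical log file
if log.samples:
    # nest events under their enclosing spans
    tree = event_tree(log.samples[0].events)
    for node in tree:
        if isinstance(node, SpanNode):  # assumed: span nodes carry name/type
            print(node.name, node.type)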
inspect_ai/log/_convert.py
CHANGED
@@ -2,7 +2,7 @@ import os
 from typing import Literal

 from inspect_ai._util.error import PrerequisiteError
-from inspect_ai._util.file import
+from inspect_ai._util.file import exists, filesystem
 from inspect_ai.log._file import (
     log_files_from_ls,
     read_eval_log,
@@ -66,14 +66,9 @@ def convert_eval_logs(
                 "Output file {output_file} already exists (use --overwrite to overwrite existing files)"
             )

-        #
-
-
-
-        # otherwise do a full read/write
-        else:
-            log = read_eval_log(input_file)
-            write_eval_log(log, output_file)
+        # do a full read/write (normalized deprecated constructs and adds sample summaries)
+        log = read_eval_log(input_file)
+        write_eval_log(log, output_file)

     if fs.info(path).type == "file":
         convert_file(path)
inspect_ai/log/_file.py
CHANGED
@@ -16,6 +16,7 @@ from inspect_ai._util.file import (
 )
 from inspect_ai._util.json import jsonable_python
 from inspect_ai.log._condense import resolve_sample_attachments
+from inspect_ai.log._log import EvalSampleSummary

 from ._log import EvalLog, EvalSample
 from ._recorders import recorder_type_for_format, recorder_type_for_location
@@ -393,6 +394,61 @@ async def read_eval_log_sample_async(
     return sample


+def read_eval_log_sample_summaries(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+        log_file (str | FileInfo): Log file to read.
+        format (Literal["eval", "json", "auto"]): Read from format
+            (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+        Sample summaries for eval log.
+    """
+    # don't mix trio and asyncio
+    if current_async_backend() == "trio":
+        raise RuntimeError(
+            "read_eval_log_sample_summaries cannot be called from a trio async context (please use read_eval_log_sample_summaries_asymc instead)"
+        )
+
+    # will use s3fs and is not called from main inspect solver/scorer/tool/sandbox
+    # flow, so force the use of asyncio
+    return run_coroutine(read_eval_log_sample_summaries_async(log_file, format))
+
+
+async def read_eval_log_sample_summaries_async(
+    log_file: str | Path | EvalLogInfo,
+    format: Literal["eval", "json", "auto"] = "auto",
+) -> list[EvalSampleSummary]:
+    """Read sample summaries from an eval log.
+
+    Args:
+        log_file (str | FileInfo): Log file to read.
+        format (Literal["eval", "json", "auto"]): Read from format
+            (defaults to 'auto' based on `log_file` extension)
+
+    Returns:
+        Sample summaries for eval log.
+    """
+    # resolve to file path
+    log_file = (
+        log_file
+        if isinstance(log_file, str)
+        else log_file.as_posix()
+        if isinstance(log_file, Path)
+        else log_file.name
+    )
+
+    if format == "auto":
+        recorder_type = recorder_type_for_location(log_file)
+    else:
+        recorder_type = recorder_type_for_format(format)
+    return await recorder_type.read_log_sample_summaries(log_file)
+
+
 def read_eval_log_samples(
     log_file: str | Path | EvalLogInfo,
     all_samples_required: bool = True,
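Summaries make it cheap to scan large logs without materializing full samples. A sketch:

from inspect_ai.log import list_eval_logs, read_eval_log_sample_summaries

for info in list_eval_logs("logs"):  # hypothetical log directory
    summaries = read_eval_log_sample_summaries(info)
    errors = [s for s in summaries if s.error is not None]
    print(info.name, f"{len(errors)}/{len(summaries)} samples errored")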
inspect_ai/log/_log.py
CHANGED
@@ -30,6 +30,7 @@ from inspect_ai.util._store import Store
 from inspect_ai.util._store_model import SMT

 from ._transcript import Event
+from ._util import text_input_only, thin_metadata

 logger = getLogger(__name__)

@@ -42,6 +43,7 @@ class EvalConfigDefaults(TypedDict):
     fail_on_error: bool
     sandbox_cleanup: bool
     log_samples: bool
+    log_realtime: bool
     log_images: bool
     score_display: bool

@@ -53,6 +55,7 @@ def eval_config_defaults() -> EvalConfigDefaults:
         "fail_on_error": True,
         "sandbox_cleanup": True,
         "log_samples": True,
+        "log_realtime": True,
         "log_images": True,
         "score_display": True,
     }
@@ -120,6 +123,9 @@ class EvalConfig(BaseModel):
     log_samples: bool | None = Field(default=None)
     """Log detailed information on each sample."""

+    log_realtime: bool | None = Field(default=None)
+    """Log events in realtime (enables live viewing of samples in inspect view)."""
+
     log_images: bool | None = Field(default=None)
     """Log base64 encoded versions of images."""

@@ -161,6 +167,70 @@ class EvalSampleLimit(BaseModel):
     """The limit value"""


+class EvalSampleSummary(BaseModel):
+    """Summary information (including scoring) for a sample."""
+
+    id: int | str
+    """Unique id for sample."""
+
+    epoch: int
+    """Epoch number for sample."""
+
+    input: str | list[ChatMessage]
+    """Sample input (text inputs only)."""
+
+    target: str | list[str]
+    """Sample target value(s)"""
+
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    """Sample metadata (scalar types only, strings truncated to 1k)."""
+
+    scores: dict[str, Score] | None = Field(default=None)
+    """Scores for sample (score values only, no answers, explanations, or metadata)."""
+
+    model_usage: dict[str, ModelUsage] = Field(default_factory=dict)
+    """Model token usage for sample."""
+
+    total_time: float | None = Field(default=None)
+    """Total time that the sample was running."""
+
+    working_time: float | None = Field(default=None)
+    """Time spent working (model generation, sandbox calls, etc.)"""
+
+    uuid: str | None = Field(default=None)
+    """Globally unique identifier for sample run (exists for samples created in Inspect >= 0.3.70)"""
+
+    error: str | None = Field(default=None)
+    """Error that halted sample."""
+
+    limit: str | None = Field(default=None)
+    """Limit that halted the sample"""
+
+    retries: int | None = Field(default=None)
+    """Number of retries for the sample."""
+
+    completed: bool = Field(default=False)
+    """Is the sample complete."""
+
+    @model_validator(mode="after")
+    def thin_data(self) -> "EvalSampleSummary":
+        # thin input
+        self.input = text_input_only(self.input)
+
+        # thin metadata
+        self.metadata = thin_metadata(self.metadata)
+
+        # thin score explanations and metadata
+        if self.scores is not None:
+            self.scores = {
+                key: Score(value=score.value) for key, score in self.scores.items()
+            }
+        return self
+
+    # allow field model_usage
+    model_config = ConfigDict(protected_namespaces=())
+
+
 class EvalSample(BaseModel):
     """Sample from evaluation task."""

@@ -271,6 +341,35 @@ class EvalSample(BaseModel):
     limit: EvalSampleLimit | None = Field(default=None)
     """The limit that halted the sample"""

+    def summary(self) -> EvalSampleSummary:
+        """Summary of sample.
+
+        The summary excludes potentially large fields like messages, output,
+        events, store, and metadata so that it is always fast to load.
+
+        If there are images, audio, or video in the input, they are
+        replaced with a placeholder.
+
+        Returns:
+            Summary of sample.
+        """
+        return EvalSampleSummary(
+            id=self.id,
+            epoch=self.epoch,
+            input=self.input,
+            target=self.target,
+            metadata=self.metadata,
+            scores=self.scores,
+            model_usage=self.model_usage,
+            total_time=self.total_time,
+            working_time=self.working_time,
+            uuid=self.uuid,
+            error=self.error.message if self.error is not None else None,
+            limit=f"{self.limit.type}" if self.limit is not None else None,
+            retries=len(self.error_retries) if self.error_retries is not None else None,
+            completed=True,
+        )
+
     # deprecated properties

     @property
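Because `summary()` constructs an `EvalSampleSummary` (and so runs the same `thin_data` validator), realtime summaries and those derived from completed samples stay consistent. A sketch:

from inspect_ai.log import read_eval_log

log = read_eval_log("logs/my-eval.eval")  # hypothetical log file
for sample in log.samples or []:
    s = sample.summary()
    # input is text-only, metadata is thinned, scores carry values only
    print(s.id, s.epoch, s.total_time, s.scores)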
inspect_ai/log/_recorders/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+from .._log import EvalSampleSummary
 from .create import (
     create_recorder_for_format,
     create_recorder_for_location,
@@ -7,6 +8,7 @@ from .create import (
 from .recorder import Recorder

 __all__ = [
+    "EvalSampleSummary",
     "Recorder",
     "create_recorder_for_format",
     "create_recorder_for_location",
inspect_ai/log/_recorders/buffer/database.py
CHANGED
@@ -26,7 +26,8 @@ from ..._condense import (
     walk_input,
     walk_json_dict,
 )
-from
+from ..._log import EvalSampleSummary
+from ..types import SampleEvent
 from .filestore import (
     Manifest,
     SampleBufferFilestore,
@@ -141,7 +142,7 @@ class SampleBufferDatabase(SampleBuffer):
         )
         self._sync_time = time.monotonic()

-    def start_sample(self, sample:
+    def start_sample(self, sample: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             sample = self._consense_sample(conn, sample)
             conn.execute(
@@ -177,7 +178,7 @@ class SampleBufferDatabase(SampleBuffer):
             # Insert all rows
             conn.execute(sql, values)

-    def complete_sample(self, summary:
+    def complete_sample(self, summary: EvalSampleSummary) -> None:
         with self._get_connection(write=True) as conn:
             summary = self._consense_sample(conn, summary)
             conn.execute(
@@ -307,9 +308,9 @@ class SampleBufferDatabase(SampleBuffer):
             conn.execute("PRAGMA foreign_keys = ON")

             # concurrency setup
-            conn.execute("PRAGMA journal_mode=
+            conn.execute("PRAGMA journal_mode=MEMORY")
             conn.execute("PRAGMA busy_timeout=10000")
-            conn.execute("PRAGMA synchronous=
+            conn.execute("PRAGMA synchronous=OFF")

             # do work
             yield conn
@@ -359,7 +360,7 @@ class SampleBufferDatabase(SampleBuffer):

     def _get_samples(
         self, conn: Connection, resolve_attachments: bool = False
-    ) -> Iterator[
+    ) -> Iterator[EvalSampleSummary]:
         cursor = conn.execute(
             """
             SELECT s.data as sample_data
@@ -369,7 +370,7 @@ class SampleBufferDatabase(SampleBuffer):
         )

         for row in cursor:
-            summary =
+            summary = EvalSampleSummary.model_validate_json(row["sample_data"])
             if resolve_attachments:
                 summary = self._resolve_sample_attachments(conn, summary)
             yield summary
@@ -437,8 +438,8 @@ class SampleBufferDatabase(SampleBuffer):
         )

     def _consense_sample(
-        self, conn: Connection, sample:
-    ) ->
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         # alias attachments
         attachments: dict[str, str] = {}
         sample = sample.model_copy(
@@ -456,8 +457,8 @@ class SampleBufferDatabase(SampleBuffer):
         return sample

     def _resolve_sample_attachments(
-        self, conn: Connection, sample:
-    ) ->
+        self, conn: Connection, sample: EvalSampleSummary
+    ) -> EvalSampleSummary:
         return sample.model_copy(
             update={
                 "input": walk_input(
|