inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/_cli/eval.py +27 -0
- inspect_ai/_display/textual/widgets/samples.py +3 -3
- inspect_ai/_display/textual/widgets/transcript.py +3 -29
- inspect_ai/_eval/eval.py +19 -2
- inspect_ai/_eval/evalset.py +4 -1
- inspect_ai/_eval/run.py +41 -0
- inspect_ai/_eval/task/generate.py +38 -44
- inspect_ai/_eval/task/log.py +26 -28
- inspect_ai/_eval/task/run.py +23 -27
- inspect_ai/_util/answer.py +26 -0
- inspect_ai/_util/constants.py +0 -1
- inspect_ai/_util/local_server.py +398 -0
- inspect_ai/_util/working.py +10 -4
- inspect_ai/_view/www/dist/assets/index.css +173 -159
- inspect_ai/_view/www/dist/assets/index.js +1417 -1142
- inspect_ai/_view/www/log-schema.json +379 -3
- inspect_ai/_view/www/package.json +1 -1
- inspect_ai/_view/www/src/@types/log.d.ts +93 -14
- inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
- inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
- inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
- inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
- inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
- inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
- inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
- inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
- inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
- inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
- inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
- inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
- inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
- inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
- inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
- inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
- inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
- inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
- inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
- inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
- inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
- inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
- inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
- inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
- inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
- inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
- inspect_ai/_view/www/src/components/Card.css +0 -1
- inspect_ai/_view/www/src/constants.ts +2 -0
- inspect_ai/_view/www/src/utils/numeric.ts +17 -0
- inspect_ai/agent/_agent.py +3 -3
- inspect_ai/agent/_as_solver.py +22 -12
- inspect_ai/agent/_as_tool.py +20 -6
- inspect_ai/agent/_handoff.py +12 -1
- inspect_ai/agent/_react.py +4 -3
- inspect_ai/agent/_run.py +16 -3
- inspect_ai/agent/_types.py +9 -0
- inspect_ai/dataset/_dataset.py +6 -3
- inspect_ai/log/__init__.py +14 -0
- inspect_ai/log/_convert.py +4 -9
- inspect_ai/log/_file.py +56 -0
- inspect_ai/log/_log.py +99 -0
- inspect_ai/log/_recorders/__init__.py +2 -0
- inspect_ai/log/_recorders/buffer/database.py +12 -11
- inspect_ai/log/_recorders/buffer/filestore.py +2 -2
- inspect_ai/log/_recorders/buffer/types.py +2 -2
- inspect_ai/log/_recorders/eval.py +20 -65
- inspect_ai/log/_recorders/file.py +28 -6
- inspect_ai/log/_recorders/recorder.py +7 -0
- inspect_ai/log/_recorders/types.py +1 -23
- inspect_ai/log/_samples.py +14 -25
- inspect_ai/log/_transcript.py +84 -36
- inspect_ai/log/_tree.py +118 -0
- inspect_ai/log/_util.py +52 -0
- inspect_ai/model/__init__.py +5 -1
- inspect_ai/model/_call_tools.py +72 -44
- inspect_ai/model/_generate_config.py +14 -8
- inspect_ai/model/_model.py +66 -88
- inspect_ai/model/_model_output.py +25 -0
- inspect_ai/model/_openai.py +2 -0
- inspect_ai/model/_providers/anthropic.py +13 -23
- inspect_ai/model/_providers/hf.py +27 -1
- inspect_ai/model/_providers/openai_o1.py +8 -2
- inspect_ai/model/_providers/providers.py +18 -4
- inspect_ai/model/_providers/sglang.py +247 -0
- inspect_ai/model/_providers/vllm.py +211 -400
- inspect_ai/scorer/_choice.py +1 -2
- inspect_ai/solver/__init__.py +7 -2
- inspect_ai/solver/_basic_agent.py +3 -10
- inspect_ai/solver/_chain.py +1 -1
- inspect_ai/solver/_fork.py +1 -1
- inspect_ai/solver/_multiple_choice.py +5 -22
- inspect_ai/solver/_plan.py +2 -2
- inspect_ai/solver/_task_state.py +26 -88
- inspect_ai/solver/_transcript.py +6 -7
- inspect_ai/tool/_json_rpc_helpers.py +45 -17
- inspect_ai/tool/_mcp/_mcp.py +8 -5
- inspect_ai/tool/_mcp/_sandbox.py +8 -2
- inspect_ai/tool/_mcp/server.py +3 -1
- inspect_ai/tool/_tool_call.py +4 -1
- inspect_ai/tool/_tool_support_helpers.py +51 -12
- inspect_ai/tool/_tools/_bash_session.py +190 -68
- inspect_ai/tool/_tools/_computer/_computer.py +25 -1
- inspect_ai/tool/_tools/_execute.py +4 -1
- inspect_ai/tool/_tools/_text_editor.py +4 -3
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
- inspect_ai/util/__init__.py +16 -0
- inspect_ai/util/_anyio.py +11 -0
- inspect_ai/util/_collect.py +50 -0
- inspect_ai/util/_limit.py +393 -0
- inspect_ai/util/_limited_conversation.py +57 -0
- inspect_ai/util/_span.py +58 -0
- inspect_ai/util/_subtask.py +27 -42
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
- inspect_ai/_display/core/group.py +0 -79
- inspect_ai/solver/_limit.py +0 -39
- inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
- inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
- inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
- inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
- inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
- inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
- inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
- inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
- inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
- inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
- inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
- inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
- inspect_ai/tool/_tools/_computer/test_args.py +0 -151
- /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
- {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -7,13 +7,17 @@ It includes definitions for JSON-RPC request and response models, as well as fun
|
|
7
7
|
from textwrap import dedent
|
8
8
|
from typing import Type
|
9
9
|
|
10
|
+
import semver
|
11
|
+
|
10
12
|
from inspect_ai._util.error import PrerequisiteError
|
13
|
+
from inspect_ai.tool._tool import ToolError
|
11
14
|
from inspect_ai.util import sandbox_with
|
12
15
|
from inspect_ai.util._sandbox.environment import SandboxEnvironment
|
13
16
|
|
14
17
|
from ._json_rpc_helpers import (
|
15
18
|
BaseModelT,
|
16
19
|
JSONRPCParamsType,
|
20
|
+
JSONRPCServerErrorMapper,
|
17
21
|
JSONRPCTransport,
|
18
22
|
ScalarT,
|
19
23
|
_rpc_call_description,
|
@@ -29,7 +33,7 @@ async def exec_scalar_request(
|
|
29
33
|
method: str,
|
30
34
|
params: JSONRPCParamsType,
|
31
35
|
result_type: Type[ScalarT],
|
32
|
-
timeout: int
|
36
|
+
timeout: int,
|
33
37
|
user: str | None = None,
|
34
38
|
) -> ScalarT:
|
35
39
|
return await scalar_request(
|
@@ -37,6 +41,7 @@ async def exec_scalar_request(
|
|
37
41
|
params,
|
38
42
|
result_type,
|
39
43
|
transport=ToolSupportSandboxTransport(sandbox, timeout, user),
|
44
|
+
server_error_mapper=ToolSupportServerErrorMapper(),
|
40
45
|
)
|
41
46
|
|
42
47
|
|
@@ -45,7 +50,7 @@ async def exec_model_request(
|
|
45
50
|
method: str,
|
46
51
|
params: JSONRPCParamsType,
|
47
52
|
result_type: Type[BaseModelT],
|
48
|
-
timeout: int
|
53
|
+
timeout: int,
|
49
54
|
user: str | None = None,
|
50
55
|
) -> BaseModelT:
|
51
56
|
return await model_request(
|
@@ -53,6 +58,7 @@ async def exec_model_request(
|
|
53
58
|
params,
|
54
59
|
result_type,
|
55
60
|
transport=ToolSupportSandboxTransport(sandbox, timeout, user),
|
61
|
+
server_error_mapper=ToolSupportServerErrorMapper(),
|
56
62
|
)
|
57
63
|
|
58
64
|
|
@@ -60,7 +66,7 @@ async def exec_notification(
|
|
60
66
|
sandbox: SandboxEnvironment,
|
61
67
|
method: str,
|
62
68
|
params: JSONRPCParamsType,
|
63
|
-
timeout: int
|
69
|
+
timeout: int,
|
64
70
|
user: str | None = None,
|
65
71
|
) -> None:
|
66
72
|
return await notification_helper(
|
@@ -68,19 +74,33 @@ async def exec_notification(
|
|
68
74
|
)
|
69
75
|
|
70
76
|
|
77
|
+
class ToolSupportServerErrorMapper(JSONRPCServerErrorMapper):
|
78
|
+
def __call__(
|
79
|
+
self, code: int, message: str, method: str, params: JSONRPCParamsType
|
80
|
+
) -> Exception:
|
81
|
+
"""Map `inspect-tool-support` defined custom codes to an exception."""
|
82
|
+
match code:
|
83
|
+
case -32099: # This is a ToolException from the container
|
84
|
+
return ToolError(message)
|
85
|
+
case -32098: # This is an unexpected exception inside the container
|
86
|
+
return RuntimeError(message)
|
87
|
+
case _:
|
88
|
+
return RuntimeError(message)
|
89
|
+
|
90
|
+
|
71
91
|
class ToolSupportSandboxTransport(JSONRPCTransport):
|
72
92
|
"""
|
73
|
-
A transport
|
93
|
+
A transport that uses a sandbox for RPC communication.
|
74
94
|
|
75
|
-
This class implements the TransportCallable protocol and encapsulates
|
76
|
-
|
77
|
-
|
95
|
+
This class implements the TransportCallable protocol and encapsulates the
|
96
|
+
sandbox, timeout, and user parameters needed for sandbox-based RPC
|
97
|
+
communication.
|
78
98
|
"""
|
79
99
|
|
80
100
|
def __init__(
|
81
101
|
self,
|
82
102
|
sandbox: SandboxEnvironment,
|
83
|
-
timeout: int
|
103
|
+
timeout: int,
|
84
104
|
user: str | None = None,
|
85
105
|
):
|
86
106
|
"""
|
@@ -128,13 +148,32 @@ class ToolSupportSandboxTransport(JSONRPCTransport):
|
|
128
148
|
|
129
149
|
SANDBOX_CLI = "inspect-tool-support"
|
130
150
|
INSPECT_TOOL_SUPPORT_IMAGE_DOCKERHUB = "aisiuk/inspect-tool-support"
|
151
|
+
FIRST_PUBLISHED_VERSION = semver.Version.parse("0.1.6")
|
152
|
+
MIN_SUPPORTED_VERSION = FIRST_PUBLISHED_VERSION
|
153
|
+
MIN_NON_DEPRECATED_VERSION = semver.Version.parse("1.0.0")
|
154
|
+
|
155
|
+
|
156
|
+
async def _get_sandbox_tool_support_version(
|
157
|
+
sandbox: SandboxEnvironment,
|
158
|
+
) -> semver.Version:
|
159
|
+
try:
|
160
|
+
return semver.Version.parse(
|
161
|
+
await exec_scalar_request(sandbox, "version", {}, str, 5)
|
162
|
+
)
|
163
|
+
except RuntimeError as rte:
|
164
|
+
if "-32601" in str(rte):
|
165
|
+
# The container doesn't even have a version method. The first version
|
166
|
+
# published was 0.1.6, so we'll have to assume it was that old.
|
167
|
+
return FIRST_PUBLISHED_VERSION
|
168
|
+
raise rte
|
131
169
|
|
132
170
|
|
133
|
-
async def
|
171
|
+
async def tool_support_sandbox(
|
134
172
|
tool_name: str, *, sandbox_name: str | None = None
|
135
|
-
) -> SandboxEnvironment:
|
173
|
+
) -> tuple[SandboxEnvironment, semver.Version]:
|
136
174
|
if sb := await sandbox_with(SANDBOX_CLI, True, name=sandbox_name):
|
137
|
-
|
175
|
+
current_version = await _get_sandbox_tool_support_version(sb)
|
176
|
+
return (sb, current_version)
|
138
177
|
|
139
178
|
# This sort of programmatic sentence building will not cut it if we ever
|
140
179
|
# support other languages.
|
@@ -160,7 +199,7 @@ async def tool_container_sandbox(
|
|
160
199
|
|
161
200
|
|
162
201
|
def create_sandbox_transport(
|
163
|
-
sandbox: SandboxEnvironment, timeout: int
|
202
|
+
sandbox: SandboxEnvironment, timeout: int, user: str | None = None
|
164
203
|
) -> JSONRPCTransport:
|
165
204
|
"""
|
166
205
|
Create a transport callable that uses a sandbox for RPC communication.
|
@@ -1,20 +1,27 @@
|
|
1
|
-
from
|
1
|
+
from textwrap import dedent
|
2
|
+
from typing import Annotated, Literal
|
3
|
+
|
4
|
+
from pydantic import BaseModel, Discriminator, Field, RootModel
|
5
|
+
from semver import Version
|
2
6
|
from shortuuid import uuid
|
3
7
|
|
8
|
+
from inspect_ai._util.error import PrerequisiteError
|
4
9
|
from inspect_ai.tool import ToolResult
|
5
|
-
from inspect_ai.tool._tool_support_helpers import (
|
6
|
-
exec_model_request,
|
7
|
-
tool_container_sandbox,
|
8
|
-
)
|
9
10
|
from inspect_ai.util import StoreModel, store_as
|
11
|
+
from inspect_ai.util._sandbox.environment import SandboxEnvironment
|
10
12
|
|
11
13
|
from .._tool import Tool, ToolParsingError, tool
|
12
|
-
from ..
|
13
|
-
|
14
|
+
from .._tool_support_helpers import (
|
15
|
+
exec_model_request,
|
16
|
+
exec_scalar_request,
|
17
|
+
tool_support_sandbox,
|
18
|
+
)
|
14
19
|
|
15
20
|
# These models are cloned from the container code. If/when we decide to create
|
16
21
|
# a package that is shared between the inspect and tool-container codebases, we'll
|
17
22
|
# just have to live with it.
|
23
|
+
|
24
|
+
|
18
25
|
class NewSessionResult(BaseModel):
|
19
26
|
session_name: str
|
20
27
|
|
@@ -23,106 +30,221 @@ class BashRestartResult(BaseModel):
|
|
23
30
|
pass
|
24
31
|
|
25
32
|
|
26
|
-
class
|
27
|
-
|
28
|
-
|
29
|
-
stderr: str
|
33
|
+
class BashSessionStore(StoreModel):
|
34
|
+
session_id: str = Field(default_factory=str)
|
35
|
+
sandbox: SandboxEnvironment | None = Field(default=None)
|
30
36
|
|
31
37
|
|
32
|
-
|
33
|
-
pass
|
38
|
+
# Action-specific parameter models
|
34
39
|
|
35
40
|
|
36
|
-
class
|
37
|
-
|
41
|
+
class TypeParams(BaseModel):
|
42
|
+
action: Literal["type"] = "type"
|
43
|
+
input: str
|
38
44
|
|
39
45
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
content=f"```{language}\n" + code + "\n```\n",
|
49
|
-
)
|
50
|
-
return ToolCallView(call=call)
|
46
|
+
class TypeSubmitParams(BaseModel):
|
47
|
+
action: Literal["type_submit"] = "type_submit"
|
48
|
+
input: str
|
49
|
+
|
50
|
+
|
51
|
+
class RestartParams(BaseModel):
|
52
|
+
action: Literal["restart"] = "restart"
|
53
|
+
|
51
54
|
|
52
|
-
|
55
|
+
class ReadParams(BaseModel):
|
56
|
+
action: Literal["read"] = "read"
|
53
57
|
|
54
58
|
|
55
|
-
|
56
|
-
|
57
|
-
"""Bash shell session command execution tool.
|
59
|
+
class InterruptParams(BaseModel):
|
60
|
+
action: Literal["interrupt"] = "interrupt"
|
58
61
|
|
59
|
-
|
62
|
+
|
63
|
+
class BashSessionParams(
|
64
|
+
RootModel[
|
65
|
+
TypeParams | TypeSubmitParams | RestartParams | ReadParams | InterruptParams
|
66
|
+
]
|
67
|
+
):
|
68
|
+
root: Annotated[
|
69
|
+
TypeParams | TypeSubmitParams | RestartParams | ReadParams | InterruptParams,
|
70
|
+
Discriminator("action"),
|
71
|
+
]
|
72
|
+
|
73
|
+
|
74
|
+
DEFAULT_WAIT_FOR_OUTPUT = 30
|
75
|
+
DEFAULT_IDLE_TIME = 0.5
|
76
|
+
# this is how long we're willing to wait for the basic RPC call overhead.
|
77
|
+
TRANSPORT_TIMEOUT = 5
|
78
|
+
|
79
|
+
|
80
|
+
@tool()
|
81
|
+
def bash_session(
|
82
|
+
*,
|
83
|
+
timeout: int | None = None, # default is max_wait + 5 seconds
|
84
|
+
wait_for_output: int | None = None, # default is 30 seconds
|
85
|
+
instance: str | None = uuid(),
|
86
|
+
) -> Tool:
|
87
|
+
"""Interactive bash shell session tool.
|
88
|
+
|
89
|
+
Interact with a bash shell in a long running session using a sandbox
|
90
|
+
environment (e.g. "docker"). This tool allows sending text to the shell,
|
91
|
+
which could be a command followed by a newline character or any other input
|
92
|
+
text such as the response to a password prompt.
|
60
93
|
|
61
94
|
By default, a separate bash process is created within the sandbox for each
|
62
|
-
call to `bash_session()`. You can modify this behavior by passing
|
63
|
-
(which will result in a single bash process for the entire
|
64
|
-
`instance` values that implement another scheme).
|
95
|
+
call to `bash_session()`. You can modify this behavior by passing
|
96
|
+
`instance=None` (which will result in a single bash process for the entire
|
97
|
+
sample) or use other `instance` values that implement another scheme).
|
65
98
|
|
66
99
|
See complete documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-bash-session>.
|
67
100
|
|
68
101
|
Args:
|
69
102
|
timeout: Timeout (in seconds) for command.
|
103
|
+
wait_for_output: Maximum time (in seconds) to wait for output. If no
|
104
|
+
output is received within this period, the function will return an
|
105
|
+
empty string. The model may need to make multiple tool calls to obtain
|
106
|
+
all output from a given command.
|
70
107
|
instance: Instance id (each unique instance id has its own bash process)
|
71
108
|
|
72
109
|
Returns:
|
73
|
-
String with
|
110
|
+
String with output from the shell.
|
74
111
|
"""
|
112
|
+
wait_for_output = wait_for_output or DEFAULT_WAIT_FOR_OUTPUT
|
113
|
+
min_timeout = wait_for_output + TRANSPORT_TIMEOUT
|
114
|
+
if timeout is None:
|
115
|
+
timeout = min_timeout
|
116
|
+
elif timeout < min_timeout:
|
117
|
+
raise ValueError(
|
118
|
+
f"Timeout must be at least {min_timeout} seconds, but got {timeout}."
|
119
|
+
)
|
75
120
|
|
76
121
|
async def execute(
|
77
|
-
|
78
|
-
|
122
|
+
action: Literal["type", "type_submit", "restart", "read", "interrupt"],
|
123
|
+
input: str | None = None,
|
79
124
|
) -> ToolResult:
|
80
|
-
"""
|
81
|
-
|
125
|
+
r"""
|
126
|
+
Interact with a bash shell.
|
127
|
+
|
128
|
+
Interact with a bash shell by sending it input text and retrieving output
|
129
|
+
from it. There is no guarantee that all output will be returned in a
|
130
|
+
single call. Call this function multiple times to retrieve additional
|
131
|
+
output from the shell.
|
132
|
+
|
133
|
+
USAGE NOTES:
|
134
|
+
- Ensure that the shell is at a command prompt (typically when the
|
135
|
+
output ends in "$ " or "# ") before submitting a new command.
|
136
|
+
- Control characters must be sent as Unicode escape sequences (e.g., use
|
137
|
+
"\u0003" for Ctrl+C/ETX, "\u0004" for Ctrl+D/EOT). The literal string
|
138
|
+
"Ctrl+C" will not be interpreted as a control character.
|
139
|
+
- Use the "read" action to retrieve output from the shell without
|
140
|
+
sending any input. This is useful for long-running commands that
|
141
|
+
produce output over time. The "read" action will return any new output
|
142
|
+
since the last call.
|
143
|
+
- If a long-running command is in progress, additional input to execute
|
144
|
+
a new command will not be processed until the previous completes. To
|
145
|
+
abort a long-running command, use the "interrupt" action:
|
146
|
+
`bash_session(action="interrupt")`
|
147
|
+
|
148
|
+
Example use case:
|
149
|
+
- For a short-running command with a nominal amount of output, a single
|
150
|
+
call may suffice.
|
151
|
+
```
|
152
|
+
bash_session(action="type_submit", input="echo foo") -> "foo\nuser@host:/# "
|
153
|
+
```
|
154
|
+
- For a long-running command with output over time, multiple calls to are needed.
|
155
|
+
```
|
156
|
+
bash_session(action="type_submit", input="tail -f /tmp/foo.log") -> <some output>
|
157
|
+
bash_session(action="read") -> <more output>
|
158
|
+
# Send interrupt (Ctrl+C)
|
159
|
+
bash_session(action="interrupt") -> "<final output>^Cuser@host:/# "
|
160
|
+
```
|
161
|
+
- Interactive command awaiting more input from the user.
|
162
|
+
```
|
163
|
+
bash_session(action="type_submit", input="ssh fred@foo.com") -> "foo.com's password: "
|
164
|
+
bash_session(action="type_submit", input="secret") -> "fred@foo.com:~$ "
|
165
|
+
```
|
82
166
|
|
83
167
|
Args:
|
84
|
-
|
85
|
-
|
168
|
+
action: The action to execute:
|
169
|
+
- "type": Send input without a return key
|
170
|
+
- "type_submit": Send input followed by a return key
|
171
|
+
- "read": Read any new output without sending input
|
172
|
+
- "interrupt": Send a Ctrl+C (ETX character) to interrupt the current process
|
173
|
+
- "restart": Restart the bash session
|
174
|
+
input: The input to send to the shell.
|
175
|
+
Required for "type". Optional for "type_submit" actions. Must
|
176
|
+
not be provided for "restart", "read", or "interrupt" actions.
|
86
177
|
|
87
178
|
Returns:
|
88
|
-
The output of the
|
179
|
+
The accumulated output of the shell.
|
89
180
|
"""
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
181
|
+
# Validate parameters based on action
|
182
|
+
match action:
|
183
|
+
case "type":
|
184
|
+
if input is None:
|
185
|
+
raise ToolParsingError(
|
186
|
+
f"'input' is required for '{action}' action."
|
187
|
+
)
|
188
|
+
case "restart" | "read" | "interrupt":
|
189
|
+
if input is not None:
|
190
|
+
raise ToolParsingError(
|
191
|
+
f"Do not provide 'input' with '{action}' action."
|
192
|
+
)
|
95
193
|
|
96
|
-
sandbox = await tool_container_sandbox("bash session")
|
97
194
|
store = store_as(BashSessionStore, instance=instance)
|
195
|
+
sandbox = await _get_sandbox(store)
|
98
196
|
|
99
197
|
if not store.session_id:
|
100
198
|
store.session_id = (
|
101
199
|
await exec_model_request(
|
102
|
-
sandbox
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
200
|
+
sandbox,
|
201
|
+
"bash_session_new_session",
|
202
|
+
{},
|
203
|
+
NewSessionResult,
|
204
|
+
TRANSPORT_TIMEOUT,
|
107
205
|
)
|
108
206
|
).session_name
|
109
207
|
|
110
|
-
|
208
|
+
timing: dict[str, object] = {
|
209
|
+
"wait_for_output": wait_for_output,
|
210
|
+
"idle_timeout": DEFAULT_IDLE_TIME,
|
211
|
+
}
|
212
|
+
action_specific: dict[str, dict[str, object]] = {
|
213
|
+
"type": {"input": input, **timing},
|
214
|
+
"type_submit": {"input": f"{input}\n", **timing},
|
215
|
+
"interrupt": {"input": "\u0003", **timing},
|
216
|
+
"read": timing,
|
217
|
+
"restart": {"restart": True},
|
218
|
+
}
|
219
|
+
|
220
|
+
result = await exec_scalar_request(
|
221
|
+
sandbox,
|
222
|
+
"bash_session",
|
223
|
+
{"session_name": store.session_id, **(action_specific[action])},
|
224
|
+
str,
|
225
|
+
timeout,
|
226
|
+
)
|
111
227
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
timeout=timeout,
|
119
|
-
)
|
120
|
-
).root
|
228
|
+
# Return the appropriate response
|
229
|
+
return (
|
230
|
+
"Bash session restarted."
|
231
|
+
if isinstance(result, BashRestartResult)
|
232
|
+
else result
|
233
|
+
)
|
121
234
|
|
122
|
-
|
123
|
-
return "Bash session restarted."
|
235
|
+
return execute
|
124
236
|
|
125
|
-
# return output (including stderr if any)
|
126
|
-
return f"{result.stderr}\n{result.stdout}" if result.stderr else result.stdout
|
127
237
|
|
128
|
-
|
238
|
+
async def _get_sandbox(store: BashSessionStore) -> SandboxEnvironment:
|
239
|
+
if not store.sandbox:
|
240
|
+
(sandbox, sandbox_version) = await tool_support_sandbox("bash session")
|
241
|
+
required_version = Version.parse("1.0.0")
|
242
|
+
if sandbox_version < required_version:
|
243
|
+
raise PrerequisiteError(
|
244
|
+
dedent(f"""
|
245
|
+
The 'inspect-tool-support' version in your container is '{sandbox_version}'. The 'bash_session' tool requires version '{required_version}' or newer. Please update your container image to the latest version of 'inspect-tool-support'.
|
246
|
+
""").strip()
|
247
|
+
)
|
248
|
+
store.sandbox = sandbox
|
249
|
+
|
250
|
+
return store.sandbox
|
@@ -6,7 +6,31 @@ from inspect_ai.tool._tool import TOOL_INIT_MODEL_INPUT, ToolParsingError
|
|
6
6
|
from inspect_ai.tool._tool_call import ToolCallModelInput, ToolCallModelInputHints
|
7
7
|
|
8
8
|
from . import _common as common
|
9
|
-
|
9
|
+
|
10
|
+
# this is duplicated from ._resources.tool._constants import Action
|
11
|
+
# changes should be synchronized!
|
12
|
+
|
13
|
+
Action = Literal[
|
14
|
+
"key",
|
15
|
+
"hold_key",
|
16
|
+
"type",
|
17
|
+
"cursor_position",
|
18
|
+
"mouse_move",
|
19
|
+
"left_mouse_down",
|
20
|
+
"left_mouse_up",
|
21
|
+
"left_click",
|
22
|
+
"left_click_drag",
|
23
|
+
"right_click",
|
24
|
+
"middle_click",
|
25
|
+
"back_click",
|
26
|
+
"forward_click",
|
27
|
+
"double_click",
|
28
|
+
"triple_click",
|
29
|
+
"scroll",
|
30
|
+
"wait",
|
31
|
+
"screenshot",
|
32
|
+
]
|
33
|
+
|
10
34
|
|
11
35
|
ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]]
|
12
36
|
|
@@ -96,7 +96,10 @@ def python(
|
|
96
96
|
The output of the Python code.
|
97
97
|
"""
|
98
98
|
result = await sandbox_env(sandbox).exec(
|
99
|
-
cmd=["
|
99
|
+
cmd=["bash", "--login", "-c", "python3 -"],
|
100
|
+
input=code,
|
101
|
+
timeout=timeout,
|
102
|
+
user=user,
|
100
103
|
)
|
101
104
|
# return output (including stderr if any)
|
102
105
|
output = ""
|
@@ -6,7 +6,7 @@ from pydantic import BaseModel, Discriminator, RootModel
|
|
6
6
|
from inspect_ai.tool import ToolResult
|
7
7
|
from inspect_ai.tool._tool_support_helpers import (
|
8
8
|
exec_scalar_request,
|
9
|
-
|
9
|
+
tool_support_sandbox,
|
10
10
|
)
|
11
11
|
|
12
12
|
from .._tool import Tool, tool
|
@@ -70,12 +70,13 @@ def text_editor(timeout: int | None = None, user: str | None = None) -> Tool:
|
|
70
70
|
that a change made to a file by on Subtask will be visible to another Subtask.
|
71
71
|
|
72
72
|
Args:
|
73
|
-
timeout: Timeout (in seconds) for command.
|
73
|
+
timeout: Timeout (in seconds) for command. Defaults to 180 if not provided.
|
74
74
|
user: User to execute commands as.
|
75
75
|
|
76
76
|
Returns:
|
77
77
|
String with command output (stdout) or command error (stderr).
|
78
78
|
"""
|
79
|
+
timeout = timeout or 180
|
79
80
|
|
80
81
|
async def execute(
|
81
82
|
command: Literal["view", "create", "str_replace", "insert", "undo_edit"],
|
@@ -101,7 +102,7 @@ def text_editor(timeout: int | None = None, user: str | None = None) -> Tool:
|
|
101
102
|
Returns:
|
102
103
|
The output of the command.
|
103
104
|
"""
|
104
|
-
sandbox = await
|
105
|
+
(sandbox, _) = await tool_support_sandbox("editor")
|
105
106
|
|
106
107
|
# Create a dictionary of the parameters
|
107
108
|
params = {
|
@@ -10,7 +10,7 @@ from inspect_ai.tool._tool_call import ToolCall, ToolCallContent, ToolCallView
|
|
10
10
|
from inspect_ai.tool._tool_info import parse_tool_info
|
11
11
|
from inspect_ai.tool._tool_support_helpers import (
|
12
12
|
exec_model_request,
|
13
|
-
|
13
|
+
tool_support_sandbox,
|
14
14
|
)
|
15
15
|
from inspect_ai.tool._tool_with import tool_with
|
16
16
|
from inspect_ai.util._store_model import StoreModel, store_as
|
@@ -397,8 +397,10 @@ def web_browser_refresh(instance: str | None = None) -> Tool:
|
|
397
397
|
async def _web_browser_cmd(
|
398
398
|
tool_name: str, instance: str | None, params: dict[str, object]
|
399
399
|
) -> ToolResult:
|
400
|
+
# TODO: Is it worth it to plumb this down from the @tool?
|
401
|
+
timeout = 180
|
400
402
|
try:
|
401
|
-
sandbox_env = await
|
403
|
+
(sandbox_env, _) = await tool_support_sandbox("web browser")
|
402
404
|
except PrerequisiteError as e:
|
403
405
|
# The user may have the old, incompatible, sandbox. If so, use that and
|
404
406
|
# execute the old compatible code.
|
@@ -419,13 +421,18 @@ async def _web_browser_cmd(
|
|
419
421
|
method="web_new_session",
|
420
422
|
params={"headful": False},
|
421
423
|
result_type=NewSessionResult,
|
424
|
+
timeout=timeout,
|
422
425
|
)
|
423
426
|
).session_name
|
424
427
|
|
425
428
|
params["session_name"] = store.session_id
|
426
429
|
|
427
430
|
crawler_result = await exec_model_request(
|
428
|
-
sandbox=sandbox_env,
|
431
|
+
sandbox=sandbox_env,
|
432
|
+
method=tool_name,
|
433
|
+
params=params,
|
434
|
+
result_type=CrawlerResult,
|
435
|
+
timeout=timeout,
|
429
436
|
)
|
430
437
|
if crawler_result.error and crawler_result.error.strip() != "":
|
431
438
|
raise ToolError(crawler_result.error)
|
inspect_ai/util/__init__.py
CHANGED
@@ -1,6 +1,14 @@
|
|
1
1
|
from inspect_ai._util.registry import RegistryType, registry_create
|
2
2
|
from inspect_ai._util.trace import trace_action, trace_message
|
3
|
+
from inspect_ai.util._limit import (
|
4
|
+
Limit,
|
5
|
+
LimitExceededError,
|
6
|
+
apply_limits,
|
7
|
+
message_limit,
|
8
|
+
token_limit,
|
9
|
+
)
|
3
10
|
|
11
|
+
from ._collect import collect
|
4
12
|
from ._concurrency import concurrency
|
5
13
|
from ._console import input_screen
|
6
14
|
from ._display import DisplayType, display_counter, display_type
|
@@ -21,6 +29,7 @@ from ._sandbox import (
|
|
21
29
|
sandbox_with,
|
22
30
|
sandboxenv,
|
23
31
|
)
|
32
|
+
from ._span import span
|
24
33
|
from ._store import Store, store
|
25
34
|
from ._store_model import StoreModel, store_as
|
26
35
|
from ._subprocess import (
|
@@ -31,6 +40,7 @@ from ._subtask import Subtask, subtask
|
|
31
40
|
from ._throttle import throttle
|
32
41
|
|
33
42
|
__all__ = [
|
43
|
+
"apply_limits",
|
34
44
|
"ExecResult",
|
35
45
|
"concurrency",
|
36
46
|
"DisplayType",
|
@@ -42,9 +52,12 @@ __all__ = [
|
|
42
52
|
"JSONType",
|
43
53
|
"JSONSchema",
|
44
54
|
"json_schema",
|
55
|
+
"Limit",
|
56
|
+
"message_limit",
|
45
57
|
"OutputLimitExceededError",
|
46
58
|
"resource",
|
47
59
|
"subprocess",
|
60
|
+
"LimitExceededError",
|
48
61
|
"SandboxEnvironment",
|
49
62
|
"SandboxEnvironmentConfigType",
|
50
63
|
"SandboxEnvironmentLimits",
|
@@ -60,9 +73,12 @@ __all__ = [
|
|
60
73
|
"store",
|
61
74
|
"StoreModel",
|
62
75
|
"store_as",
|
76
|
+
"span",
|
77
|
+
"collect",
|
63
78
|
"Subtask",
|
64
79
|
"subtask",
|
65
80
|
"throttle",
|
81
|
+
"token_limit",
|
66
82
|
"trace_action",
|
67
83
|
"trace_message",
|
68
84
|
"RegistryType",
|
inspect_ai/util/_anyio.py
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
import itertools
|
2
2
|
import sys
|
3
3
|
|
4
|
+
import anyio
|
5
|
+
|
6
|
+
from inspect_ai._util._async import current_async_backend
|
7
|
+
|
4
8
|
if sys.version_info < (3, 11):
|
5
9
|
from exceptiongroup import ExceptionGroup
|
6
10
|
|
@@ -36,3 +40,10 @@ def _flatten_exception(exc: Exception) -> list[Exception]:
|
|
36
40
|
]
|
37
41
|
|
38
42
|
return maybe_this_exception + other_exceptions
|
43
|
+
|
44
|
+
|
45
|
+
def safe_current_task_id() -> int | None:
|
46
|
+
if current_async_backend() is not None:
|
47
|
+
return anyio.get_current_task().id
|
48
|
+
else:
|
49
|
+
return None
|