inspect-ai 0.3.74__py3-none-any.whl → 0.3.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inspect_ai/__init__.py +3 -2
- inspect_ai/_cli/cache.py +1 -1
- inspect_ai/_cli/common.py +15 -0
- inspect_ai/_cli/eval.py +4 -5
- inspect_ai/_cli/log.py +1 -1
- inspect_ai/_cli/sandbox.py +1 -1
- inspect_ai/_cli/trace.py +1 -1
- inspect_ai/_cli/view.py +1 -1
- inspect_ai/_display/core/config.py +3 -1
- inspect_ai/_eval/eval.py +55 -61
- inspect_ai/_eval/evalset.py +64 -154
- inspect_ai/_eval/loader.py +27 -54
- inspect_ai/_eval/registry.py +4 -15
- inspect_ai/_eval/run.py +7 -4
- inspect_ai/_eval/task/__init__.py +8 -2
- inspect_ai/_eval/task/log.py +9 -1
- inspect_ai/_eval/task/resolved.py +35 -0
- inspect_ai/_eval/task/run.py +4 -0
- inspect_ai/_eval/task/task.py +50 -69
- inspect_ai/_eval/task/tasks.py +30 -0
- inspect_ai/_util/constants.py +3 -0
- inspect_ai/_util/dotenv.py +17 -0
- inspect_ai/_util/logger.py +3 -0
- inspect_ai/_util/registry.py +43 -2
- inspect_ai/_view/server.py +28 -10
- inspect_ai/_view/www/dist/assets/index.css +32 -19
- inspect_ai/_view/www/dist/assets/index.js +17682 -29989
- inspect_ai/_view/www/log-schema.json +79 -9
- inspect_ai/_view/www/package.json +2 -2
- inspect_ai/_view/www/src/appearance/styles.ts +6 -5
- inspect_ai/_view/www/src/components/AnsiDisplay.tsx +2 -2
- inspect_ai/_view/www/src/constants.ts +3 -0
- inspect_ai/_view/www/src/logfile/remoteZipFile.ts +141 -20
- inspect_ai/_view/www/src/plan/PlanDetailView.tsx +2 -1
- inspect_ai/_view/www/src/samples/SampleSummaryView.tsx +1 -1
- inspect_ai/_view/www/src/samples/chat/tools/tool.ts +7 -5
- inspect_ai/_view/www/src/samples/descriptor/score/CategoricalScoreDescriptor.tsx +1 -1
- inspect_ai/_view/www/src/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -2
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.module.css +1 -0
- inspect_ai/_view/www/src/samples/error/FlatSampleErrorView.tsx +3 -1
- inspect_ai/_view/www/src/samples/sample-tools/SortFilter.tsx +1 -1
- inspect_ai/_view/www/src/samples/sample-tools/sample-filter/SampleFilter.tsx +5 -2
- inspect_ai/_view/www/src/samples/transcript/ModelEventView.module.css +2 -2
- inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +5 -1
- inspect_ai/_view/www/src/types/log.d.ts +11 -5
- inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +17 -12
- inspect_ai/_view/www/src/workspace/sidebar/SidebarLogEntry.tsx +2 -1
- inspect_ai/_view/www/yarn.lock +12 -5
- inspect_ai/log/_log.py +10 -1
- inspect_ai/log/_recorders/eval.py +27 -8
- inspect_ai/log/_recorders/json.py +10 -2
- inspect_ai/log/_transcript.py +13 -4
- inspect_ai/model/_call_tools.py +13 -4
- inspect_ai/model/_chat_message.py +15 -1
- inspect_ai/model/_model.py +30 -12
- inspect_ai/model/_model_output.py +6 -1
- inspect_ai/model/_openai.py +11 -6
- inspect_ai/model/_providers/anthropic.py +167 -77
- inspect_ai/model/_providers/google.py +6 -2
- inspect_ai/model/_providers/none.py +31 -0
- inspect_ai/model/_providers/openai.py +11 -8
- inspect_ai/model/_providers/providers.py +7 -0
- inspect_ai/model/_providers/vertex.py +5 -2
- inspect_ai/solver/_bridge/bridge.py +1 -1
- inspect_ai/solver/_chain.py +7 -6
- inspect_ai/tool/__init__.py +4 -0
- inspect_ai/tool/_tool_call.py +5 -2
- inspect_ai/tool/_tool_support_helpers.py +200 -0
- inspect_ai/tool/_tools/_bash_session.py +119 -0
- inspect_ai/tool/_tools/_computer/_computer.py +1 -1
- inspect_ai/tool/_tools/_text_editor.py +121 -0
- inspect_ai/tool/_tools/_web_browser/_back_compat.py +150 -0
- inspect_ai/tool/_tools/_web_browser/_web_browser.py +75 -130
- inspect_ai/tool/_tools/_web_search.py +2 -2
- inspect_ai/util/_json.py +28 -0
- inspect_ai/util/_sandbox/context.py +18 -8
- inspect_ai/util/_sandbox/docker/config.py +1 -1
- inspect_ai/util/_sandbox/docker/internal.py +3 -3
- inspect_ai/util/_sandbox/environment.py +17 -2
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/METADATA +8 -5
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/RECORD +85 -108
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/WHEEL +1 -1
- inspect_ai/tool/_tools/_web_browser/_resources/.pylintrc +0 -8
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/launch.json +0 -24
- inspect_ai/tool/_tools/_web_browser/_resources/.vscode/settings.json +0 -25
- inspect_ai/tool/_tools/_web_browser/_resources/Dockerfile +0 -22
- inspect_ai/tool/_tools/_web_browser/_resources/README.md +0 -63
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree.py +0 -71
- inspect_ai/tool/_tools/_web_browser/_resources/accessibility_tree_node.py +0 -323
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/__init__.py +0 -5
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/a11y.py +0 -279
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom.py +0 -9
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/dom_snapshot.py +0 -293
- inspect_ai/tool/_tools/_web_browser/_resources/cdp/page.py +0 -94
- inspect_ai/tool/_tools/_web_browser/_resources/constants.py +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/images/usage_diagram.svg +0 -2
- inspect_ai/tool/_tools/_web_browser/_resources/mock_environment.py +0 -45
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_browser.py +0 -50
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_crawler.py +0 -48
- inspect_ai/tool/_tools/_web_browser/_resources/playwright_page_crawler.py +0 -280
- inspect_ai/tool/_tools/_web_browser/_resources/pyproject.toml +0 -65
- inspect_ai/tool/_tools/_web_browser/_resources/rectangle.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/rpc_client_helpers.py +0 -146
- inspect_ai/tool/_tools/_web_browser/_resources/scale_factor.py +0 -64
- inspect_ai/tool/_tools/_web_browser/_resources/test_accessibility_tree_node.py +0 -180
- inspect_ai/tool/_tools/_web_browser/_resources/test_playwright_crawler.py +0 -99
- inspect_ai/tool/_tools/_web_browser/_resources/test_rectangle.py +0 -15
- inspect_ai/tool/_tools/_web_browser/_resources/test_web_client.py +0 -44
- inspect_ai/tool/_tools/_web_browser/_resources/web_browser_rpc_types.py +0 -39
- inspect_ai/tool/_tools/_web_browser/_resources/web_client.py +0 -214
- inspect_ai/tool/_tools/_web_browser/_resources/web_client_new_session.py +0 -35
- inspect_ai/tool/_tools/_web_browser/_resources/web_server.py +0 -192
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/entry_points.txt +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info/licenses}/LICENSE +0 -0
- {inspect_ai-0.3.74.dist-info → inspect_ai-0.3.76.dist-info}/top_level.txt +0 -0
inspect_ai/tool/_tool_call.py
CHANGED
@@ -44,8 +44,11 @@ class ToolCall:
|
|
44
44
|
arguments: dict[str, Any]
|
45
45
|
"""Arguments to function."""
|
46
46
|
|
47
|
-
type:
|
48
|
-
"""Type of tool call (
|
47
|
+
type: str
|
48
|
+
"""Type of tool call ('function' or a model specific internal tool type)"""
|
49
|
+
|
50
|
+
internal_name: str | None = field(default=None)
|
51
|
+
"""Model's internal name for the tool - if any."""
|
49
52
|
|
50
53
|
parse_error: str | None = field(default=None)
|
51
54
|
"""Error which occurred parsing tool call."""
|
@@ -0,0 +1,200 @@
|
|
1
|
+
"""
|
2
|
+
This module provides helper code for handling JSON-RPC communication between the inspect process and the `inspect-tool-support` package code running in the sandbox environment.
|
3
|
+
|
4
|
+
It includes definitions for JSON-RPC request and response models, as well as functions to create and parse JSON-RPC requests and responses.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import json
|
8
|
+
from itertools import count
|
9
|
+
from textwrap import dedent
|
10
|
+
from typing import Literal, Type, TypeVar, cast
|
11
|
+
|
12
|
+
from pydantic import BaseModel, RootModel
|
13
|
+
|
14
|
+
from inspect_ai._util.error import PrerequisiteError
|
15
|
+
from inspect_ai.tool._tool import ToolError, ToolParsingError
|
16
|
+
from inspect_ai.util import sandbox_with
|
17
|
+
from inspect_ai.util._sandbox.environment import SandboxEnvironment
|
18
|
+
|
19
|
+
|
20
|
+
class JSONRPCResponseBase(BaseModel):
    """Fields shared by both JSON-RPC response envelopes (success and error)."""

    # Protocol version tag; only the literal "2.0" validates.
    jsonrpc: Literal["2.0"]

    # Identifier carried by the response.
    id: int | float | str
|
23
|
+
|
24
|
+
|
25
|
+
class JSONRPCSuccessResponse(JSONRPCResponseBase):
    """Response envelope that carries a `result` payload."""

    # Deliberately untyped at this layer; `_parse_json_rpc_response` checks it
    # against the caller-supplied result class.
    result: object
|
27
|
+
|
28
|
+
|
29
|
+
class JSONRPCError(BaseModel):
    """Error object embedded in a JSON-RPC error response.

    See: https://www.jsonrpc.org/specification#error_object
    """

    # Numeric error code.
    code: int

    # Error description.
    message: str

    # Optional additional payload; absent unless the response provides it.
    data: object | None = None
|
35
|
+
|
36
|
+
|
37
|
+
class JSONRPCErrorResponse(JSONRPCResponseBase):
    """Response envelope that carries an `error` object instead of a result."""

    error: JSONRPCError
|
39
|
+
|
40
|
+
|
41
|
+
class JSONRPCResponse(RootModel[JSONRPCSuccessResponse | JSONRPCErrorResponse]):
    """Root model that validates a payload as either a success or an error envelope."""
|
43
|
+
|
44
|
+
|
45
|
+
BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
|
46
|
+
StrOrModelT = TypeVar("StrOrModelT", bound=str | BaseModel)
|
47
|
+
|
48
|
+
id_generator = count(666)
|
49
|
+
|
50
|
+
|
51
|
+
async def exec_sandbox_rpc(
    sandbox: SandboxEnvironment,
    method: str,
    params: dict[str, object] | tuple[object, ...],
    result_cls: Type[StrOrModelT],
    timeout: int | None = None,
    user: str | None = None,
) -> StrOrModelT:
    """
    Run a single JSON-RPC call against the tool support CLI in a sandbox.

    The serialized request is passed to `SANDBOX_CLI exec` on stdin and the
    response is parsed from its stdout.

    Args:
      sandbox (SandboxEnvironment): Sandbox in which the CLI is executed.
      method (str): Name of the JSON-RPC method to invoke.
      params (dict[str, object] | tuple[object, ...]): Named or positional
        parameters for the method.
      result_cls (Type[StrOrModelT]): `str`, or the pydantic model class the
        `result` payload is validated against.
      timeout (int | None, optional): Timeout forwarded to `sandbox.exec`.
        Defaults to None.
      user (str | None, optional): Username or UID forwarded to `sandbox.exec`.
        Defaults to None.

    Returns:
      StrOrModelT: The `result` payload as an instance of `result_cls`.

    Raises:
      RuntimeError: If `sandbox.exec` reports failure, if the response holds an
        error with a code not handled below, or if the parsed value is neither
        an error nor an instance of `result_cls`.
      ToolParsingError: If the response error code is -32601 or -32602.
      ToolError: If the response error code is -32000.
    """
    request = _create_json_rpc_request(method, params)
    exec_result = await sandbox.exec(
        [SANDBOX_CLI, "exec"],
        input=request,
        timeout=timeout,
        user=user,
    )

    if not exec_result.success:
        raise RuntimeError(
            f"Sandbox.exec failure executing {_rpc_call_description(method, params)}: {exec_result.stderr}"
        )

    parsed = _parse_json_rpc_response(exec_result.stdout, result_cls)

    if isinstance(parsed, JSONRPCError):
        code, message = parsed.code, parsed.message
        # -32601 / -32602: "method not found" / "invalid params" in JSON-RPC 2.0;
        # surfaced as a parsing problem with the tool call itself.
        if code in (-32601, -32602):
            raise ToolParsingError(message)
        # -32000 is reported as an ordinary tool error.
        if code == -32000:
            raise ToolError(message)
        # Any other code is treated as an unexpected failure.
        raise RuntimeError(
            f"Error executing tool command {_rpc_call_description(method, params)}: {code=} {message}"
        )

    if isinstance(parsed, result_cls):
        return parsed

    # Defensive fallback: the parser should only ever hand back an error or an
    # instance of result_cls.
    raise RuntimeError(
        f"Error executing tool command {_rpc_call_description(method, params)}: {parsed}"
    )
|
108
|
+
|
109
|
+
|
110
|
+
SANDBOX_CLI = "inspect-tool-support"
|
111
|
+
INSPECT_TOOL_SUPPORT_IMAGE_DOCKERHUB = "aisiuk/inspect-tool-support"
|
112
|
+
|
113
|
+
|
114
|
+
async def tool_container_sandbox(tool_name: str) -> SandboxEnvironment:
    """
    Locate the sandbox that provides the tool support CLI.

    Args:
      tool_name (str): Human-readable tool name, used only in the error message.

    Returns:
      SandboxEnvironment: The sandbox returned by `sandbox_with` for `SANDBOX_CLI`.

    Raises:
      PrerequisiteError: If `sandbox_with` finds no matching sandbox; the
        message explains how to add the tool support image or package.
    """
    found = await sandbox_with(SANDBOX_CLI, True)
    if found:
        return found

    # No sandbox exposes the CLI: explain the two supported ways to provide it.
    raise PrerequisiteError(
        dedent(f"""
            The {tool_name} service was not found in any of the sandboxes for this sample. Please add the {tool_name} to your configuration.

            For example, the following Docker compose file uses the {INSPECT_TOOL_SUPPORT_IMAGE_DOCKERHUB} reference image as its default sandbox:

            services:
              default:
                image: "{INSPECT_TOOL_SUPPORT_IMAGE_DOCKERHUB}"
                init: true

            Alternatively, you can include the service into your own Dockerfile:

            RUN python -m venv /opt/inspect_tool_support
            ENV PATH="/opt/inspect_tool_support/bin:$PATH"
            RUN pip install inspect-tool-support
            RUN inspect-tool-support post-install
            """).strip()
    )
|
137
|
+
|
138
|
+
|
139
|
+
def _create_json_rpc_request(
    method: str, params: dict[str, object] | tuple[object, ...]
) -> str:
    """
    Serialize a JSON-RPC 2.0 request for `method`.

    Each call consumes the next value of the module-level `id_generator` as
    the request id.

    Args:
      method (str): The name of the RPC method.
      params (dict[str, object] | tuple[object, ...]): Named parameters are
        sent as-is; a tuple of positional parameters is sent as a list.

    Returns:
      str: The JSON-encoded request.
    """
    request: dict[str, object] = {
        "jsonrpc": "2.0",
        "method": method,
        "id": next(id_generator),
    }
    if isinstance(params, tuple):
        request["params"] = list(params)
    else:
        request["params"] = params
    return json.dumps(request)
|
150
|
+
|
151
|
+
|
152
|
+
def _rpc_call_description(
|
153
|
+
method: str, params: dict[str, object] | tuple[object, ...]
|
154
|
+
) -> str:
|
155
|
+
"""
|
156
|
+
Generate a string description of an RPC call.
|
157
|
+
|
158
|
+
Args:
|
159
|
+
method (str): The name of the RPC method.
|
160
|
+
params (dict[str, object] | tuple[object, ...]): The parameters for the RPC method.
|
161
|
+
|
162
|
+
Returns:
|
163
|
+
str: A string description of the RPC call.
|
164
|
+
|
165
|
+
Examples:
|
166
|
+
>>> _rpc_call_description("subtract", {"minuend": 42, "subtrahend": 23})
|
167
|
+
'subtract(minuend: 42, subtrahend: 23)'
|
168
|
+
|
169
|
+
>>> _rpc_call_description("subtract", (42, 23))
|
170
|
+
'subtract(42, 23)'
|
171
|
+
"""
|
172
|
+
normalized_params = (
|
173
|
+
list(map(str, params))
|
174
|
+
if isinstance(params, tuple)
|
175
|
+
else [f"{k}: {v}" for k, v in params.items()]
|
176
|
+
)
|
177
|
+
return f"{method}({', '.join(normalized_params)})"
|
178
|
+
|
179
|
+
|
180
|
+
def _parse_json_rpc_response(
    response_str: str,
    result_cls: Type[StrOrModelT],
) -> StrOrModelT | JSONRPCError:
    """
    Parse a raw JSON-RPC response into its error object or typed result.

    Args:
      response_str (str): The JSON text of the response.
      result_cls (Type[StrOrModelT]): `str`, or the pydantic model class the
        `result` payload must validate against (strictly).

    Returns:
      StrOrModelT | JSONRPCError: The error object for an error response,
      otherwise the result as an instance of `result_cls`.

    Raises:
      ValueError: If a string result was requested but the payload is not a
        string, or if the envelope is neither a success nor an error response.
    """
    envelope = JSONRPCResponse.model_validate_json(response_str).root

    if isinstance(envelope, JSONRPCErrorResponse):
        return envelope.error

    if isinstance(envelope, JSONRPCSuccessResponse):
        rpc_result = envelope.result
        # The casts exist only to satisfy the type checker, which cannot
        # narrow StrOrModelT from the runtime checks.
        if result_cls is not str:
            validated = cast(BaseModel, result_cls).model_validate(
                rpc_result, strict=True
            )
            return cast(StrOrModelT, validated)
        if not isinstance(rpc_result, str):
            raise ValueError(f"Expected string result, got {type(rpc_result)}")
        return cast(StrOrModelT, rpc_result)

    raise ValueError(f"Unexpected JSON RPC response: {response_str}")
|
@@ -0,0 +1,119 @@
|
|
1
|
+
from pydantic import BaseModel, Field, RootModel
|
2
|
+
|
3
|
+
from inspect_ai.tool import ToolResult
|
4
|
+
from inspect_ai.tool._tool_support_helpers import (
|
5
|
+
exec_sandbox_rpc,
|
6
|
+
tool_container_sandbox,
|
7
|
+
)
|
8
|
+
from inspect_ai.util import StoreModel, store_as
|
9
|
+
|
10
|
+
from .._tool import Tool, ToolParsingError, tool
|
11
|
+
from .._tool_call import ToolCall, ToolCallContent, ToolCallView, ToolCallViewer
|
12
|
+
|
13
|
+
|
14
|
+
# These models are cloned from the container code. Until we decide to create
|
15
|
+
# a package that is shared between the inspect and tool-container codebases, we'll
|
16
|
+
# just have to live with the duplication.
|
17
|
+
class NewSessionResult(BaseModel):
    """Result payload of the `bash_session_new_session` RPC."""

    # Name of the newly created session; sent back as `session_name` on later calls.
    session_name: str
|
19
|
+
|
20
|
+
|
21
|
+
class BashRestartResult(BaseModel):
    """Field-less result payload; `bash_session` reports it as a restart."""
|
23
|
+
|
24
|
+
|
25
|
+
class BashCommandResult(BaseModel):
    """Result payload for an executed bash command."""

    # Integer status reported for the command.
    status: int

    # Captured standard output.
    stdout: str

    # Captured standard error.
    stderr: str
|
29
|
+
|
30
|
+
|
31
|
+
class BashResult(RootModel[BashRestartResult | BashCommandResult]):
    """Root model accepting either a restart result or a command result."""
|
33
|
+
|
34
|
+
|
35
|
+
class BashSessionStore(StoreModel):
    """Store-backed state for the bash session tool."""

    # Session name obtained from the sandbox; empty until a session is created.
    session_id: str = Field(default_factory=str)
|
37
|
+
|
38
|
+
|
39
|
+
# custom viewer for bash
|
40
|
+
def code_viewer(language: str, code_param: str) -> ToolCallViewer:
    """Build a viewer that renders one tool-call argument as a fenced code block.

    Args:
      language: Language tag used for both the view title and the code fence.
      code_param: Name of the tool-call argument holding the code.

    Returns:
      Viewer that shows the argument's text, or the called function's name
      when the argument is missing or empty.
    """

    def viewer(tool_call: ToolCall) -> ToolCallView:
        # Fall back to the function name when no code was supplied.
        snippet = tool_call.arguments.get(code_param) or tool_call.function
        fenced = f"```{language}\n" + snippet.strip() + "\n```\n"
        return ToolCallView(
            call=ToolCallContent(title=language, format="markdown", content=fenced)
        )

    return viewer
|
52
|
+
|
53
|
+
|
54
|
+
@tool(viewer=code_viewer("bash", "command"))
def bash_session(timeout: int | None = None) -> Tool:
    """Bash shell session command execution tool.

    Execute bash shell commands in a long running session using a sandbox environment (e.g. "docker").

    Args:
      timeout: Timeout (in seconds) for command.

    Returns:
      String with command output (stdout) or command error (stderr).
    """

    async def execute(
        command: str | None = None,
        restart: bool | None = None,
    ) -> ToolResult:
        """
        Use this function to execute bash commands.

        Args:
          command: The bash command to run. Required unless the tool is being restarted.
          restart: Specifying true will restart this tool. Otherwise, leave this unspecified.

        Returns:
          The output of the command.
        """
        # Exactly one of the two arguments must be supplied: reject the call
        # when both are given or both are omitted.
        if (command is None) == (restart is None):
            raise ToolParsingError(
                "Either 'command' or 'restart' must be specified, but not both."
            )

        sandbox = await tool_container_sandbox("bash session")
        store = store_as(BashSessionStore)

        # Create the session on first use and remember its name in the store.
        if not store.session_id:
            new_session = await exec_sandbox_rpc(
                sandbox,
                "bash_session_new_session",
                {},
                NewSessionResult,
                timeout=timeout,
            )
            store.session_id = new_session.session_name

        params: dict[str, object] = {
            "command": command,
            "restart": restart,
            "session_name": store.session_id,
        }
        response = await exec_sandbox_rpc(
            sandbox,
            "bash_session",
            params,
            BashResult,
            timeout=timeout,
        )
        result = response.root

        if isinstance(result, BashRestartResult):
            return "Bash session restarted."

        # Put stderr (when present) ahead of stdout.
        if result.stderr:
            return f"{result.stderr}\n{result.stdout}"
        return result.stdout

    return execute
|
@@ -15,7 +15,7 @@ ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]]
|
|
15
15
|
def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool:
|
16
16
|
"""Desktop computer tool.
|
17
17
|
|
18
|
-
See documentation at <https://inspect.
|
18
|
+
See documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-computer>.
|
19
19
|
|
20
20
|
Args:
|
21
21
|
max_screenshots: The maximum number of screenshots to play
|
@@ -0,0 +1,121 @@
|
|
1
|
+
import inspect
|
2
|
+
from typing import Annotated, Literal
|
3
|
+
|
4
|
+
from pydantic import BaseModel, Discriminator, RootModel
|
5
|
+
|
6
|
+
from inspect_ai.tool import ToolResult
|
7
|
+
from inspect_ai.tool._tool_support_helpers import (
|
8
|
+
exec_sandbox_rpc,
|
9
|
+
tool_container_sandbox,
|
10
|
+
)
|
11
|
+
|
12
|
+
from .._tool import Tool, tool
|
13
|
+
|
14
|
+
# These models are cloned from the container code. Until we decide to create
|
15
|
+
# a package that is shared between the inspect and tool-container codebases, we'll
|
16
|
+
# just have to live with the duplication.
|
17
|
+
|
18
|
+
|
19
|
+
class BaseParams(BaseModel):
    """Field common to every text editor command."""

    # Target file or directory path.
    path: str
|
21
|
+
|
22
|
+
|
23
|
+
class ViewParams(BaseParams):
    """Parameters of the `view` command."""

    command: Literal["view"] = "view"

    # Optional line range to show; None means no range restriction.
    view_range: list[int] | None = None
|
26
|
+
|
27
|
+
|
28
|
+
class CreateParams(BaseParams):
    """Parameters of the `create` command."""

    command: Literal["create"] = "create"

    # Content of the file to be created (required).
    file_text: str
|
31
|
+
|
32
|
+
|
33
|
+
class StrReplaceParams(BaseParams):
    """Parameters of the `str_replace` command."""

    command: Literal["str_replace"] = "str_replace"

    # String to be replaced (required).
    old_str: str

    # Replacement string; optional for this command.
    new_str: str | None = None
|
37
|
+
|
38
|
+
|
39
|
+
class InsertParams(BaseParams):
    """Parameters of the `insert` command."""

    command: Literal["insert"] = "insert"

    # Line number the insertion is positioned relative to (required).
    insert_line: int

    # String to insert (required).
    new_str: str
|
43
|
+
|
44
|
+
|
45
|
+
class UndoEditParams(BaseParams):
    """Parameters of the `undo_edit` command (only `path` is needed)."""

    command: Literal["undo_edit"] = "undo_edit"
|
47
|
+
|
48
|
+
|
49
|
+
class TextEditorParams(
    RootModel[
        ViewParams | CreateParams | StrReplaceParams | InsertParams | UndoEditParams
    ]
):
    """Union of all text editor command parameter models.

    The `command` field acts as the discriminator that selects the concrete
    parameter model during validation.
    """

    root: Annotated[
        ViewParams | CreateParams | StrReplaceParams | InsertParams | UndoEditParams,
        Discriminator("command"),
    ]
|
58
|
+
|
59
|
+
|
60
|
+
TextEditorResult = str
|
61
|
+
|
62
|
+
|
63
|
+
@tool()
def text_editor(timeout: int | None = None, user: str | None = None) -> Tool:
    """Custom editing tool for viewing, creating and editing files.

    Perform text editor operations using a sandbox environment (e.g. "docker").

    IMPORTANT: This tool does not currently support Subtask isolation. This means
    that a change made to a file by one Subtask will be visible to another Subtask.

    Args:
      timeout: Timeout (in seconds) for command.
      user: User to execute commands as.

    Returns:
      String with command output (stdout) or command error (stderr).
    """

    async def execute(
        command: Literal["view", "create", "str_replace", "insert", "undo_edit"],
        path: str,
        file_text: str | None = None,
        insert_line: int | None = None,
        new_str: str | None = None,
        old_str: str | None = None,
        view_range: list[int] | None = None,
    ) -> ToolResult:
        """
        Use this function to execute text editing commands.

        Args:
          command: The command to execute.
          path: Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.
          file_text: Required parameter of `create` command, with the content of the file to be created.
          insert_line: Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.
          new_str: Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.
          old_str: Required parameter of `str_replace` command containing the string in `path` to replace.
          view_range: Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.

        Returns:
          The output of the command.
        """
        sandbox = await tool_container_sandbox("editor")

        # Forward every tool argument, in signature order. Spelled out
        # explicitly rather than filtered from locals(), so the payload does
        # not depend on which other names happen to be in scope.
        params: dict[str, object] = {
            "command": command,
            "path": path,
            "file_text": file_text,
            "insert_line": insert_line,
            "new_str": new_str,
            "old_str": old_str,
            "view_range": view_range,
        }

        # `user` must be passed through here; otherwise the documented
        # "User to execute commands as" option has no effect.
        return await exec_sandbox_rpc(
            sandbox,
            "text_editor",
            params,
            TextEditorResult,
            timeout=timeout,
            user=user,
        )

    return execute
|
@@ -0,0 +1,150 @@
|
|
1
|
+
"""This module provides the "old" client code for running against the, now deprecated, `aisiuk/inspect-web-browser-tool` image."""
|
2
|
+
|
3
|
+
import re
|
4
|
+
from logging import getLogger
|
5
|
+
from textwrap import dedent
|
6
|
+
|
7
|
+
from pydantic import Field
|
8
|
+
|
9
|
+
from inspect_ai._util.content import ContentText
|
10
|
+
from inspect_ai._util.error import PrerequisiteError
|
11
|
+
from inspect_ai._util.logger import warn_once
|
12
|
+
from inspect_ai.tool import ToolError, ToolResult
|
13
|
+
from inspect_ai.util import SandboxEnvironment, StoreModel, sandbox_with, store_as
|
14
|
+
from inspect_ai.util._sandbox.docker.internal import (
|
15
|
+
INSPECT_WEB_BROWSER_IMAGE_DOCKERHUB_DEPRECATED,
|
16
|
+
)
|
17
|
+
|
18
|
+
logger = getLogger("web_browser")
|
19
|
+
|
20
|
+
WEB_CLIENT_REQUEST = "/app/web_browser/web_client.py"
|
21
|
+
WEB_CLIENT_NEW_SESSION = "/app/web_browser/web_client_new_session.py"
|
22
|
+
|
23
|
+
|
24
|
+
class WebBrowserStore(StoreModel):
    """Store-backed state for the deprecated web browser client."""

    # Main content text from the most recent browser response.
    main_content: str = Field(default_factory=str)

    # Accessibility tree text from the most recent browser response.
    web_at: str = Field(default_factory=str)

    # Browser session name; empty until a session has been created.
    session_id: str = Field(default_factory=str)
|
28
|
+
|
29
|
+
|
30
|
+
async def old_web_browser_cmd(cmd: str, *args: str) -> ToolResult:
    """Run a web browser command against the deprecated web browser image.

    Creates a browser session on first use (its name is kept in
    `WebBrowserStore`), runs the command through the in-container client
    script, and records the returned page state in the store.

    Args:
      cmd: Name of the web browser command to run.
      *args: String arguments passed to the command.

    Returns:
      Two text items (main content, then accessibility tree) when the response
      has main content; otherwise the accessibility tree string alone.

    Raises:
      PrerequisiteError: If no sandbox provides the web browser client.
      RuntimeError: If session creation or command execution fails, or the
        output has neither an 'error' nor a 'web_at' field.
      ToolError: If the response carries a non-blank 'error' field.
    """
    sandbox_env = await _web_browser_sandbox()
    warn_once(
        logger,
        "WARNING: Use of the `aisiuk/inspect-web-browser-tool` image is deprecated. Please update your configuration to use the `aisiuk/inspect-tool-support` image or install the `inspect-tool-support` package into your own image.",
    )

    store = store_as(WebBrowserStore)
    if not store.session_id:
        result = await sandbox_env.exec(
            ["python3", WEB_CLIENT_NEW_SESSION], timeout=180
        )

        if not result.success:
            raise RuntimeError(
                f"Error creating new web browser session: {result.stderr}"
            )

        store.session_id = result.stdout.strip("\n")

    # The flag string always contains the `--session_name=` prefix and is
    # therefore never empty, so it is passed unconditionally.
    session_flag = f"--session_name={store.session_id}"
    arg_list = ["python3", WEB_CLIENT_REQUEST, session_flag, cmd, *args]

    result = await sandbox_env.exec(arg_list, timeout=180)
    if not result.success:
        raise RuntimeError(
            f"Error executing web browser command {cmd}({', '.join(args)}): {result.stderr}"
        )

    response = _parse_web_browser_output(result.stdout)
    if "error" in response and response.get("error", "").strip() != "":
        raise ToolError(str(response.get("error")) or "(unknown error)")
    elif "web_at" in response:
        main_content = str(response.get("main_content")) or None
        web_at = (
            str(response.get("web_at")) or "(no web accessibility tree available)"
        )
        # Remove base64 data from images.
        web_at_lines = [
            line.partition("data:image/png;base64")[0]
            for line in web_at.split("\n")
        ]

        store_as(WebBrowserStore).main_content = (
            main_content or "(no main text summary)"
        )
        # The store receives the tree as returned; only the value handed back
        # to the caller has the image data stripped.
        store_as(WebBrowserStore).web_at = web_at

        web_at = "\n".join(web_at_lines)
        return (
            [
                ContentText(text=f"main content:\n{main_content}\n\n"),
                ContentText(text=f"accessibility tree:\n{web_at}"),
            ]
            if main_content
            else web_at
        )
    else:
        raise RuntimeError(
            f"web_browser output must contain either 'error' or 'web_at' field: {result.stdout}"
        )
|
96
|
+
|
97
|
+
|
98
|
+
async def _web_browser_sandbox() -> SandboxEnvironment:
    """Locate the sandbox that contains the deprecated web browser client.

    Returns:
      The sandbox returned by `sandbox_with` for `WEB_CLIENT_REQUEST`.

    Raises:
      PrerequisiteError: If `sandbox_with` finds no matching sandbox; the
        message shows example Docker compose configurations.
    """
    found = await sandbox_with(WEB_CLIENT_REQUEST)
    if found:
        return found

    # No sandbox has the client script: show how to configure one.
    raise PrerequisiteError(
        dedent(f"""
            The web browser service was not found in any of the sandboxes for this sample. Please add the web browser service to your configuration. For example, the following Docker compose file uses the {INSPECT_WEB_BROWSER_IMAGE_DOCKERHUB_DEPRECATED} image as its default sandbox:

            services:
              default:
                image: "{INSPECT_WEB_BROWSER_IMAGE_DOCKERHUB_DEPRECATED}"
                init: true

            Alternatively, this Docker compose file creates a dedicated image for the web browser service:

            services:
              default:
                image: "python:3.12-bookworm"
                init: true
                command: "tail -f /dev/null"

              web_browser:
                image: "{INSPECT_WEB_BROWSER_IMAGE_DOCKERHUB_DEPRECATED}"
                init: true
            """).strip()
    )
|
124
|
+
|
125
|
+
|
126
|
+
def _parse_web_browser_output(output: str) -> dict[str, str]:
|
127
|
+
response: dict[str, str] = dict(
|
128
|
+
web_url="", main_content="", web_at="", info="", error=""
|
129
|
+
)
|
130
|
+
active_field: str | None = None
|
131
|
+
active_field_lines: list[str] = []
|
132
|
+
|
133
|
+
def collect_active_field() -> None:
|
134
|
+
if active_field is not None:
|
135
|
+
response[active_field] = "\n".join(active_field_lines)
|
136
|
+
active_field_lines.clear()
|
137
|
+
|
138
|
+
for line in output.splitlines():
|
139
|
+
field_match = re.match(
|
140
|
+
r"^(error|main_content|web_at|web_url|info)\s*:\s*(.+)$", line
|
141
|
+
)
|
142
|
+
if field_match:
|
143
|
+
collect_active_field()
|
144
|
+
active_field = field_match.group(1)
|
145
|
+
active_field_lines.append(field_match.group(2))
|
146
|
+
else:
|
147
|
+
active_field_lines.append(line)
|
148
|
+
collect_active_field()
|
149
|
+
|
150
|
+
return response
|