inspect-ai 0.3.92__py3-none-any.whl → 0.3.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. inspect_ai/_cli/eval.py +27 -0
  2. inspect_ai/_display/textual/widgets/samples.py +3 -3
  3. inspect_ai/_display/textual/widgets/transcript.py +3 -29
  4. inspect_ai/_eval/eval.py +19 -2
  5. inspect_ai/_eval/evalset.py +4 -1
  6. inspect_ai/_eval/run.py +41 -0
  7. inspect_ai/_eval/task/generate.py +38 -44
  8. inspect_ai/_eval/task/log.py +26 -28
  9. inspect_ai/_eval/task/run.py +23 -27
  10. inspect_ai/_util/answer.py +26 -0
  11. inspect_ai/_util/constants.py +0 -1
  12. inspect_ai/_util/local_server.py +398 -0
  13. inspect_ai/_util/working.py +10 -4
  14. inspect_ai/_view/www/dist/assets/index.css +173 -159
  15. inspect_ai/_view/www/dist/assets/index.js +1417 -1142
  16. inspect_ai/_view/www/log-schema.json +379 -3
  17. inspect_ai/_view/www/package.json +1 -1
  18. inspect_ai/_view/www/src/@types/log.d.ts +93 -14
  19. inspect_ai/_view/www/src/app/content/MetaDataGrid.tsx +2 -2
  20. inspect_ai/_view/www/src/app/content/MetaDataView.module.css +1 -1
  21. inspect_ai/_view/www/src/app/content/MetadataGrid.module.css +1 -1
  22. inspect_ai/_view/www/src/app/content/RenderedContent.tsx +1 -1
  23. inspect_ai/_view/www/src/app/log-view/LogView.tsx +11 -0
  24. inspect_ai/_view/www/src/app/log-view/tabs/InfoTab.tsx +2 -9
  25. inspect_ai/_view/www/src/app/log-view/tabs/ModelsTab.tsx +51 -0
  26. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.module.css +6 -0
  27. inspect_ai/_view/www/src/app/log-view/tabs/TaskTab.tsx +143 -0
  28. inspect_ai/_view/www/src/app/plan/ModelCard.tsx +1 -2
  29. inspect_ai/_view/www/src/app/plan/PlanCard.tsx +29 -7
  30. inspect_ai/_view/www/src/app/plan/PlanDetailView.module.css +1 -1
  31. inspect_ai/_view/www/src/app/plan/PlanDetailView.tsx +1 -198
  32. inspect_ai/_view/www/src/app/samples/descriptor/score/NumericScoreDescriptor.tsx +2 -1
  33. inspect_ai/_view/www/src/app/samples/transcript/SandboxEventView.module.css +2 -1
  34. inspect_ai/_view/www/src/app/samples/transcript/SpanEventView.tsx +174 -0
  35. inspect_ai/_view/www/src/app/samples/transcript/ToolEventView.tsx +8 -8
  36. inspect_ai/_view/www/src/app/samples/transcript/TranscriptView.tsx +12 -2
  37. inspect_ai/_view/www/src/app/samples/transcript/TranscriptVirtualListComponent.module.css +1 -1
  38. inspect_ai/_view/www/src/app/samples/transcript/event/EventPanel.tsx +0 -3
  39. inspect_ai/_view/www/src/app/samples/transcript/transform/fixups.ts +87 -25
  40. inspect_ai/_view/www/src/app/samples/transcript/transform/treeify.ts +229 -17
  41. inspect_ai/_view/www/src/app/samples/transcript/transform/utils.ts +11 -0
  42. inspect_ai/_view/www/src/app/samples/transcript/types.ts +5 -1
  43. inspect_ai/_view/www/src/app/usage/ModelUsagePanel.tsx +3 -2
  44. inspect_ai/_view/www/src/app/usage/TokenTable.module.css +4 -1
  45. inspect_ai/_view/www/src/app/usage/TokenTable.tsx +2 -2
  46. inspect_ai/_view/www/src/app/usage/UsageCard.module.css +8 -3
  47. inspect_ai/_view/www/src/app/usage/UsageCard.tsx +1 -35
  48. inspect_ai/_view/www/src/components/Card.css +0 -1
  49. inspect_ai/_view/www/src/constants.ts +2 -0
  50. inspect_ai/_view/www/src/utils/numeric.ts +17 -0
  51. inspect_ai/agent/_agent.py +3 -3
  52. inspect_ai/agent/_as_solver.py +22 -12
  53. inspect_ai/agent/_as_tool.py +20 -6
  54. inspect_ai/agent/_handoff.py +12 -1
  55. inspect_ai/agent/_react.py +4 -3
  56. inspect_ai/agent/_run.py +16 -3
  57. inspect_ai/agent/_types.py +9 -0
  58. inspect_ai/dataset/_dataset.py +6 -3
  59. inspect_ai/log/__init__.py +14 -0
  60. inspect_ai/log/_convert.py +4 -9
  61. inspect_ai/log/_file.py +56 -0
  62. inspect_ai/log/_log.py +99 -0
  63. inspect_ai/log/_recorders/__init__.py +2 -0
  64. inspect_ai/log/_recorders/buffer/database.py +12 -11
  65. inspect_ai/log/_recorders/buffer/filestore.py +2 -2
  66. inspect_ai/log/_recorders/buffer/types.py +2 -2
  67. inspect_ai/log/_recorders/eval.py +20 -65
  68. inspect_ai/log/_recorders/file.py +28 -6
  69. inspect_ai/log/_recorders/recorder.py +7 -0
  70. inspect_ai/log/_recorders/types.py +1 -23
  71. inspect_ai/log/_samples.py +14 -25
  72. inspect_ai/log/_transcript.py +84 -36
  73. inspect_ai/log/_tree.py +118 -0
  74. inspect_ai/log/_util.py +52 -0
  75. inspect_ai/model/__init__.py +5 -1
  76. inspect_ai/model/_call_tools.py +72 -44
  77. inspect_ai/model/_generate_config.py +14 -8
  78. inspect_ai/model/_model.py +66 -88
  79. inspect_ai/model/_model_output.py +25 -0
  80. inspect_ai/model/_openai.py +2 -0
  81. inspect_ai/model/_providers/anthropic.py +13 -23
  82. inspect_ai/model/_providers/hf.py +27 -1
  83. inspect_ai/model/_providers/openai_o1.py +8 -2
  84. inspect_ai/model/_providers/providers.py +18 -4
  85. inspect_ai/model/_providers/sglang.py +247 -0
  86. inspect_ai/model/_providers/vllm.py +211 -400
  87. inspect_ai/scorer/_choice.py +1 -2
  88. inspect_ai/solver/__init__.py +7 -2
  89. inspect_ai/solver/_basic_agent.py +3 -10
  90. inspect_ai/solver/_chain.py +1 -1
  91. inspect_ai/solver/_fork.py +1 -1
  92. inspect_ai/solver/_multiple_choice.py +5 -22
  93. inspect_ai/solver/_plan.py +2 -2
  94. inspect_ai/solver/_task_state.py +26 -88
  95. inspect_ai/solver/_transcript.py +6 -7
  96. inspect_ai/tool/_json_rpc_helpers.py +45 -17
  97. inspect_ai/tool/_mcp/_mcp.py +8 -5
  98. inspect_ai/tool/_mcp/_sandbox.py +8 -2
  99. inspect_ai/tool/_mcp/server.py +3 -1
  100. inspect_ai/tool/_tool_call.py +4 -1
  101. inspect_ai/tool/_tool_support_helpers.py +51 -12
  102. inspect_ai/tool/_tools/_bash_session.py +190 -68
  103. inspect_ai/tool/_tools/_computer/_computer.py +25 -1
  104. inspect_ai/tool/_tools/_execute.py +4 -1
  105. inspect_ai/tool/_tools/_text_editor.py +4 -3
  106. inspect_ai/tool/_tools/_web_browser/_web_browser.py +10 -3
  107. inspect_ai/util/__init__.py +16 -0
  108. inspect_ai/util/_anyio.py +11 -0
  109. inspect_ai/util/_collect.py +50 -0
  110. inspect_ai/util/_limit.py +393 -0
  111. inspect_ai/util/_limited_conversation.py +57 -0
  112. inspect_ai/util/_span.py +58 -0
  113. inspect_ai/util/_subtask.py +27 -42
  114. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/METADATA +1 -1
  115. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/RECORD +120 -134
  116. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/WHEEL +1 -1
  117. inspect_ai/_display/core/group.py +0 -79
  118. inspect_ai/solver/_limit.py +0 -39
  119. inspect_ai/tool/_tools/_computer/_resources/Dockerfile +0 -102
  120. inspect_ai/tool/_tools/_computer/_resources/README.md +0 -30
  121. inspect_ai/tool/_tools/_computer/_resources/entrypoint/entrypoint.sh +0 -18
  122. inspect_ai/tool/_tools/_computer/_resources/entrypoint/novnc_startup.sh +0 -20
  123. inspect_ai/tool/_tools/_computer/_resources/entrypoint/x11vnc_startup.sh +0 -48
  124. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xfce_startup.sh +0 -13
  125. inspect_ai/tool/_tools/_computer/_resources/entrypoint/xvfb_startup.sh +0 -48
  126. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/globalStorage/state.vscdb +0 -0
  127. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/Code/User/settings.json +0 -9
  128. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-panel.xml +0 -61
  129. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-screensaver.xml +0 -10
  130. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml +0 -91
  131. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Firefox Web Browser.desktop +0 -10
  132. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Terminal.desktop +0 -10
  133. inspect_ai/tool/_tools/_computer/_resources/image_home_dir/Desktop/Visual Studio Code.desktop +0 -10
  134. inspect_ai/tool/_tools/_computer/_resources/tool/.pylintrc +0 -8
  135. inspect_ai/tool/_tools/_computer/_resources/tool/.vscode/settings.json +0 -12
  136. inspect_ai/tool/_tools/_computer/_resources/tool/_args.py +0 -78
  137. inspect_ai/tool/_tools/_computer/_resources/tool/_constants.py +0 -22
  138. inspect_ai/tool/_tools/_computer/_resources/tool/_logger.py +0 -22
  139. inspect_ai/tool/_tools/_computer/_resources/tool/_run.py +0 -42
  140. inspect_ai/tool/_tools/_computer/_resources/tool/_tool_result.py +0 -33
  141. inspect_ai/tool/_tools/_computer/_resources/tool/_x11_client.py +0 -341
  142. inspect_ai/tool/_tools/_computer/_resources/tool/computer_tool.py +0 -141
  143. inspect_ai/tool/_tools/_computer/_resources/tool/pyproject.toml +0 -65
  144. inspect_ai/tool/_tools/_computer/_resources/tool/requirements.txt +0 -0
  145. inspect_ai/tool/_tools/_computer/test_args.py +0 -151
  146. /inspect_ai/{tool/_tools/_computer/_resources/tool/__init__.py → _view/www/src/app/log-view/tabs/ModelsTab.module.css} +0 -0
  147. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/entry_points.txt +0 -0
  148. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/licenses/LICENSE +0 -0
  149. {inspect_ai-0.3.92.dist-info → inspect_ai-0.3.94.dist-info}/top_level.txt +0 -0
@@ -7,13 +7,17 @@ It includes definitions for JSON-RPC request and response models, as well as fun
7
7
  from textwrap import dedent
8
8
  from typing import Type
9
9
 
10
+ import semver
11
+
10
12
  from inspect_ai._util.error import PrerequisiteError
13
+ from inspect_ai.tool._tool import ToolError
11
14
  from inspect_ai.util import sandbox_with
12
15
  from inspect_ai.util._sandbox.environment import SandboxEnvironment
13
16
 
14
17
  from ._json_rpc_helpers import (
15
18
  BaseModelT,
16
19
  JSONRPCParamsType,
20
+ JSONRPCServerErrorMapper,
17
21
  JSONRPCTransport,
18
22
  ScalarT,
19
23
  _rpc_call_description,
@@ -29,7 +33,7 @@ async def exec_scalar_request(
29
33
  method: str,
30
34
  params: JSONRPCParamsType,
31
35
  result_type: Type[ScalarT],
32
- timeout: int | None = None,
36
+ timeout: int,
33
37
  user: str | None = None,
34
38
  ) -> ScalarT:
35
39
  return await scalar_request(
@@ -37,6 +41,7 @@ async def exec_scalar_request(
37
41
  params,
38
42
  result_type,
39
43
  transport=ToolSupportSandboxTransport(sandbox, timeout, user),
44
+ server_error_mapper=ToolSupportServerErrorMapper(),
40
45
  )
41
46
 
42
47
 
@@ -45,7 +50,7 @@ async def exec_model_request(
45
50
  method: str,
46
51
  params: JSONRPCParamsType,
47
52
  result_type: Type[BaseModelT],
48
- timeout: int | None = None,
53
+ timeout: int,
49
54
  user: str | None = None,
50
55
  ) -> BaseModelT:
51
56
  return await model_request(
@@ -53,6 +58,7 @@ async def exec_model_request(
53
58
  params,
54
59
  result_type,
55
60
  transport=ToolSupportSandboxTransport(sandbox, timeout, user),
61
+ server_error_mapper=ToolSupportServerErrorMapper(),
56
62
  )
57
63
 
58
64
 
@@ -60,7 +66,7 @@ async def exec_notification(
60
66
  sandbox: SandboxEnvironment,
61
67
  method: str,
62
68
  params: JSONRPCParamsType,
63
- timeout: int | None = None,
69
+ timeout: int,
64
70
  user: str | None = None,
65
71
  ) -> None:
66
72
  return await notification_helper(
@@ -68,19 +74,33 @@ async def exec_notification(
68
74
  )
69
75
 
70
76
 
77
+ class ToolSupportServerErrorMapper(JSONRPCServerErrorMapper):
78
+ def __call__(
79
+ self, code: int, message: str, method: str, params: JSONRPCParamsType
80
+ ) -> Exception:
81
+ """Map `inspect-tool-support` defined custom codes to an exception."""
82
+ match code:
83
+ case -32099: # This is a ToolException from the container
84
+ return ToolError(message)
85
+ case -32098: # This is an unexpected exception inside the container
86
+ return RuntimeError(message)
87
+ case _:
88
+ return RuntimeError(message)
89
+
90
+
71
91
  class ToolSupportSandboxTransport(JSONRPCTransport):
72
92
  """
73
- A transport callable that uses a sandbox for RPC communication.
93
+ A transport that uses a sandbox for RPC communication.
74
94
 
75
- This class implements the TransportCallable protocol and encapsulates
76
- the sandbox, timeout, and user parameters needed for sandbox-based
77
- RPC communication.
95
+ This class implements the TransportCallable protocol and encapsulates the
96
+ sandbox, timeout, and user parameters needed for sandbox-based RPC
97
+ communication.
78
98
  """
79
99
 
80
100
  def __init__(
81
101
  self,
82
102
  sandbox: SandboxEnvironment,
83
- timeout: int | None = None,
103
+ timeout: int,
84
104
  user: str | None = None,
85
105
  ):
86
106
  """
@@ -128,13 +148,32 @@ class ToolSupportSandboxTransport(JSONRPCTransport):
128
148
 
129
149
  SANDBOX_CLI = "inspect-tool-support"
130
150
  INSPECT_TOOL_SUPPORT_IMAGE_DOCKERHUB = "aisiuk/inspect-tool-support"
151
+ FIRST_PUBLISHED_VERSION = semver.Version.parse("0.1.6")
152
+ MIN_SUPPORTED_VERSION = FIRST_PUBLISHED_VERSION
153
+ MIN_NON_DEPRECATED_VERSION = semver.Version.parse("1.0.0")
154
+
155
+
156
+ async def _get_sandbox_tool_support_version(
157
+ sandbox: SandboxEnvironment,
158
+ ) -> semver.Version:
159
+ try:
160
+ return semver.Version.parse(
161
+ await exec_scalar_request(sandbox, "version", {}, str, 5)
162
+ )
163
+ except RuntimeError as rte:
164
+ if "-32601" in str(rte):
165
+ # The container doesn't even have a version method. The first version
166
+ # published was 0.1.6, so we'll have to assume it was that old.
167
+ return FIRST_PUBLISHED_VERSION
168
+ raise rte
131
169
 
132
170
 
133
- async def tool_container_sandbox(
171
+ async def tool_support_sandbox(
134
172
  tool_name: str, *, sandbox_name: str | None = None
135
- ) -> SandboxEnvironment:
173
+ ) -> tuple[SandboxEnvironment, semver.Version]:
136
174
  if sb := await sandbox_with(SANDBOX_CLI, True, name=sandbox_name):
137
- return sb
175
+ current_version = await _get_sandbox_tool_support_version(sb)
176
+ return (sb, current_version)
138
177
 
139
178
  # This sort of programmatic sentence building will not cut it if we ever
140
179
  # support other languages.
@@ -160,7 +199,7 @@ async def tool_container_sandbox(
160
199
 
161
200
 
162
201
  def create_sandbox_transport(
163
- sandbox: SandboxEnvironment, timeout: int | None = None, user: str | None = None
202
+ sandbox: SandboxEnvironment, timeout: int, user: str | None = None
164
203
  ) -> JSONRPCTransport:
165
204
  """
166
205
  Create a transport callable that uses a sandbox for RPC communication.
@@ -1,20 +1,27 @@
1
- from pydantic import BaseModel, Field, RootModel
1
+ from textwrap import dedent
2
+ from typing import Annotated, Literal
3
+
4
+ from pydantic import BaseModel, Discriminator, Field, RootModel
5
+ from semver import Version
2
6
  from shortuuid import uuid
3
7
 
8
+ from inspect_ai._util.error import PrerequisiteError
4
9
  from inspect_ai.tool import ToolResult
5
- from inspect_ai.tool._tool_support_helpers import (
6
- exec_model_request,
7
- tool_container_sandbox,
8
- )
9
10
  from inspect_ai.util import StoreModel, store_as
11
+ from inspect_ai.util._sandbox.environment import SandboxEnvironment
10
12
 
11
13
  from .._tool import Tool, ToolParsingError, tool
12
- from .._tool_call import ToolCall, ToolCallContent, ToolCallView, ToolCallViewer
13
-
14
+ from .._tool_support_helpers import (
15
+ exec_model_request,
16
+ exec_scalar_request,
17
+ tool_support_sandbox,
18
+ )
14
19
 
15
20
  # These models are cloned from the container code. Until we decide to create
16
21
  # a package that is shared between the inspect and tool-container codebases, we'll
17
22
  # just have to live with it.
23
+
24
+
18
25
  class NewSessionResult(BaseModel):
19
26
  session_name: str
20
27
 
@@ -23,106 +30,221 @@ class BashRestartResult(BaseModel):
23
30
  pass
24
31
 
25
32
 
26
- class BashCommandResult(BaseModel):
27
- status: int
28
- stdout: str
29
- stderr: str
33
+ class BashSessionStore(StoreModel):
34
+ session_id: str = Field(default_factory=str)
35
+ sandbox: SandboxEnvironment | None = Field(default=None)
30
36
 
31
37
 
32
- class BashResult(RootModel[BashRestartResult | BashCommandResult]):
33
- pass
38
+ # Action-specific parameter models
34
39
 
35
40
 
36
- class BashSessionStore(StoreModel):
37
- session_id: str = Field(default_factory=str)
41
+ class TypeParams(BaseModel):
42
+ action: Literal["type"] = "type"
43
+ input: str
38
44
 
39
45
 
40
- # custom viewer for bash
41
- def code_viewer(language: str, code_param: str) -> ToolCallViewer:
42
- def viewer(tool_call: ToolCall) -> ToolCallView:
43
- code = tool_call.arguments.get(code_param, None)
44
- code = (code or tool_call.function).strip()
45
- call = ToolCallContent(
46
- title=language,
47
- format="markdown",
48
- content=f"```{language}\n" + code + "\n```\n",
49
- )
50
- return ToolCallView(call=call)
46
+ class TypeSubmitParams(BaseModel):
47
+ action: Literal["type_submit"] = "type_submit"
48
+ input: str
49
+
50
+
51
+ class RestartParams(BaseModel):
52
+ action: Literal["restart"] = "restart"
53
+
51
54
 
52
- return viewer
55
+ class ReadParams(BaseModel):
56
+ action: Literal["read"] = "read"
53
57
 
54
58
 
55
- @tool(viewer=code_viewer("bash", "command"))
56
- def bash_session(*, timeout: int | None = None, instance: str | None = uuid()) -> Tool:
57
- """Bash shell session command execution tool.
59
+ class InterruptParams(BaseModel):
60
+ action: Literal["interrupt"] = "interrupt"
58
61
 
59
- Execute bash shell commands in a long running session using a sandbox environment (e.g. "docker").
62
+
63
+ class BashSessionParams(
64
+ RootModel[
65
+ TypeParams | TypeSubmitParams | RestartParams | ReadParams | InterruptParams
66
+ ]
67
+ ):
68
+ root: Annotated[
69
+ TypeParams | TypeSubmitParams | RestartParams | ReadParams | InterruptParams,
70
+ Discriminator("action"),
71
+ ]
72
+
73
+
74
+ DEFAULT_WAIT_FOR_OUTPUT = 30
75
+ DEFAULT_IDLE_TIME = 0.5
76
+ # this is how long we're willing to wait for the basic RPC call overhead.
77
+ TRANSPORT_TIMEOUT = 5
78
+
79
+
80
+ @tool()
81
+ def bash_session(
82
+ *,
83
+ timeout: int | None = None, # default is wait_for_output + 5 seconds
84
+ wait_for_output: int | None = None, # default is 30 seconds
85
+ instance: str | None = uuid(),
86
+ ) -> Tool:
87
+ """Interactive bash shell session tool.
88
+
89
+ Interact with a bash shell in a long running session using a sandbox
90
+ environment (e.g. "docker"). This tool allows sending text to the shell,
91
+ which could be a command followed by a newline character or any other input
92
+ text such as the response to a password prompt.
60
93
 
61
94
  By default, a separate bash process is created within the sandbox for each
62
- call to `bash_session()`. You can modify this behavior by passing `instance=None`
63
- (which will result in a single bash process for the entire sample) or use other
64
- `instance` values that implement another scheme).
95
+ call to `bash_session()`. You can modify this behavior by passing
96
+ `instance=None` (which will result in a single bash process for the entire
97
+ sample) or use other `instance` values that implement another scheme.
65
98
 
66
99
  See complete documentation at <https://inspect.aisi.org.uk/tools-standard.html#sec-bash-session>.
67
100
 
68
101
  Args:
69
102
  timeout: Timeout (in seconds) for command.
103
+ wait_for_output: Maximum time (in seconds) to wait for output. If no
104
+ output is received within this period, the function will return an
105
+ empty string. The model may need to make multiple tool calls to obtain
106
+ all output from a given command.
70
107
  instance: Instance id (each unique instance id has its own bash process)
71
108
 
72
109
  Returns:
73
- String with command output (stdout) or command error (stderr).
110
+ String with output from the shell.
74
111
  """
112
+ wait_for_output = wait_for_output or DEFAULT_WAIT_FOR_OUTPUT
113
+ min_timeout = wait_for_output + TRANSPORT_TIMEOUT
114
+ if timeout is None:
115
+ timeout = min_timeout
116
+ elif timeout < min_timeout:
117
+ raise ValueError(
118
+ f"Timeout must be at least {min_timeout} seconds, but got {timeout}."
119
+ )
75
120
 
76
121
  async def execute(
77
- command: str | None = None,
78
- restart: bool | None = None,
122
+ action: Literal["type", "type_submit", "restart", "read", "interrupt"],
123
+ input: str | None = None,
79
124
  ) -> ToolResult:
80
- """
81
- Use this function to execute bash commands.
125
+ r"""
126
+ Interact with a bash shell.
127
+
128
+ Interact with a bash shell by sending it input text and retrieving output
129
+ from it. There is no guarantee that all output will be returned in a
130
+ single call. Call this function multiple times to retrieve additional
131
+ output from the shell.
132
+
133
+ USAGE NOTES:
134
+ - Ensure that the shell is at a command prompt (typically when the
135
+ output ends in "$ " or "# ") before submitting a new command.
136
+ - Control characters must be sent as Unicode escape sequences (e.g., use
137
+ "\u0003" for Ctrl+C/ETX, "\u0004" for Ctrl+D/EOT). The literal string
138
+ "Ctrl+C" will not be interpreted as a control character.
139
+ - Use the "read" action to retrieve output from the shell without
140
+ sending any input. This is useful for long-running commands that
141
+ produce output over time. The "read" action will return any new output
142
+ since the last call.
143
+ - If a long-running command is in progress, additional input to execute
144
+ a new command will not be processed until the previous completes. To
145
+ abort a long-running command, use the "interrupt" action:
146
+ `bash_session(action="interrupt")`
147
+
148
+ Example use case:
149
+ - For a short-running command with a nominal amount of output, a single
150
+ call may suffice.
151
+ ```
152
+ bash_session(action="type_submit", input="echo foo") -> "foo\nuser@host:/# "
153
+ ```
154
+ - For a long-running command with output over time, multiple calls are needed.
155
+ ```
156
+ bash_session(action="type_submit", input="tail -f /tmp/foo.log") -> <some output>
157
+ bash_session(action="read") -> <more output>
158
+ # Send interrupt (Ctrl+C)
159
+ bash_session(action="interrupt") -> "<final output>^Cuser@host:/# "
160
+ ```
161
+ - Interactive command awaiting more input from the user.
162
+ ```
163
+ bash_session(action="type_submit", input="ssh fred@foo.com") -> "foo.com's password: "
164
+ bash_session(action="type_submit", input="secret") -> "fred@foo.com:~$ "
165
+ ```
82
166
 
83
167
  Args:
84
- command: The bash command to run. Required unless the tool is being restarted.
85
- restart: Specifying true will restart this tool. Otherwise, leave this unspecified.
168
+ action: The action to execute:
169
+ - "type": Send input without a return key
170
+ - "type_submit": Send input followed by a return key
171
+ - "read": Read any new output without sending input
172
+ - "interrupt": Send a Ctrl+C (ETX character) to interrupt the current process
173
+ - "restart": Restart the bash session
174
+ input: The input to send to the shell.
175
+ Required for the "type" action. Optional for the "type_submit" action. Must
176
+ not be provided for "restart", "read", or "interrupt" actions.
86
177
 
87
178
  Returns:
88
- The output of the command.
179
+ The accumulated output of the shell.
89
180
  """
90
- if not ((command is None) ^ (restart is None)):
91
- raise ToolParsingError(
92
- "Either 'command' or 'restart' must be specified, but not both."
93
- )
94
- params: dict[str, object] = {"command": command, "restart": restart}
181
+ # Validate parameters based on action
182
+ match action:
183
+ case "type":
184
+ if input is None:
185
+ raise ToolParsingError(
186
+ f"'input' is required for '{action}' action."
187
+ )
188
+ case "restart" | "read" | "interrupt":
189
+ if input is not None:
190
+ raise ToolParsingError(
191
+ f"Do not provide 'input' with '{action}' action."
192
+ )
95
193
 
96
- sandbox = await tool_container_sandbox("bash session")
97
194
  store = store_as(BashSessionStore, instance=instance)
195
+ sandbox = await _get_sandbox(store)
98
196
 
99
197
  if not store.session_id:
100
198
  store.session_id = (
101
199
  await exec_model_request(
102
- sandbox=sandbox,
103
- method="bash_session_new_session",
104
- params={},
105
- result_type=NewSessionResult,
106
- timeout=timeout,
200
+ sandbox,
201
+ "bash_session_new_session",
202
+ {},
203
+ NewSessionResult,
204
+ TRANSPORT_TIMEOUT,
107
205
  )
108
206
  ).session_name
109
207
 
110
- params["session_name"] = store.session_id
208
+ timing: dict[str, object] = {
209
+ "wait_for_output": wait_for_output,
210
+ "idle_timeout": DEFAULT_IDLE_TIME,
211
+ }
212
+ action_specific: dict[str, dict[str, object]] = {
213
+ "type": {"input": input, **timing},
214
+ "type_submit": {"input": f"{input}\n", **timing},
215
+ "interrupt": {"input": "\u0003", **timing},
216
+ "read": timing,
217
+ "restart": {"restart": True},
218
+ }
219
+
220
+ result = await exec_scalar_request(
221
+ sandbox,
222
+ "bash_session",
223
+ {"session_name": store.session_id, **(action_specific[action])},
224
+ str,
225
+ timeout,
226
+ )
111
227
 
112
- result = (
113
- await exec_model_request(
114
- sandbox=sandbox,
115
- method="bash_session",
116
- params=params,
117
- result_type=BashResult,
118
- timeout=timeout,
119
- )
120
- ).root
228
+ # Return the appropriate response
229
+ return (
230
+ "Bash session restarted."
231
+ if isinstance(result, BashRestartResult)
232
+ else result
233
+ )
121
234
 
122
- if isinstance(result, BashRestartResult):
123
- return "Bash session restarted."
235
+ return execute
124
236
 
125
- # return output (including stderr if any)
126
- return f"{result.stderr}\n{result.stdout}" if result.stderr else result.stdout
127
237
 
128
- return execute
238
+ async def _get_sandbox(store: BashSessionStore) -> SandboxEnvironment:
239
+ if not store.sandbox:
240
+ (sandbox, sandbox_version) = await tool_support_sandbox("bash session")
241
+ required_version = Version.parse("1.0.0")
242
+ if sandbox_version < required_version:
243
+ raise PrerequisiteError(
244
+ dedent(f"""
245
+ The 'inspect-tool-support' version in your container is '{sandbox_version}'. The 'bash_session' tool requires version '{required_version}' or newer. Please update your container image to the latest version of 'inspect-tool-support'.
246
+ """).strip()
247
+ )
248
+ store.sandbox = sandbox
249
+
250
+ return store.sandbox
@@ -6,7 +6,31 @@ from inspect_ai.tool._tool import TOOL_INIT_MODEL_INPUT, ToolParsingError
6
6
  from inspect_ai.tool._tool_call import ToolCallModelInput, ToolCallModelInputHints
7
7
 
8
8
  from . import _common as common
9
- from ._resources.tool._constants import Action
9
+
10
+ # this is duplicated from ._resources.tool._constants import Action
11
+ # changes should be synchronized!
12
+
13
+ Action = Literal[
14
+ "key",
15
+ "hold_key",
16
+ "type",
17
+ "cursor_position",
18
+ "mouse_move",
19
+ "left_mouse_down",
20
+ "left_mouse_up",
21
+ "left_click",
22
+ "left_click_drag",
23
+ "right_click",
24
+ "middle_click",
25
+ "back_click",
26
+ "forward_click",
27
+ "double_click",
28
+ "triple_click",
29
+ "scroll",
30
+ "wait",
31
+ "screenshot",
32
+ ]
33
+
10
34
 
11
35
  ActionFunction = Callable[[str], ToolResult | Awaitable[ToolResult]]
12
36
 
@@ -96,7 +96,10 @@ def python(
96
96
  The output of the Python code.
97
97
  """
98
98
  result = await sandbox_env(sandbox).exec(
99
- cmd=["python3"], input=code, timeout=timeout, user=user
99
+ cmd=["bash", "--login", "-c", "python3 -"],
100
+ input=code,
101
+ timeout=timeout,
102
+ user=user,
100
103
  )
101
104
  # return output (including stderr if any)
102
105
  output = ""
@@ -6,7 +6,7 @@ from pydantic import BaseModel, Discriminator, RootModel
6
6
  from inspect_ai.tool import ToolResult
7
7
  from inspect_ai.tool._tool_support_helpers import (
8
8
  exec_scalar_request,
9
- tool_container_sandbox,
9
+ tool_support_sandbox,
10
10
  )
11
11
 
12
12
  from .._tool import Tool, tool
@@ -70,12 +70,13 @@ def text_editor(timeout: int | None = None, user: str | None = None) -> Tool:
70
70
  that a change made to a file by one Subtask will be visible to another Subtask.
71
71
 
72
72
  Args:
73
- timeout: Timeout (in seconds) for command.
73
+ timeout: Timeout (in seconds) for command. Defaults to 180 if not provided.
74
74
  user: User to execute commands as.
75
75
 
76
76
  Returns:
77
77
  String with command output (stdout) or command error (stderr).
78
78
  """
79
+ timeout = timeout or 180
79
80
 
80
81
  async def execute(
81
82
  command: Literal["view", "create", "str_replace", "insert", "undo_edit"],
@@ -101,7 +102,7 @@ def text_editor(timeout: int | None = None, user: str | None = None) -> Tool:
101
102
  Returns:
102
103
  The output of the command.
103
104
  """
104
- sandbox = await tool_container_sandbox("editor")
105
+ (sandbox, _) = await tool_support_sandbox("editor")
105
106
 
106
107
  # Create a dictionary of the parameters
107
108
  params = {
@@ -10,7 +10,7 @@ from inspect_ai.tool._tool_call import ToolCall, ToolCallContent, ToolCallView
10
10
  from inspect_ai.tool._tool_info import parse_tool_info
11
11
  from inspect_ai.tool._tool_support_helpers import (
12
12
  exec_model_request,
13
- tool_container_sandbox,
13
+ tool_support_sandbox,
14
14
  )
15
15
  from inspect_ai.tool._tool_with import tool_with
16
16
  from inspect_ai.util._store_model import StoreModel, store_as
@@ -397,8 +397,10 @@ def web_browser_refresh(instance: str | None = None) -> Tool:
397
397
  async def _web_browser_cmd(
398
398
  tool_name: str, instance: str | None, params: dict[str, object]
399
399
  ) -> ToolResult:
400
+ # TODO: Is it worth it to plumb this down from the @tool?
401
+ timeout = 180
400
402
  try:
401
- sandbox_env = await tool_container_sandbox("web browser")
403
+ (sandbox_env, _) = await tool_support_sandbox("web browser")
402
404
  except PrerequisiteError as e:
403
405
  # The user may have the old, incompatible, sandbox. If so, use that and
404
406
  # execute the old compatible code.
@@ -419,13 +421,18 @@ async def _web_browser_cmd(
419
421
  method="web_new_session",
420
422
  params={"headful": False},
421
423
  result_type=NewSessionResult,
424
+ timeout=timeout,
422
425
  )
423
426
  ).session_name
424
427
 
425
428
  params["session_name"] = store.session_id
426
429
 
427
430
  crawler_result = await exec_model_request(
428
- sandbox=sandbox_env, method=tool_name, params=params, result_type=CrawlerResult
431
+ sandbox=sandbox_env,
432
+ method=tool_name,
433
+ params=params,
434
+ result_type=CrawlerResult,
435
+ timeout=timeout,
429
436
  )
430
437
  if crawler_result.error and crawler_result.error.strip() != "":
431
438
  raise ToolError(crawler_result.error)
@@ -1,6 +1,14 @@
1
1
  from inspect_ai._util.registry import RegistryType, registry_create
2
2
  from inspect_ai._util.trace import trace_action, trace_message
3
+ from inspect_ai.util._limit import (
4
+ Limit,
5
+ LimitExceededError,
6
+ apply_limits,
7
+ message_limit,
8
+ token_limit,
9
+ )
3
10
 
11
+ from ._collect import collect
4
12
  from ._concurrency import concurrency
5
13
  from ._console import input_screen
6
14
  from ._display import DisplayType, display_counter, display_type
@@ -21,6 +29,7 @@ from ._sandbox import (
21
29
  sandbox_with,
22
30
  sandboxenv,
23
31
  )
32
+ from ._span import span
24
33
  from ._store import Store, store
25
34
  from ._store_model import StoreModel, store_as
26
35
  from ._subprocess import (
@@ -31,6 +40,7 @@ from ._subtask import Subtask, subtask
31
40
  from ._throttle import throttle
32
41
 
33
42
  __all__ = [
43
+ "apply_limits",
34
44
  "ExecResult",
35
45
  "concurrency",
36
46
  "DisplayType",
@@ -42,9 +52,12 @@ __all__ = [
42
52
  "JSONType",
43
53
  "JSONSchema",
44
54
  "json_schema",
55
+ "Limit",
56
+ "message_limit",
45
57
  "OutputLimitExceededError",
46
58
  "resource",
47
59
  "subprocess",
60
+ "LimitExceededError",
48
61
  "SandboxEnvironment",
49
62
  "SandboxEnvironmentConfigType",
50
63
  "SandboxEnvironmentLimits",
@@ -60,9 +73,12 @@ __all__ = [
60
73
  "store",
61
74
  "StoreModel",
62
75
  "store_as",
76
+ "span",
77
+ "collect",
63
78
  "Subtask",
64
79
  "subtask",
65
80
  "throttle",
81
+ "token_limit",
66
82
  "trace_action",
67
83
  "trace_message",
68
84
  "RegistryType",
inspect_ai/util/_anyio.py CHANGED
@@ -1,6 +1,10 @@
1
1
  import itertools
2
2
  import sys
3
3
 
4
+ import anyio
5
+
6
+ from inspect_ai._util._async import current_async_backend
7
+
4
8
  if sys.version_info < (3, 11):
5
9
  from exceptiongroup import ExceptionGroup
6
10
 
@@ -36,3 +40,10 @@ def _flatten_exception(exc: Exception) -> list[Exception]:
36
40
  ]
37
41
 
38
42
  return maybe_this_exception + other_exceptions
43
+
44
+
45
+ def safe_current_task_id() -> int | None:
46
+ if current_async_backend() is not None:
47
+ return anyio.get_current_task().id
48
+ else:
49
+ return None