hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/tools/playwright.py CHANGED
@@ -84,6 +84,9 @@ class PlaywrightTool(BaseTool):
84
84
  code=INVALID_PARAMS, message="url parameter is required for navigate"
85
85
  )
86
86
  )
87
+ # Guard against pydantic FieldInfo default leaking through
88
+ if not isinstance(wait_for_load_state, str):
89
+ wait_for_load_state = None
87
90
  result = await self.navigate(url, wait_for_load_state or "networkidle")
88
91
 
89
92
  elif action == "screenshot":
@@ -179,11 +182,16 @@ class PlaywrightTool(BaseTool):
179
182
  if self._browser is None:
180
183
  raise RuntimeError("Failed to connect to remote browser")
181
184
 
182
- # Use existing context or create new one
185
+ # Reuse existing context and page where possible to avoid spawning new windows
183
186
  contexts = self._browser.contexts
184
187
  if contexts:
185
188
  self._browser_context = contexts[0]
189
+ # Prefer the first existing page to keep using the already visible window/tab
190
+ existing_pages = self._browser_context.pages
191
+ if existing_pages:
192
+ self.page = existing_pages[0]
186
193
  else:
194
+ # As a fallback, create a new context
187
195
  self._browser_context = await self._browser.new_context(
188
196
  viewport={"width": 1920, "height": 1080},
189
197
  ignore_https_errors=True,
@@ -225,7 +233,14 @@ class PlaywrightTool(BaseTool):
225
233
  if self._browser_context is None:
226
234
  raise RuntimeError("Browser context failed to initialize")
227
235
 
228
- self.page = await self._browser_context.new_page()
236
+ # Reuse existing page if available (for CDP connections), otherwise create new one
237
+ pages = self._browser_context.pages
238
+ if pages:
239
+ self.page = pages[0]
240
+ logger.info("Reusing existing browser page")
241
+ else:
242
+ self.page = await self._browser_context.new_page()
243
+ logger.info("Created new browser page")
229
244
  logger.info("Playwright browser launched successfully")
230
245
 
231
246
  async def navigate(
@@ -280,7 +295,7 @@ class PlaywrightTool(BaseTool):
280
295
 
281
296
  try:
282
297
  # Always return base64 encoded screenshot as ToolResult
283
- screenshot_bytes = await self.page.screenshot(full_page=True)
298
+ screenshot_bytes = await self.page.screenshot(full_page=False)
284
299
  import base64
285
300
 
286
301
  screenshot_b64 = base64.b64encode(screenshot_bytes).decode()
hud/tools/shell.py ADDED
@@ -0,0 +1,308 @@
1
+ """
2
+ Shell tool implementation conforming to OpenAI's shell tool specification.
3
+ https://platform.openai.com/docs/guides/tools-shell
4
+
5
+ Key features:
6
+ - Auto-restart on error (no manual restart command)
7
+ - Dynamic timeout via timeout_ms from agent
8
+ - Dynamic max_output_length from agent (passed back, not truncated locally)
9
+ - Output conforms to shell_call_output format
10
+ """
11
+
12
+ import asyncio
13
+ import os
14
+ import sys
15
+ from dataclasses import dataclass
16
+ from typing import Any, Literal
17
+
18
+ from .types import ToolError
19
+
20
+
21
+ @dataclass
22
+ class ShellCallOutcome:
23
+ """Outcome of a shell command execution."""
24
+
25
+ type: Literal["exit", "timeout"]
26
+ exit_code: int | None = None
27
+
28
+ def to_dict(self) -> dict:
29
+ if self.type == "timeout":
30
+ return {"type": "timeout"}
31
+ return {"type": "exit", "exit_code": self.exit_code}
32
+
33
+
34
@dataclass
class ShellCommandOutput:
    """Captured stdout/stderr and the outcome of one shell command."""

    stdout: str
    stderr: str
    outcome: ShellCallOutcome

    def to_dict(self) -> dict:
        """Serialize to a plain dict, with the outcome nested as its own dict."""
        outcome_payload = self.outcome.to_dict()
        return dict(
            stdout=self.stdout,
            stderr=self.stderr,
            outcome=outcome_payload,
        )
48
+
49
+
50
@dataclass
class ShellResult:
    """Aggregate result of a shell tool call, conforming to shell_call_output format."""

    output: list[ShellCommandOutput]
    max_output_length: int | None = None

    def to_dict(self) -> dict:
        """Serialize every command output; ``max_output_length`` is added only when set."""
        serialized: dict[str, Any] = {
            "output": [entry.to_dict() for entry in self.output],
        }
        if self.max_output_length is None:
            return serialized
        serialized["max_output_length"] = self.max_output_length
        return serialized
64
+
65
+
66
class _BashSession:
    """A persistent bash process that commands are piped into one at a time.

    Each command is written to the shell's stdin followed by an ``echo`` of a
    sentinel plus ``$?``. Output is polled from the stream buffers until the
    sentinel line shows up, the shell itself exits, or the timeout elapses.
    """

    _started: bool
    _process: asyncio.subprocess.Process

    command: str = "/bin/bash"
    _output_delay: float = 0.2  # seconds
    _sentinel: str = "<<exit>>"

    def __init__(self) -> None:
        self._started = False
        self._timed_out = False

    async def start(self) -> None:
        """Spawn the shell process; does nothing if the session already started."""
        if self._started:
            await asyncio.sleep(0)
            return

        # preexec_fn and user demotion only available on Unix
        preexec_fn = None
        if sys.platform != "win32":

            def demote() -> None:
                # This only runs in the child process (Unix only)
                # NOTE(review): setgid/setuid to 1000 need sufficient privileges;
                # presumably the spawn fails when the parent may not switch ids.
                os.setsid()  # type: ignore[attr-defined]
                os.setgid(1000)  # type: ignore[attr-defined]
                os.setuid(1000)  # type: ignore[attr-defined]

            preexec_fn = demote

        self._process = await asyncio.create_subprocess_shell(  # noqa: S604
            self.command,
            preexec_fn=preexec_fn,
            shell=True,
            bufsize=0,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )

        self._started = True
        self._timed_out = False

    def stop(self) -> None:
        """Terminate the bash shell if it was started and is still running."""
        if not self._started:
            return
        if self._process.returncode is not None:
            return
        try:
            self._process.terminate()
        except ProcessLookupError:
            # The process went away between the returncode check and the
            # signal; there is nothing left to terminate.
            pass

    def is_alive(self) -> bool:
        """Check if the session is alive and usable."""
        return self._started and self._process.returncode is None and not self._timed_out

    async def run(self, command: str, timeout_ms: int | None = None) -> ShellCommandOutput:
        """Execute a command in the bash shell and collect its output.

        Args:
            command: Shell command text to send to the running bash process.
            timeout_ms: Maximum time to wait in milliseconds. ``None`` or ``0``
                falls back to 120 seconds.

        Returns:
            A ShellCommandOutput whose outcome is ``exit`` (with the exit code
            reported by the shell, or the shell's own return code if the
            command terminated the shell) or ``timeout``.

        Raises:
            ToolError: If the session has not been started.
        """
        if not self._started:
            raise ToolError("Session has not started.")

        # Convert timeout from ms to seconds, default to 120 seconds
        timeout_sec = (timeout_ms / 1000.0) if timeout_ms else 120.0

        # we know these are not None because we created the process with PIPEs
        assert self._process.stdin
        assert self._process.stdout
        assert self._process.stderr

        # send command to the process
        self._process.stdin.write(command.encode() + f"; echo '{self._sentinel}'$?\n".encode())
        await self._process.stdin.drain()

        output = ""
        error = ""
        exit_code = None

        # read output from the process, until the sentinel is found
        try:
            async with asyncio.timeout(timeout_sec):
                while True:
                    # Snapshot liveness *before* sleeping so anything the shell
                    # wrote right before exiting has a full polling interval to
                    # reach the buffers.
                    shell_exited = self._process.returncode is not None
                    await asyncio.sleep(self._output_delay)
                    # if we read directly from stdout/stderr, it will wait forever for
                    # EOF. use the StreamReader buffer directly instead.
                    # errors="replace": the buffer may hold non-UTF-8 bytes or a
                    # multibyte character that is only partially received; a strict
                    # decode would abort the command and leave stale data behind.
                    output = self._process.stdout._buffer.decode(errors="replace")  # pyright: ignore[reportAttributeAccessIssue]
                    error = self._process.stderr._buffer.decode(errors="replace")  # pyright: ignore[reportAttributeAccessIssue]

                    sentinel_idx = output.find(self._sentinel)
                    if sentinel_idx != -1:
                        after_sentinel = output[sentinel_idx + len(self._sentinel) :]
                        newline_idx = after_sentinel.find("\n")
                        if newline_idx == -1 and not shell_exited:
                            # The sentinel line is not complete yet, so the exit
                            # code may still be in flight; keep polling instead
                            # of guessing.
                            continue
                        if newline_idx != -1:
                            exit_code_str = after_sentinel[:newline_idx].strip()
                        else:
                            exit_code_str = after_sentinel.strip()
                        try:
                            exit_code = int(exit_code_str)
                        except ValueError:
                            exit_code = 0
                        # strip the sentinel and exit code from output
                        output = output[:sentinel_idx]
                        break

                    if shell_exited:
                        # The command terminated the shell itself (e.g. `exit 3`),
                        # so the sentinel will never be printed. Report the
                        # shell's return code instead of waiting for the timeout.
                        exit_code = self._process.returncode
                        break
        except TimeoutError:
            self._timed_out = True
            # clear the buffers
            self._process.stdout._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
            self._process.stderr._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]

            return ShellCommandOutput(
                stdout=output,
                stderr=error,
                outcome=ShellCallOutcome(type="timeout"),
            )

        # drop a single trailing newline from each stream
        output = output.removesuffix("\n")
        error = error.removesuffix("\n")

        # clear the buffers so that the next output can be read correctly
        self._process.stdout._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
        self._process.stderr._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]

        return ShellCommandOutput(
            stdout=output,
            stderr=error,
            outcome=ShellCallOutcome(type="exit", exit_code=exit_code),
        )
196
+
197
+
198
class ShellTool:
    """
    A tool that allows the agent to run shell commands.
    Conforms to OpenAI's shell tool specification.

    Features:
    - Auto-restart on error (session automatically restarts if needed)
    - Dynamic timeout via timeout_ms parameter
    - Dynamic max_output_length (passed back to API, no local truncation)
    - Accepts several commands per call; they run sequentially in one bash session
    """

    _session: _BashSession | None

    def __init__(self) -> None:
        self._session = None

    async def _ensure_session(self) -> tuple[_BashSession, str | None]:
        """Ensure a working session exists, auto-restarting if needed.

        Returns:
            Tuple of (session, restart_message) where restart_message is set
            if the session was restarted due to an error.
        """
        restart_message = None

        old_session = self._session
        if old_session is not None and not old_session.is_alive():
            # Session exists but is dead - auto-restart
            if old_session._timed_out:
                restart_message = "Previous session timed out. Session auto-restarted."
            elif old_session._started and old_session._process.returncode is not None:
                # `_started` is checked first: `_process` only exists once
                # start() has succeeded.
                restart_message = (
                    f"Previous session exited with code {old_session._process.returncode}. "
                    "Session auto-restarted."
                )
            else:
                restart_message = "Previous session was not usable. Session auto-restarted."
            old_session.stop()
            self._session = None

        if self._session is None:
            # Publish the session only after start() succeeds, so a failed
            # spawn never leaves a half-initialized session behind.
            new_session = _BashSession()
            await new_session.start()
            self._session = new_session

        return self._session, restart_message

    async def __call__(
        self,
        commands: list[str] | None = None,
        timeout_ms: int | None = None,
        max_output_length: int | None = None,
        **kwargs: object,
    ) -> ShellResult:
        """
        Execute shell commands.

        Args:
            commands: List of shell commands to execute, run one after another.
                A single string is treated as a one-element list.
            timeout_ms: Optional timeout in milliseconds for each command.
            max_output_length: Optional max output length (passed back to API).
            **kwargs: Extra arguments are accepted and ignored.

        Returns:
            ShellResult conforming to shell_call_output format.

        Raises:
            ToolError: If no commands are provided.
        """
        if isinstance(commands, str):
            # Iterating a bare string would run each character as a command.
            commands = [commands]

        if not commands:
            raise ToolError("No commands provided.")

        session, restart_message = await self._ensure_session()
        outputs: list[ShellCommandOutput] = []

        # Note: OpenAI docs say commands can be executed concurrently,
        # but for a single bash session, we run them sequentially.
        # For true concurrency, you'd need multiple sessions or subprocess per command.
        for command in commands:
            # Check if session is still alive before each command
            if not session.is_alive():
                session, new_restart_msg = await self._ensure_session()
                if new_restart_msg:
                    restart_message = new_restart_msg

            try:
                result = await session.run(command, timeout_ms)

                # If we had a restart message, prepend it to this output's stderr
                if restart_message:
                    if result.stderr:
                        result.stderr = f"[SYSTEM: {restart_message}]\n{result.stderr}"
                    else:
                        result.stderr = f"[SYSTEM: {restart_message}]"
                    restart_message = None  # Only add once

                outputs.append(result)
            except Exception as e:
                # Command execution failed, add error output
                outputs.append(
                    ShellCommandOutput(
                        stdout="",
                        stderr=str(e),
                        outcome=ShellCallOutcome(type="exit", exit_code=1),
                    )
                )

        return ShellResult(
            output=outputs,
            max_output_length=max_output_length,
        )