hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/tools/playwright.py CHANGED
@@ -84,6 +84,9 @@ class PlaywrightTool(BaseTool):
84
84
  code=INVALID_PARAMS, message="url parameter is required for navigate"
85
85
  )
86
86
  )
87
+ # Guard against pydantic FieldInfo default leaking through
88
+ if not isinstance(wait_for_load_state, str):
89
+ wait_for_load_state = None
87
90
  result = await self.navigate(url, wait_for_load_state or "networkidle")
88
91
 
89
92
  elif action == "screenshot":
@@ -179,11 +182,16 @@ class PlaywrightTool(BaseTool):
179
182
  if self._browser is None:
180
183
  raise RuntimeError("Failed to connect to remote browser")
181
184
 
182
- # Use existing context or create new one
185
+ # Reuse existing context and page where possible to avoid spawning new windows
183
186
  contexts = self._browser.contexts
184
187
  if contexts:
185
188
  self._browser_context = contexts[0]
189
+ # Prefer the first existing page to keep using the already visible window/tab
190
+ existing_pages = self._browser_context.pages
191
+ if existing_pages:
192
+ self.page = existing_pages[0]
186
193
  else:
194
+ # As a fallback, create a new context
187
195
  self._browser_context = await self._browser.new_context(
188
196
  viewport={"width": 1920, "height": 1080},
189
197
  ignore_https_errors=True,
@@ -225,7 +233,14 @@ class PlaywrightTool(BaseTool):
225
233
  if self._browser_context is None:
226
234
  raise RuntimeError("Browser context failed to initialize")
227
235
 
228
- self.page = await self._browser_context.new_page()
236
+ # Reuse existing page if available (for CDP connections), otherwise create new one
237
+ pages = self._browser_context.pages
238
+ if pages:
239
+ self.page = pages[0]
240
+ logger.info("Reusing existing browser page")
241
+ else:
242
+ self.page = await self._browser_context.new_page()
243
+ logger.info("Created new browser page")
229
244
  logger.info("Playwright browser launched successfully")
230
245
 
231
246
  async def navigate(
@@ -280,7 +295,7 @@ class PlaywrightTool(BaseTool):
280
295
 
281
296
  try:
282
297
  # Always return base64 encoded screenshot as ToolResult
283
- screenshot_bytes = await self.page.screenshot(full_page=True)
298
+ screenshot_bytes = await self.page.screenshot(full_page=False)
284
299
  import base64
285
300
 
286
301
  screenshot_b64 = base64.b64encode(screenshot_bytes).decode()
hud/tools/shell.py ADDED
@@ -0,0 +1,308 @@
1
+ """
2
+ Shell tool implementation conforming to OpenAI's shell tool specification.
3
+ https://platform.openai.com/docs/guides/tools-shell
4
+
5
+ Key features:
6
+ - Auto-restart on error (no manual restart command)
7
+ - Dynamic timeout via timeout_ms from agent
8
+ - Dynamic max_output_length from agent (passed back, not truncated locally)
9
+ - Output conforms to shell_call_output format
10
+ """
11
+
12
+ import asyncio
13
+ import os
14
+ import sys
15
+ from dataclasses import dataclass
16
+ from typing import Any, Literal
17
+
18
+ from .types import ToolError
19
+
20
+
21
+ @dataclass
22
+ class ShellCallOutcome:
23
+ """Outcome of a shell command execution."""
24
+
25
+ type: Literal["exit", "timeout"]
26
+ exit_code: int | None = None
27
+
28
+ def to_dict(self) -> dict:
29
+ if self.type == "timeout":
30
+ return {"type": "timeout"}
31
+ return {"type": "exit", "exit_code": self.exit_code}
32
+
33
+
34
@dataclass
class ShellCommandOutput:
    """Captured streams and outcome of one executed shell command."""

    stdout: str
    stderr: str
    outcome: ShellCallOutcome

    def to_dict(self) -> dict:
        """Return a plain dict with both streams and the nested outcome dict."""
        serialized = dict(stdout=self.stdout, stderr=self.stderr)
        serialized["outcome"] = self.outcome.to_dict()
        return serialized
48
+
49
+
50
@dataclass
class ShellResult:
    """Aggregate result of a shell tool call, in shell_call_output form."""

    # One entry per executed command, in execution order.
    output: list[ShellCommandOutput]
    # Echoed back in the serialized result only when it is not None.
    max_output_length: int | None = None

    def to_dict(self) -> dict:
        """Return a plain dict; ``max_output_length`` is omitted when unset."""
        serialized: dict[str, Any] = {"output": [entry.to_dict() for entry in self.output]}
        if self.max_output_length is None:
            return serialized
        serialized["max_output_length"] = self.max_output_length
        return serialized
64
+
65
+
66
class _BashSession:
    """A bash subprocess that commands are written to over stdin.

    Each command is followed by an ``echo`` of a sentinel plus ``$?`` so that
    ``run`` can tell when the command has finished and what its exit status
    was. A session that has timed out is flagged and reported as not alive by
    ``is_alive``.
    """

    _started: bool
    _process: asyncio.subprocess.Process

    command: str = "/bin/bash"
    _output_delay: float = 0.2  # seconds between polls of the output buffers
    _sentinel: str = "<<exit>>"  # marker echoed after every command

    def __init__(self) -> None:
        self._started = False
        self._timed_out = False

    async def start(self) -> None:
        """Spawn the bash subprocess; does nothing if already started."""
        if self._started:
            await asyncio.sleep(0)
            return

        # preexec_fn and user demotion only available on Unix when running as root
        preexec_fn = None
        if sys.platform != "win32" and os.getuid() == 0:
            # Only demote when running as root (e.g., inside Docker containers)
            def demote() -> None:
                # This only runs in the child process (Unix only):
                # new session, then switch to gid/uid 1000.
                os.setsid()  # type: ignore[attr-defined]
                os.setgid(1000)  # type: ignore[attr-defined]
                os.setuid(1000)  # type: ignore[attr-defined]

            preexec_fn = demote

        self._process = await asyncio.create_subprocess_shell(  # noqa: S604
            self.command,
            preexec_fn=preexec_fn,
            shell=True,
            bufsize=0,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )

        self._started = True
        self._timed_out = False

    def stop(self) -> None:
        """Terminate the bash shell if it was started and is still running."""
        if not self._started:
            return
        if self._process.returncode is not None:
            return
        self._process.terminate()

    def is_alive(self) -> bool:
        """Return True if the session started, has not exited, and has not timed out."""
        return self._started and self._process.returncode is None and not self._timed_out

    async def run(self, command: str, timeout_ms: int | None = None) -> ShellCommandOutput:
        """Execute a command in the bash shell and collect its output.

        Args:
            command: Shell command text written to the session's stdin.
            timeout_ms: Timeout in milliseconds; a falsy value (None or 0)
                selects the 120 second default.

        Returns:
            A ShellCommandOutput with an "exit" outcome once the sentinel is
            seen, or a "timeout" outcome (with whatever output was captured
            so far) if the timeout elapses first.

        Raises:
            ToolError: If the session has not been started.
        """
        if not self._started:
            raise ToolError("Session has not started.")

        # Convert timeout from ms to seconds, default to 120 seconds
        timeout_sec = (timeout_ms / 1000.0) if timeout_ms else 120.0

        # we know these are not None because we created the process with PIPEs
        assert self._process.stdin
        assert self._process.stdout
        assert self._process.stderr

        # send command to the process, followed by the sentinel and exit status
        self._process.stdin.write(command.encode() + f"; echo '{self._sentinel}'$?\n".encode())
        await self._process.stdin.drain()

        output = ""
        error = ""
        exit_code = None

        # read output from the process, until the sentinel is found
        try:
            async with asyncio.timeout(timeout_sec):
                while True:
                    await asyncio.sleep(self._output_delay)
                    # if we read directly from stdout/stderr, it will wait forever for
                    # EOF. use the StreamReader buffer directly instead.
                    # Decode leniently: the buffer may hold binary output or be
                    # sampled in the middle of a multi-byte character, and a strict
                    # decode would raise before the buffers are cleared, leaving
                    # stale output and a stale sentinel for the next command.
                    output = self._process.stdout._buffer.decode(errors="replace")  # pyright: ignore[reportAttributeAccessIssue]
                    error = self._process.stderr._buffer.decode(errors="replace")  # pyright: ignore[reportAttributeAccessIssue]
                    if self._sentinel in output:
                        # Extract exit code from sentinel line
                        sentinel_idx = output.index(self._sentinel)
                        # Find the exit code after the sentinel
                        after_sentinel = output[sentinel_idx + len(self._sentinel) :]
                        newline_idx = after_sentinel.find("\n")
                        if newline_idx != -1:
                            exit_code_str = after_sentinel[:newline_idx].strip()
                        else:
                            exit_code_str = after_sentinel.strip()
                        try:
                            exit_code = int(exit_code_str)
                        except ValueError:
                            # Unparseable status is reported as 0
                            exit_code = 0
                        # strip the sentinel and exit code from output
                        output = output[:sentinel_idx]
                        break
        except TimeoutError:
            # Flag the session so is_alive() reports it as unusable
            self._timed_out = True
            # clear the buffers
            self._process.stdout._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
            self._process.stderr._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]

            return ShellCommandOutput(
                stdout=output,
                stderr=error,
                outcome=ShellCallOutcome(type="timeout"),
            )

        # Drop a single trailing newline from each stream
        if output.endswith("\n"):
            output = output[:-1]

        if error.endswith("\n"):
            error = error[:-1]

        # clear the buffers so that the next output can be read correctly
        self._process.stdout._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
        self._process.stderr._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]

        return ShellCommandOutput(
            stdout=output,
            stderr=error,
            outcome=ShellCallOutcome(type="exit", exit_code=exit_code),
        )
196
+
197
+
198
class ShellTool:
    """
    A tool that allows the agent to run shell commands.
    Conforms to OpenAI's shell tool specification.

    Features:
    - Auto-restart on error (session automatically restarts if needed)
    - Dynamic timeout via timeout_ms parameter
    - Dynamic max_output_length (passed back to API, no local truncation)
    - Accepts a list of commands per call; they run sequentially in one
      bash session
    """

    _session: _BashSession | None

    def __init__(self) -> None:
        self._session = None

    async def _ensure_session(self) -> tuple[_BashSession, str | None]:
        """Ensure a working session exists, auto-restarting if needed.

        Returns:
            Tuple of (session, restart_message) where restart_message is set
            if the session was restarted due to an error.
        """
        restart_message = None

        old_session = self._session
        if old_session is not None and not old_session.is_alive():
            # Session exists but is dead - auto-restart
            if old_session._timed_out:
                restart_message = "Previous session timed out. Session auto-restarted."
            elif old_session._started and old_session._process.returncode is not None:
                # _process only exists once the session has started
                restart_message = (
                    f"Previous session exited with code {old_session._process.returncode}. "
                    "Session auto-restarted."
                )
            else:
                restart_message = "Previous session was not usable. Session auto-restarted."
            old_session.stop()
            self._session = None

        session = self._session
        if session is None:
            session = _BashSession()
            await session.start()
            # Publish the session only after start() succeeded, so a failed
            # start never leaves an unstarted session stored on the tool.
            self._session = session

        return session, restart_message

    async def __call__(
        self,
        commands: list[str] | None = None,
        timeout_ms: int | None = None,
        max_output_length: int | None = None,
        **kwargs: object,
    ) -> ShellResult:
        """
        Execute shell commands.

        Args:
            commands: List of shell commands to execute, run one after another.
            timeout_ms: Optional timeout in milliseconds for each command.
            max_output_length: Optional max output length (passed back to API).
            **kwargs: Accepted and ignored.

        Returns:
            ShellResult conforming to shell_call_output format.

        Raises:
            ToolError: If no commands are provided.
        """
        if not commands:
            raise ToolError("No commands provided.")

        session, restart_message = await self._ensure_session()
        outputs: list[ShellCommandOutput] = []

        # Note: OpenAI docs say commands can be executed concurrently,
        # but for a single bash session, we run them sequentially.
        # For true concurrency, you'd need multiple sessions or subprocess per command.
        for command in commands:
            # Check if session is still alive before each command
            if not session.is_alive():
                session, new_restart_msg = await self._ensure_session()
                if new_restart_msg:
                    restart_message = new_restart_msg

            try:
                result = await session.run(command, timeout_ms)

                # If we had a restart message, prepend it to this output's stderr
                if restart_message:
                    if result.stderr:
                        result.stderr = f"[SYSTEM: {restart_message}]\n{result.stderr}"
                    else:
                        result.stderr = f"[SYSTEM: {restart_message}]"
                    restart_message = None  # Only add once

                outputs.append(result)
            except Exception as e:
                # Command execution failed; report it as a failed command
                # instead of aborting the remaining commands
                outputs.append(
                    ShellCommandOutput(
                        stdout="",
                        stderr=str(e),
                        outcome=ShellCallOutcome(type="exit", exit_code=1),
                    )
                )

        return ShellResult(
            output=outputs,
            max_output_length=max_output_length,
        )