hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/tools/playwright.py
CHANGED
|
@@ -84,6 +84,9 @@ class PlaywrightTool(BaseTool):
|
|
|
84
84
|
code=INVALID_PARAMS, message="url parameter is required for navigate"
|
|
85
85
|
)
|
|
86
86
|
)
|
|
87
|
+
# Guard against pydantic FieldInfo default leaking through
|
|
88
|
+
if not isinstance(wait_for_load_state, str):
|
|
89
|
+
wait_for_load_state = None
|
|
87
90
|
result = await self.navigate(url, wait_for_load_state or "networkidle")
|
|
88
91
|
|
|
89
92
|
elif action == "screenshot":
|
|
@@ -179,11 +182,16 @@ class PlaywrightTool(BaseTool):
|
|
|
179
182
|
if self._browser is None:
|
|
180
183
|
raise RuntimeError("Failed to connect to remote browser")
|
|
181
184
|
|
|
182
|
-
#
|
|
185
|
+
# Reuse existing context and page where possible to avoid spawning new windows
|
|
183
186
|
contexts = self._browser.contexts
|
|
184
187
|
if contexts:
|
|
185
188
|
self._browser_context = contexts[0]
|
|
189
|
+
# Prefer the first existing page to keep using the already visible window/tab
|
|
190
|
+
existing_pages = self._browser_context.pages
|
|
191
|
+
if existing_pages:
|
|
192
|
+
self.page = existing_pages[0]
|
|
186
193
|
else:
|
|
194
|
+
# As a fallback, create a new context
|
|
187
195
|
self._browser_context = await self._browser.new_context(
|
|
188
196
|
viewport={"width": 1920, "height": 1080},
|
|
189
197
|
ignore_https_errors=True,
|
|
@@ -225,7 +233,14 @@ class PlaywrightTool(BaseTool):
|
|
|
225
233
|
if self._browser_context is None:
|
|
226
234
|
raise RuntimeError("Browser context failed to initialize")
|
|
227
235
|
|
|
228
|
-
|
|
236
|
+
# Reuse existing page if available (for CDP connections), otherwise create new one
|
|
237
|
+
pages = self._browser_context.pages
|
|
238
|
+
if pages:
|
|
239
|
+
self.page = pages[0]
|
|
240
|
+
logger.info("Reusing existing browser page")
|
|
241
|
+
else:
|
|
242
|
+
self.page = await self._browser_context.new_page()
|
|
243
|
+
logger.info("Created new browser page")
|
|
229
244
|
logger.info("Playwright browser launched successfully")
|
|
230
245
|
|
|
231
246
|
async def navigate(
|
|
@@ -280,7 +295,7 @@ class PlaywrightTool(BaseTool):
|
|
|
280
295
|
|
|
281
296
|
try:
|
|
282
297
|
# Always return base64 encoded screenshot as ToolResult
|
|
283
|
-
screenshot_bytes = await self.page.screenshot(full_page=
|
|
298
|
+
screenshot_bytes = await self.page.screenshot(full_page=False)
|
|
284
299
|
import base64
|
|
285
300
|
|
|
286
301
|
screenshot_b64 = base64.b64encode(screenshot_bytes).decode()
|
hud/tools/shell.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shell tool implementation conforming to OpenAI's shell tool specification.
|
|
3
|
+
https://platform.openai.com/docs/guides/tools-shell
|
|
4
|
+
|
|
5
|
+
Key features:
|
|
6
|
+
- Auto-restart on error (no manual restart command)
|
|
7
|
+
- Dynamic timeout via timeout_ms from agent
|
|
8
|
+
- Dynamic max_output_length from agent (passed back, not truncated locally)
|
|
9
|
+
- Output conforms to shell_call_output format
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from typing import Any, Literal
|
|
17
|
+
|
|
18
|
+
from .types import ToolError
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
|
|
22
|
+
class ShellCallOutcome:
|
|
23
|
+
"""Outcome of a shell command execution."""
|
|
24
|
+
|
|
25
|
+
type: Literal["exit", "timeout"]
|
|
26
|
+
exit_code: int | None = None
|
|
27
|
+
|
|
28
|
+
def to_dict(self) -> dict:
|
|
29
|
+
if self.type == "timeout":
|
|
30
|
+
return {"type": "timeout"}
|
|
31
|
+
return {"type": "exit", "exit_code": self.exit_code}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class ShellCommandOutput:
    """Captured stdout/stderr text and the outcome of one shell command."""

    stdout: str
    stderr: str
    outcome: ShellCallOutcome

    def to_dict(self) -> dict:
        """Serialize to a plain dict with keys ``stdout``, ``stderr``, ``outcome``.

        The outcome is serialized through its own ``to_dict`` method.
        """
        serialized_outcome = self.outcome.to_dict()
        return dict(
            stdout=self.stdout,
            stderr=self.stderr,
            outcome=serialized_outcome,
        )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class ShellResult:
    """Aggregate result of one shell tool call (shell_call_output shape)."""

    # One entry per executed command, in execution order.
    output: list[ShellCommandOutput]
    # Echoed back in the serialized form only when it is not None.
    max_output_length: int | None = None

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        Returns:
            A dict with an ``"output"`` list holding each entry's ``to_dict()``
            result, plus ``"max_output_length"`` when that field is set.
        """
        serialized: dict[str, Any] = {
            "output": [entry.to_dict() for entry in self.output],
        }
        if self.max_output_length is None:
            return serialized
        serialized["max_output_length"] = self.max_output_length
        return serialized
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class _BashSession:
    """A long-lived bash process that executes commands one at a time.

    Commands are written to the shell's stdin followed by an ``echo`` of a
    sentinel plus ``$?``; output is collected by polling the stream buffers
    until that sentinel shows up, the shell exits, or the timeout elapses.
    """

    _started: bool
    _timed_out: bool
    _process: asyncio.subprocess.Process

    command: str = "/bin/bash"
    _output_delay: float = 0.2  # seconds between buffer polls
    _sentinel: str = "<<exit>>"

    def __init__(self) -> None:
        self._started = False
        self._timed_out = False

    async def start(self) -> None:
        """Spawn the bash process; a no-op if the session was already started."""
        if self._started:
            await asyncio.sleep(0)
            return

        # preexec_fn and user demotion only available on Unix when running as root
        preexec_fn = None
        if sys.platform != "win32" and os.getuid() == 0:
            # Only demote when running as root (e.g., inside Docker containers)
            def demote() -> None:
                # This only runs in the child process (Unix only)
                os.setsid()  # type: ignore[attr-defined]
                os.setgid(1000)  # type: ignore[attr-defined]
                os.setuid(1000)  # type: ignore[attr-defined]

            preexec_fn = demote

        self._process = await asyncio.create_subprocess_shell(  # noqa: S604
            self.command,
            preexec_fn=preexec_fn,
            shell=True,
            bufsize=0,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )

        self._started = True
        self._timed_out = False

    def stop(self) -> None:
        """Terminate the bash shell if it was started and is still running."""
        if not self._started:
            return
        if self._process.returncode is not None:
            return
        self._process.terminate()

    def is_alive(self) -> bool:
        """Check if the session is alive and usable."""
        return self._started and self._process.returncode is None and not self._timed_out

    @staticmethod
    def _read_buffer(stream: asyncio.StreamReader) -> str:
        """Decode everything currently buffered on *stream* without consuming it.

        Reading the stream normally would wait for EOF, so the reader's
        internal buffer is inspected directly. Undecodable bytes (including a
        multi-byte character that has only partially arrived) are replaced
        instead of raising, because the whole buffer is decoded again on the
        next poll anyway.
        """
        return stream._buffer.decode(errors="replace")  # pyright: ignore[reportAttributeAccessIssue]

    async def run(self, command: str, timeout_ms: int | None = None) -> ShellCommandOutput:
        """Execute a command in the bash shell.

        Args:
            command: Shell command text to send to the running bash process.
            timeout_ms: Timeout in milliseconds; a falsy value (``None`` or
                ``0``) selects the default of 120 seconds.

        Returns:
            A ``ShellCommandOutput`` with the collected stdout/stderr. The
            outcome is ``"exit"`` with the command's exit status, ``"exit"``
            with the shell's own return code when the shell process ended
            before the sentinel was printed, or ``"timeout"``.

        Raises:
            ToolError: If the session has not been started.
        """
        if not self._started:
            raise ToolError("Session has not started.")

        # Convert timeout from ms to seconds, default to 120 seconds
        timeout_sec = (timeout_ms / 1000.0) if timeout_ms else 120.0

        # we know these are not None because we created the process with PIPEs
        assert self._process.stdin
        assert self._process.stdout
        assert self._process.stderr

        # send command to the process, followed by the sentinel + exit status
        self._process.stdin.write(command.encode() + f"; echo '{self._sentinel}'$?\n".encode())
        await self._process.stdin.drain()

        output = ""
        error = ""
        exit_code = None
        # Set once the shell itself is seen to have exited; the loop then does
        # one more poll so output still in flight can reach the buffers.
        shell_exit_seen = False

        # read output from the process, until the sentinel is found
        try:
            async with asyncio.timeout(timeout_sec):
                while True:
                    await asyncio.sleep(self._output_delay)
                    output = self._read_buffer(self._process.stdout)
                    error = self._read_buffer(self._process.stderr)
                    if self._sentinel in output:
                        # Extract exit code from sentinel line
                        sentinel_idx = output.index(self._sentinel)
                        after_sentinel = output[sentinel_idx + len(self._sentinel) :]
                        newline_idx = after_sentinel.find("\n")
                        if newline_idx != -1:
                            exit_code_str = after_sentinel[:newline_idx].strip()
                        else:
                            exit_code_str = after_sentinel.strip()
                        try:
                            exit_code = int(exit_code_str)
                        except ValueError:
                            exit_code = 0
                        # strip the sentinel and exit code from output
                        output = output[:sentinel_idx]
                        break
                    if shell_exit_seen:
                        # The shell is gone (e.g. the command was `exit 3`), so
                        # the sentinel will never be echoed. Report the shell's
                        # return code instead of waiting for the full timeout.
                        exit_code = self._process.returncode
                        break
                    shell_exit_seen = self._process.returncode is not None
        except TimeoutError:
            self._timed_out = True
            # clear the buffers
            self._process.stdout._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
            self._process.stderr._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]

            return ShellCommandOutput(
                stdout=output,
                stderr=error,
                outcome=ShellCallOutcome(type="timeout"),
            )

        if output.endswith("\n"):
            output = output[:-1]

        if error.endswith("\n"):
            error = error[:-1]

        # clear the buffers so that the next output can be read correctly
        self._process.stdout._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]
        self._process.stderr._buffer.clear()  # pyright: ignore[reportAttributeAccessIssue]

        return ShellCommandOutput(
            stdout=output,
            stderr=error,
            outcome=ShellCallOutcome(type="exit", exit_code=exit_code),
        )
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
class ShellTool:
    """
    Tool that lets an agent run shell commands in a persistent bash session.

    Written against OpenAI's shell tool specification:
    - The session is recreated automatically when it is no longer usable.
    - ``timeout_ms`` is forwarded to every command.
    - ``max_output_length`` is echoed back in the result; nothing is truncated here.
    - Commands of one call share a single session and run one after another.
    """

    _session: _BashSession | None

    def __init__(self) -> None:
        self._session = None

    async def _ensure_session(self) -> tuple[_BashSession, str | None]:
        """Return a usable session, replacing the current one if it is dead.

        Returns:
            ``(session, restart_message)``. ``restart_message`` is ``None``
            unless an existing session had to be replaced, in which case it
            says why.
        """
        restart_message: str | None = None
        stale = self._session

        if stale is not None and not stale.is_alive():
            # Work out why the old session is unusable before discarding it.
            if stale._timed_out:
                restart_message = "Previous session timed out. Session auto-restarted."
            elif stale._process.returncode is not None:
                restart_message = (
                    f"Previous session exited with code {stale._process.returncode}. "
                    "Session auto-restarted."
                )
            else:
                restart_message = "Previous session was not usable. Session auto-restarted."
            stale.stop()
            self._session = None

        if self._session is None:
            # Covers both the very first call and a restart.
            self._session = _BashSession()
            await self._session.start()

        return self._session, restart_message

    async def __call__(
        self,
        commands: list[str] | None = None,
        timeout_ms: int | None = None,
        max_output_length: int | None = None,
        **kwargs: object,
    ) -> ShellResult:
        """
        Run each command in order and collect one output entry per command.

        Args:
            commands: Shell commands to execute, one after another.
            timeout_ms: Optional timeout in milliseconds, applied to each command.
            max_output_length: Optional value copied into the result unchanged.

        Returns:
            ShellResult holding the per-command outputs.

        Raises:
            ToolError: If ``commands`` is missing or empty.
        """
        if not commands:
            raise ToolError("No commands provided.")

        session, restart_message = await self._ensure_session()
        collected: list[ShellCommandOutput] = []

        for command in commands:
            # An earlier command may have killed or timed out the session.
            if not session.is_alive():
                session, fresh_message = await self._ensure_session()
                restart_message = fresh_message or restart_message

            try:
                result = await session.run(command, timeout_ms)

                # Surface a pending restart notice once, ahead of any stderr text.
                if restart_message:
                    notice = f"[SYSTEM: {restart_message}]"
                    result.stderr = f"{notice}\n{result.stderr}" if result.stderr else notice
                    restart_message = None

                collected.append(result)
            except Exception as e:
                # Report the failure as this command's output instead of aborting the batch.
                failure = ShellCommandOutput(
                    stdout="",
                    stderr=str(e),
                    outcome=ShellCallOutcome(type="exit", exit_code=1),
                )
                collected.append(failure)

        return ShellResult(output=collected, max_output_length=max_output_length)
|