hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +1 -1
- hud/agents/__init__.py +65 -6
- hud/agents/base.py +33 -15
- hud/agents/claude.py +60 -31
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +15 -26
- hud/agents/gemini_cua.py +6 -17
- hud/agents/misc/response_agent.py +7 -0
- hud/agents/openai.py +16 -29
- hud/agents/openai_chat.py +3 -19
- hud/agents/operator.py +5 -17
- hud/agents/resolver.py +70 -0
- hud/agents/tests/test_claude.py +2 -4
- hud/agents/tests/test_openai.py +2 -1
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +34 -3
- hud/cli/build.py +37 -5
- hud/cli/dev.py +11 -2
- hud/cli/eval.py +51 -39
- hud/cli/flows/init.py +1 -1
- hud/cli/pull.py +1 -1
- hud/cli/push.py +9 -2
- hud/cli/tests/test_build.py +2 -2
- hud/cli/tests/test_push.py +1 -1
- hud/cli/utils/metadata.py +1 -1
- hud/cli/utils/tests/test_metadata.py +1 -1
- hud/clients/mcp_use.py +6 -1
- hud/datasets/loader.py +17 -18
- hud/datasets/runner.py +16 -10
- hud/datasets/tests/test_loader.py +15 -15
- hud/environment/__init__.py +5 -3
- hud/environment/connection.py +58 -6
- hud/environment/connectors/mcp_config.py +29 -1
- hud/environment/environment.py +218 -77
- hud/environment/router.py +175 -24
- hud/environment/scenarios.py +313 -186
- hud/environment/tests/test_connectors.py +10 -23
- hud/environment/tests/test_environment.py +432 -0
- hud/environment/tests/test_local_connectors.py +81 -40
- hud/environment/tests/test_scenarios.py +820 -14
- hud/eval/context.py +63 -10
- hud/eval/instrument.py +4 -2
- hud/eval/manager.py +79 -12
- hud/eval/task.py +36 -4
- hud/eval/tests/test_eval.py +1 -1
- hud/eval/tests/test_task.py +147 -1
- hud/eval/types.py +2 -0
- hud/eval/utils.py +14 -3
- hud/patches/mcp_patches.py +178 -21
- hud/telemetry/instrument.py +8 -1
- hud/telemetry/tests/test_eval_telemetry.py +8 -8
- hud/tools/__init__.py +2 -0
- hud/tools/agent.py +223 -0
- hud/tools/computer/__init__.py +34 -5
- hud/tools/shell.py +3 -3
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/types.py +62 -34
- hud/utils/hud_console.py +30 -17
- hud/utils/strict_schema.py +1 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/patches/mcp_patches.py
CHANGED
|
@@ -18,8 +18,10 @@ def patch_streamable_http_error_handling() -> None:
|
|
|
18
18
|
Patch StreamableHTTPTransport.post_writer to handle request errors properly.
|
|
19
19
|
|
|
20
20
|
The original implementation doesn't catch errors in handle_request_async,
|
|
21
|
-
which can cause
|
|
22
|
-
|
|
21
|
+
which can cause the client to hang indefinitely. This patch wraps the handler
|
|
22
|
+
to send a proper JSONRPCError response when transport errors occur (e.g.,
|
|
23
|
+
ReadTimeout), allowing the waiting caller to receive the error and fail
|
|
24
|
+
gracefully instead of hanging.
|
|
23
25
|
"""
|
|
24
26
|
try:
|
|
25
27
|
from mcp.client.streamable_http import StreamableHTTPTransport
|
|
@@ -33,10 +35,67 @@ def patch_streamable_http_error_handling() -> None:
|
|
|
33
35
|
start_get_stream: Any,
|
|
34
36
|
tg: Any,
|
|
35
37
|
) -> None:
|
|
36
|
-
|
|
38
|
+
import asyncio
|
|
39
|
+
import ssl
|
|
40
|
+
import time
|
|
41
|
+
|
|
42
|
+
import httpx
|
|
37
43
|
from mcp.client.streamable_http import RequestContext
|
|
38
|
-
from mcp.shared.message import ClientMessageMetadata
|
|
39
|
-
from mcp.types import JSONRPCRequest
|
|
44
|
+
from mcp.shared.message import ClientMessageMetadata, SessionMessage
|
|
45
|
+
from mcp.types import ErrorData, JSONRPCError, JSONRPCMessage, JSONRPCRequest
|
|
46
|
+
|
|
47
|
+
from hud.settings import settings
|
|
48
|
+
|
|
49
|
+
async def handle_request_async(ctx: RequestContext, is_resumption: bool) -> None:
|
|
50
|
+
msg = ctx.session_message.message
|
|
51
|
+
# Use configured timeout, minimum 15s to prevent instant failures
|
|
52
|
+
timeout = max(settings.client_timeout, 15.0)
|
|
53
|
+
deadline = time.monotonic() + timeout
|
|
54
|
+
retryable = (
|
|
55
|
+
httpx.ConnectError,
|
|
56
|
+
httpx.ReadError,
|
|
57
|
+
httpx.TimeoutException,
|
|
58
|
+
ssl.SSLError,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
async def send_error_response(exc: Exception) -> None:
|
|
62
|
+
"""Send an error response to the client."""
|
|
63
|
+
if isinstance(msg.root, JSONRPCRequest):
|
|
64
|
+
error_response = JSONRPCError(
|
|
65
|
+
jsonrpc="2.0",
|
|
66
|
+
id=msg.root.id,
|
|
67
|
+
error=ErrorData(
|
|
68
|
+
code=-32000,
|
|
69
|
+
message=f"Transport error: {type(exc).__name__}",
|
|
70
|
+
data={"error_type": type(exc).__name__, "detail": str(exc)},
|
|
71
|
+
),
|
|
72
|
+
)
|
|
73
|
+
await ctx.read_stream_writer.send(
|
|
74
|
+
SessionMessage(JSONRPCMessage(error_response))
|
|
75
|
+
)
|
|
76
|
+
else:
|
|
77
|
+
await ctx.read_stream_writer.send(exc)
|
|
78
|
+
|
|
79
|
+
while True:
|
|
80
|
+
try:
|
|
81
|
+
if is_resumption:
|
|
82
|
+
await self._handle_resumption_request(ctx)
|
|
83
|
+
else:
|
|
84
|
+
await self._handle_post_request(ctx)
|
|
85
|
+
return
|
|
86
|
+
except retryable as e:
|
|
87
|
+
if time.monotonic() >= deadline:
|
|
88
|
+
logger.error("MCP request failed after timeout: %s", e)
|
|
89
|
+
await send_error_response(e)
|
|
90
|
+
return
|
|
91
|
+
logger.warning("Retrying MCP request after error: %s", e)
|
|
92
|
+
await asyncio.sleep(2.0)
|
|
93
|
+
except asyncio.CancelledError:
|
|
94
|
+
raise
|
|
95
|
+
except Exception as e:
|
|
96
|
+
logger.exception("Request handler error: %s", e)
|
|
97
|
+
await send_error_response(e)
|
|
98
|
+
return
|
|
40
99
|
|
|
41
100
|
try:
|
|
42
101
|
async with write_stream_reader:
|
|
@@ -47,7 +106,6 @@ def patch_streamable_http_error_handling() -> None:
|
|
|
47
106
|
if isinstance(session_message.metadata, ClientMessageMetadata)
|
|
48
107
|
else None
|
|
49
108
|
)
|
|
50
|
-
|
|
51
109
|
is_resumption = bool(metadata and metadata.resumption_token)
|
|
52
110
|
|
|
53
111
|
logger.debug("Sending client message: %s", message)
|
|
@@ -65,21 +123,6 @@ def patch_streamable_http_error_handling() -> None:
|
|
|
65
123
|
sse_read_timeout=self.sse_read_timeout,
|
|
66
124
|
)
|
|
67
125
|
|
|
68
|
-
# Patched: Accept ctx and is_resumption as params, add error handling
|
|
69
|
-
async def handle_request_async(
|
|
70
|
-
ctx: RequestContext = ctx,
|
|
71
|
-
is_resumption: bool = is_resumption,
|
|
72
|
-
) -> None:
|
|
73
|
-
try:
|
|
74
|
-
if is_resumption:
|
|
75
|
-
await self._handle_resumption_request(ctx)
|
|
76
|
-
else:
|
|
77
|
-
await self._handle_post_request(ctx)
|
|
78
|
-
except Exception as e:
|
|
79
|
-
# Send error to read stream so client knows request failed
|
|
80
|
-
logger.error("Request handler error: %s", e)
|
|
81
|
-
await ctx.read_stream_writer.send(e)
|
|
82
|
-
|
|
83
126
|
if isinstance(message.root, JSONRPCRequest):
|
|
84
127
|
tg.start_soon(handle_request_async, ctx, is_resumption)
|
|
85
128
|
else:
|
|
@@ -122,6 +165,119 @@ def patch_client_session_validation() -> None:
|
|
|
122
165
|
logger.warning("Failed to patch client session: %s", e)
|
|
123
166
|
|
|
124
167
|
|
|
168
|
+
def patch_server_output_validation() -> None:
|
|
169
|
+
"""
|
|
170
|
+
Patch MCP server to skip structured output validation and auto-generate
|
|
171
|
+
structuredContent for FastMCP tools with x-fastmcp-wrap-result.
|
|
172
|
+
"""
|
|
173
|
+
try:
|
|
174
|
+
import json
|
|
175
|
+
|
|
176
|
+
import mcp.types as types
|
|
177
|
+
from mcp.server.lowlevel.server import Server
|
|
178
|
+
|
|
179
|
+
def patched_call_tool(
|
|
180
|
+
self: Any, validate_input: bool = True, validate_output: bool = False
|
|
181
|
+
) -> Any:
|
|
182
|
+
"""Patched call_tool that skips output validation."""
|
|
183
|
+
|
|
184
|
+
def decorator(func: Any) -> Any:
|
|
185
|
+
async def handler(req: types.CallToolRequest) -> Any:
|
|
186
|
+
try:
|
|
187
|
+
tool_name = req.params.name
|
|
188
|
+
arguments = req.params.arguments or {}
|
|
189
|
+
tool = await self._get_cached_tool_definition(tool_name)
|
|
190
|
+
|
|
191
|
+
if validate_input and tool:
|
|
192
|
+
try:
|
|
193
|
+
import jsonschema
|
|
194
|
+
|
|
195
|
+
jsonschema.validate(instance=arguments, schema=tool.inputSchema)
|
|
196
|
+
except jsonschema.ValidationError as e:
|
|
197
|
+
return self._make_error_result(
|
|
198
|
+
f"Input validation error: {e.message}"
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
results = await func(tool_name, arguments)
|
|
202
|
+
|
|
203
|
+
# output normalization
|
|
204
|
+
unstructured_content: list[Any]
|
|
205
|
+
maybe_structured_content: dict[str, Any] | None
|
|
206
|
+
if isinstance(results, types.CallToolResult):
|
|
207
|
+
return types.ServerResult(results)
|
|
208
|
+
elif isinstance(results, tuple) and len(results) == 2:
|
|
209
|
+
unstructured_content, maybe_structured_content = results
|
|
210
|
+
elif isinstance(results, dict):
|
|
211
|
+
maybe_structured_content = results
|
|
212
|
+
text = json.dumps(results, indent=2)
|
|
213
|
+
unstructured_content = [types.TextContent(type="text", text=text)]
|
|
214
|
+
elif results is None:
|
|
215
|
+
# None means success with no content
|
|
216
|
+
unstructured_content = []
|
|
217
|
+
maybe_structured_content = None
|
|
218
|
+
elif isinstance(results, (str, bytes, bytearray, memoryview)):
|
|
219
|
+
# Handle string/bytes explicitly before iterable check
|
|
220
|
+
# (these are iterable but should not be split into chars/ints)
|
|
221
|
+
if isinstance(results, str):
|
|
222
|
+
text = results
|
|
223
|
+
elif isinstance(results, memoryview):
|
|
224
|
+
text = bytes(results).decode("utf-8", errors="replace")
|
|
225
|
+
else:
|
|
226
|
+
text = bytes(results).decode("utf-8", errors="replace")
|
|
227
|
+
unstructured_content = [types.TextContent(type="text", text=text)]
|
|
228
|
+
maybe_structured_content = None
|
|
229
|
+
elif isinstance(results, (int, float, bool)):
|
|
230
|
+
# Primitives -> string representation
|
|
231
|
+
unstructured_content = [
|
|
232
|
+
types.TextContent(type="text", text=str(results))
|
|
233
|
+
]
|
|
234
|
+
maybe_structured_content = None
|
|
235
|
+
elif hasattr(results, "__iter__"):
|
|
236
|
+
unstructured_content = list(results)
|
|
237
|
+
maybe_structured_content = None
|
|
238
|
+
else:
|
|
239
|
+
return self._make_error_result(
|
|
240
|
+
f"Unexpected return type: {type(results).__name__}"
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
# Auto-generate structuredContent for FastMCP tools
|
|
244
|
+
# FastMCP generates outputSchema but doesn't populate it
|
|
245
|
+
if maybe_structured_content is None and tool:
|
|
246
|
+
output_schema = getattr(tool, "outputSchema", None)
|
|
247
|
+
if output_schema and output_schema.get("x-fastmcp-wrap-result"):
|
|
248
|
+
for item in unstructured_content:
|
|
249
|
+
if isinstance(item, types.TextContent):
|
|
250
|
+
try:
|
|
251
|
+
parsed = json.loads(item.text)
|
|
252
|
+
maybe_structured_content = {"result": parsed}
|
|
253
|
+
except json.JSONDecodeError:
|
|
254
|
+
maybe_structured_content = {"result": item.text}
|
|
255
|
+
break
|
|
256
|
+
|
|
257
|
+
return types.ServerResult(
|
|
258
|
+
types.CallToolResult(
|
|
259
|
+
content=list(unstructured_content),
|
|
260
|
+
structuredContent=maybe_structured_content,
|
|
261
|
+
isError=False,
|
|
262
|
+
)
|
|
263
|
+
)
|
|
264
|
+
except Exception as e:
|
|
265
|
+
return self._make_error_result(str(e))
|
|
266
|
+
|
|
267
|
+
self.request_handlers[types.CallToolRequest] = handler
|
|
268
|
+
return func
|
|
269
|
+
|
|
270
|
+
return decorator
|
|
271
|
+
|
|
272
|
+
Server.call_tool = patched_call_tool
|
|
273
|
+
logger.debug("Patched Server.call_tool to skip output validation")
|
|
274
|
+
|
|
275
|
+
except ImportError:
|
|
276
|
+
logger.debug("mcp.server.lowlevel.server not available, skipping patch")
|
|
277
|
+
except Exception as e:
|
|
278
|
+
logger.warning("Failed to patch server output validation: %s", e)
|
|
279
|
+
|
|
280
|
+
|
|
125
281
|
def suppress_fastmcp_logging(level: int = logging.WARNING) -> None:
|
|
126
282
|
"""
|
|
127
283
|
Suppress verbose fastmcp logging.
|
|
@@ -147,5 +303,6 @@ def apply_all_patches() -> None:
|
|
|
147
303
|
"""Apply all MCP patches."""
|
|
148
304
|
patch_streamable_http_error_handling()
|
|
149
305
|
patch_client_session_validation()
|
|
306
|
+
patch_server_output_validation()
|
|
150
307
|
suppress_fastmcp_logging()
|
|
151
308
|
logger.debug("All MCP patches applied")
|
hud/telemetry/instrument.py
CHANGED
|
@@ -83,6 +83,7 @@ def instrument(
|
|
|
83
83
|
name: str | None = None,
|
|
84
84
|
category: str = "function",
|
|
85
85
|
span_type: str | None = None,
|
|
86
|
+
internal_type: str | None = None,
|
|
86
87
|
record_args: bool = True,
|
|
87
88
|
record_result: bool = True,
|
|
88
89
|
) -> Callable[[Callable[..., Any]], Callable[..., Any]]: ...
|
|
@@ -95,6 +96,7 @@ def instrument(
|
|
|
95
96
|
name: str | None = None,
|
|
96
97
|
category: str = "function",
|
|
97
98
|
span_type: str | None = None,
|
|
99
|
+
internal_type: str | None = None,
|
|
98
100
|
record_args: bool = True,
|
|
99
101
|
record_result: bool = True,
|
|
100
102
|
) -> Callable[P, R]: ...
|
|
@@ -107,6 +109,7 @@ def instrument(
|
|
|
107
109
|
name: str | None = None,
|
|
108
110
|
category: str = "function",
|
|
109
111
|
span_type: str | None = None,
|
|
112
|
+
internal_type: str | None = None,
|
|
110
113
|
record_args: bool = True,
|
|
111
114
|
record_result: bool = True,
|
|
112
115
|
) -> Callable[P, Awaitable[R]]: ...
|
|
@@ -118,6 +121,7 @@ def instrument(
|
|
|
118
121
|
name: str | None = None,
|
|
119
122
|
category: str = "function",
|
|
120
123
|
span_type: str | None = None,
|
|
124
|
+
internal_type: str | None = None,
|
|
121
125
|
record_args: bool = True,
|
|
122
126
|
record_result: bool = True,
|
|
123
127
|
) -> Callable[..., Any]:
|
|
@@ -130,6 +134,7 @@ def instrument(
|
|
|
130
134
|
name: Custom span name (defaults to module.function)
|
|
131
135
|
category: Span category (e.g., "agent", "tool", "function", "mcp")
|
|
132
136
|
span_type: Alias for category (deprecated, use category instead)
|
|
137
|
+
internal_type: Internal span type (e.g., "user-message")
|
|
133
138
|
record_args: Whether to record function arguments
|
|
134
139
|
record_result: Whether to record function result
|
|
135
140
|
|
|
@@ -204,7 +209,7 @@ def instrument(
|
|
|
204
209
|
|
|
205
210
|
# Build span
|
|
206
211
|
span_id = uuid.uuid4().hex[:16]
|
|
207
|
-
span = {
|
|
212
|
+
span: dict[str, Any] = {
|
|
208
213
|
"name": span_name,
|
|
209
214
|
"trace_id": _normalize_trace_id(task_run_id),
|
|
210
215
|
"span_id": span_id,
|
|
@@ -216,6 +221,8 @@ def instrument(
|
|
|
216
221
|
"attributes": attributes.model_dump(mode="json", exclude_none=True),
|
|
217
222
|
"exceptions": [{"message": error}] if error else None,
|
|
218
223
|
}
|
|
224
|
+
if internal_type:
|
|
225
|
+
span["internal_type"] = internal_type
|
|
219
226
|
return span
|
|
220
227
|
|
|
221
228
|
@functools.wraps(func)
|
hud/telemetry/tests/test_eval_telemetry.py
CHANGED
|
@@ -49,8 +49,8 @@ class TestEvalContextTelemetry:
|
|
|
49
49
|
"""Say hello."""
|
|
50
50
|
return f"Hello, {name}!"
|
|
51
51
|
|
|
52
|
-
# Create task from environment
|
|
53
|
-
task = Task(env=env)
|
|
52
|
+
# Create task from environment (args={} = runnable, args=None = template)
|
|
53
|
+
task = Task(env=env, args={})
|
|
54
54
|
|
|
55
55
|
with (
|
|
56
56
|
patch("hud.settings.settings") as mock_settings,
|
|
@@ -110,7 +110,7 @@ class TestEvalContextTelemetry:
|
|
|
110
110
|
"""Always fails."""
|
|
111
111
|
raise ValueError("Tool error")
|
|
112
112
|
|
|
113
|
-
task = Task(env=env)
|
|
113
|
+
task = Task(env=env, args={})
|
|
114
114
|
|
|
115
115
|
with (
|
|
116
116
|
patch("hud.settings.settings") as mock_settings,
|
|
@@ -162,7 +162,7 @@ class TestEvalContextTelemetry:
|
|
|
162
162
|
"""Multiply two numbers."""
|
|
163
163
|
return a * b
|
|
164
164
|
|
|
165
|
-
task = Task(env=env)
|
|
165
|
+
task = Task(env=env, args={})
|
|
166
166
|
|
|
167
167
|
with (
|
|
168
168
|
patch("hud.settings.settings") as mock_settings,
|
|
@@ -195,7 +195,7 @@ class TestEvalContextTelemetry:
|
|
|
195
195
|
async def simple_tool() -> str:
|
|
196
196
|
return "done"
|
|
197
197
|
|
|
198
|
-
task = Task(env=env)
|
|
198
|
+
task = Task(env=env, args={})
|
|
199
199
|
|
|
200
200
|
with (
|
|
201
201
|
patch("hud.eval.context.flush") as mock_flush,
|
|
@@ -229,7 +229,7 @@ class TestEvalContextTelemetry:
|
|
|
229
229
|
async def test_tool() -> str:
|
|
230
230
|
return "ok"
|
|
231
231
|
|
|
232
|
-
task = Task(env=env)
|
|
232
|
+
task = Task(env=env, args={})
|
|
233
233
|
|
|
234
234
|
with (
|
|
235
235
|
patch("hud.settings.settings") as mock_settings,
|
|
@@ -272,7 +272,7 @@ class TestSpanFormat:
|
|
|
272
272
|
async def echo(message: str) -> str:
|
|
273
273
|
return message
|
|
274
274
|
|
|
275
|
-
task = Task(env=env)
|
|
275
|
+
task = Task(env=env, args={})
|
|
276
276
|
|
|
277
277
|
with (
|
|
278
278
|
patch("hud.settings.settings") as mock_settings,
|
|
@@ -329,7 +329,7 @@ class TestSpanFormat:
|
|
|
329
329
|
async def noop() -> None:
|
|
330
330
|
pass
|
|
331
331
|
|
|
332
|
-
task = Task(env=env)
|
|
332
|
+
task = Task(env=env, args={})
|
|
333
333
|
|
|
334
334
|
with (
|
|
335
335
|
patch("hud.settings.settings") as mock_settings,
|
hud/tools/__init__.py
CHANGED
|
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
+
from .agent import AgentTool
|
|
7
8
|
from .base import BaseHub, BaseTool
|
|
8
9
|
from .bash import BashTool
|
|
9
10
|
from .edit import EditTool
|
|
@@ -21,6 +22,7 @@ if TYPE_CHECKING:
|
|
|
21
22
|
)
|
|
22
23
|
|
|
23
24
|
__all__ = [
|
|
25
|
+
"AgentTool",
|
|
24
26
|
"AnthropicComputerTool",
|
|
25
27
|
"BaseHub",
|
|
26
28
|
"BaseTool",
|
hud/tools/agent.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""AgentTool - run a Task with an agent as a tool."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import inspect
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Union, get_args, get_origin
|
|
7
|
+
|
|
8
|
+
from fastmcp.tools.tool import FunctionTool, ToolResult
|
|
9
|
+
from mcp.types import TextContent
|
|
10
|
+
|
|
11
|
+
from hud.tools.base import BaseTool
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from hud.agents.base import MCPAgent
|
|
15
|
+
from hud.eval.task import Task
|
|
16
|
+
|
|
17
|
+
__all__ = ["AgentTool"]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _is_eval_only(param: inspect.Parameter) -> bool:
|
|
21
|
+
"""Check if param is eval-only: has None default AND None in type union.
|
|
22
|
+
|
|
23
|
+
Handles both runtime types and string annotations (PEP 563).
|
|
24
|
+
"""
|
|
25
|
+
# Must have default of None
|
|
26
|
+
if param.default is not None:
|
|
27
|
+
return False
|
|
28
|
+
if param.annotation is inspect.Parameter.empty:
|
|
29
|
+
return False
|
|
30
|
+
|
|
31
|
+
annotation = param.annotation
|
|
32
|
+
|
|
33
|
+
# Handle string annotations (from __future__ annotations or quoted)
|
|
34
|
+
if isinstance(annotation, str):
|
|
35
|
+
# Check if it looks like "X | None", "Union[X, None]", or "Optional[X]"
|
|
36
|
+
return (
|
|
37
|
+
"| None" in annotation
|
|
38
|
+
or "None |" in annotation
|
|
39
|
+
or "Optional[" in annotation
|
|
40
|
+
or ("Union[" in annotation and "None" in annotation)
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# Handle runtime type annotations
|
|
44
|
+
origin = get_origin(annotation)
|
|
45
|
+
|
|
46
|
+
# Union types (X | None or Union[X, None])
|
|
47
|
+
if origin is Union:
|
|
48
|
+
return type(None) in get_args(annotation)
|
|
49
|
+
|
|
50
|
+
# For Python 3.10+ union syntax at runtime (types.UnionType)
|
|
51
|
+
try:
|
|
52
|
+
import types
|
|
53
|
+
|
|
54
|
+
if isinstance(annotation, types.UnionType):
|
|
55
|
+
return type(None) in get_args(annotation)
|
|
56
|
+
except (ImportError, AttributeError):
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
return False
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class AgentTool(BaseTool):
|
|
63
|
+
"""Tool that runs a Task template with an agent.
|
|
64
|
+
|
|
65
|
+
Parameters with `| None = None` are eval-only and hidden from the tool schema.
|
|
66
|
+
|
|
67
|
+
Example:
|
|
68
|
+
```python
|
|
69
|
+
@env.scenario()
|
|
70
|
+
async def investigate(
|
|
71
|
+
issue_id: str, # Required - orchestrator sees
|
|
72
|
+
expected_cause: str | None = None, # Eval only - hidden
|
|
73
|
+
):
|
|
74
|
+
yield {"task": f"Investigate {issue_id}"}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
seer = AgentTool(env("investigate"), model="ft:seer-v2")
|
|
78
|
+
```
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
def __init__(
|
|
82
|
+
self,
|
|
83
|
+
task: Task,
|
|
84
|
+
*,
|
|
85
|
+
model: str | None = None,
|
|
86
|
+
agent: type[MCPAgent] | None = None,
|
|
87
|
+
agent_params: dict[str, Any] | None = None,
|
|
88
|
+
name: str | None = None,
|
|
89
|
+
description: str | None = None,
|
|
90
|
+
trace: bool = False,
|
|
91
|
+
) -> None:
|
|
92
|
+
if not model and agent is None:
|
|
93
|
+
raise ValueError("Must provide either 'model' or 'agent'")
|
|
94
|
+
if model and agent is not None:
|
|
95
|
+
raise ValueError("Cannot provide both 'model' and 'agent'")
|
|
96
|
+
|
|
97
|
+
self._task = task
|
|
98
|
+
self._model = model
|
|
99
|
+
self._agent_cls = agent
|
|
100
|
+
self._agent_params = agent_params or {}
|
|
101
|
+
self._trace = trace
|
|
102
|
+
|
|
103
|
+
# Get visible params from scenario function
|
|
104
|
+
self._visible_params: set[str] = set()
|
|
105
|
+
self._param_schema: dict[str, Any] = {
|
|
106
|
+
"type": "object",
|
|
107
|
+
"properties": {},
|
|
108
|
+
"required": [],
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
if task.env and task.scenario:
|
|
112
|
+
scenario_fn = task.env._scenarios.get(task.scenario)
|
|
113
|
+
if scenario_fn:
|
|
114
|
+
sig = inspect.signature(scenario_fn)
|
|
115
|
+
visible = {name: p for name, p in sig.parameters.items() if not _is_eval_only(p)}
|
|
116
|
+
self._visible_params = set(visible.keys())
|
|
117
|
+
self._param_schema = self._build_schema(visible)
|
|
118
|
+
|
|
119
|
+
tool_name = name or task.scenario or "agent_tool"
|
|
120
|
+
tool_desc = description or f"Run scenario: {task.scenario}"
|
|
121
|
+
|
|
122
|
+
super().__init__(name=tool_name, description=tool_desc)
|
|
123
|
+
|
|
124
|
+
def _build_schema(self, params: dict[str, inspect.Parameter]) -> dict[str, Any]:
|
|
125
|
+
"""Build JSON schema using Pydantic TypeAdapter."""
|
|
126
|
+
from pydantic import TypeAdapter
|
|
127
|
+
|
|
128
|
+
properties: dict[str, Any] = {}
|
|
129
|
+
required: list[str] = []
|
|
130
|
+
|
|
131
|
+
for name, param in params.items():
|
|
132
|
+
if param.annotation is not inspect.Parameter.empty:
|
|
133
|
+
try:
|
|
134
|
+
# Handle string annotations
|
|
135
|
+
annotation = param.annotation
|
|
136
|
+
if isinstance(annotation, str):
|
|
137
|
+
# Try to evaluate the annotation
|
|
138
|
+
try:
|
|
139
|
+
annotation = eval(annotation) # noqa: S307
|
|
140
|
+
except Exception:
|
|
141
|
+
# Fall back to string type but don't skip required handling
|
|
142
|
+
annotation = None
|
|
143
|
+
|
|
144
|
+
if annotation is not None:
|
|
145
|
+
adapter = TypeAdapter(annotation)
|
|
146
|
+
properties[name] = adapter.json_schema()
|
|
147
|
+
else:
|
|
148
|
+
properties[name] = {"type": "string"}
|
|
149
|
+
except Exception:
|
|
150
|
+
properties[name] = {"type": "string"}
|
|
151
|
+
else:
|
|
152
|
+
properties[name] = {"type": "string"}
|
|
153
|
+
|
|
154
|
+
if param.default is inspect.Parameter.empty:
|
|
155
|
+
required.append(name)
|
|
156
|
+
elif param.default is not None:
|
|
157
|
+
properties[name]["default"] = param.default
|
|
158
|
+
|
|
159
|
+
return {"type": "object", "properties": properties, "required": required}
|
|
160
|
+
|
|
161
|
+
@property
|
|
162
|
+
def mcp(self) -> FunctionTool:
|
|
163
|
+
"""Get as FastMCP FunctionTool with filtered schema."""
|
|
164
|
+
if not hasattr(self, "_mcp_tool"):
|
|
165
|
+
# Directly instantiate FunctionTool with our callable and schema
|
|
166
|
+
# This bypasses from_function's signature parsing
|
|
167
|
+
self._mcp_tool = FunctionTool(
|
|
168
|
+
name=self.name,
|
|
169
|
+
description=self.description or "",
|
|
170
|
+
parameters=self._param_schema,
|
|
171
|
+
fn=self._execute_with_args,
|
|
172
|
+
)
|
|
173
|
+
return self._mcp_tool
|
|
174
|
+
|
|
175
|
+
async def _execute_with_args(self, **kwargs: Any) -> ToolResult:
|
|
176
|
+
"""Internal executor that FastMCP calls with parsed arguments."""
|
|
177
|
+
return await self(**kwargs)
|
|
178
|
+
|
|
179
|
+
async def __call__(self, **kwargs: Any) -> ToolResult:
|
|
180
|
+
"""Execute the task with a fresh agent."""
|
|
181
|
+
from hud.eval.context import get_current_trace_id
|
|
182
|
+
from hud.eval.manager import run_eval
|
|
183
|
+
from hud.telemetry.instrument import instrument
|
|
184
|
+
|
|
185
|
+
# Filter to visible params only
|
|
186
|
+
filtered = {k: v for k, v in kwargs.items() if k in self._visible_params}
|
|
187
|
+
|
|
188
|
+
# Merge with template args
|
|
189
|
+
base_args = self._task.args or {}
|
|
190
|
+
task = self._task.model_copy(update={"args": {**base_args, **filtered}})
|
|
191
|
+
|
|
192
|
+
# Use parent trace if available (for hierarchical agents)
|
|
193
|
+
parent_trace_id = get_current_trace_id()
|
|
194
|
+
|
|
195
|
+
# If nested (has parent), skip subagent's enter/exit registration
|
|
196
|
+
# Tool calls are still recorded via the shared trace_id's context
|
|
197
|
+
is_nested = parent_trace_id is not None
|
|
198
|
+
|
|
199
|
+
# Trace if explicitly requested AND not nested (nested uses parent trace)
|
|
200
|
+
should_trace = self._trace and not is_nested
|
|
201
|
+
|
|
202
|
+
# Wrap execution with instrumentation to mark as subagent
|
|
203
|
+
# Platform uses category="subagent" to detect and render subagent tool calls
|
|
204
|
+
@instrument(category="subagent", name=self.name)
|
|
205
|
+
async def _run_subagent() -> ToolResult:
|
|
206
|
+
async with run_eval(
|
|
207
|
+
task,
|
|
208
|
+
trace=should_trace,
|
|
209
|
+
trace_id=parent_trace_id,
|
|
210
|
+
quiet=True,
|
|
211
|
+
) as ctx:
|
|
212
|
+
if self._model:
|
|
213
|
+
from hud.agents import create_agent
|
|
214
|
+
|
|
215
|
+
agent = create_agent(self._model, **self._agent_params)
|
|
216
|
+
else:
|
|
217
|
+
agent = self._agent_cls.create(**self._agent_params) # type: ignore
|
|
218
|
+
|
|
219
|
+
result = await agent.run(ctx)
|
|
220
|
+
content = result.content if hasattr(result, "content") and result.content else ""
|
|
221
|
+
return ToolResult(content=[TextContent(type="text", text=content)])
|
|
222
|
+
|
|
223
|
+
return await _run_subagent()
|
hud/tools/computer/__init__.py
CHANGED
|
@@ -2,13 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from .anthropic import AnthropicComputerTool
|
|
6
|
-
from .gemini import GeminiComputerTool
|
|
7
|
-
from .hud import HudComputerTool
|
|
8
|
-
from .openai import OpenAIComputerTool
|
|
9
|
-
from .qwen import QwenComputerTool
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
10
7
|
from .settings import computer_settings
|
|
11
8
|
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from .anthropic import AnthropicComputerTool
|
|
11
|
+
from .gemini import GeminiComputerTool
|
|
12
|
+
from .hud import HudComputerTool
|
|
13
|
+
from .openai import OpenAIComputerTool
|
|
14
|
+
from .qwen import QwenComputerTool
|
|
15
|
+
|
|
12
16
|
__all__ = [
|
|
13
17
|
"AnthropicComputerTool",
|
|
14
18
|
"GeminiComputerTool",
|
|
@@ -17,3 +21,28 @@ __all__ = [
|
|
|
17
21
|
"QwenComputerTool",
|
|
18
22
|
"computer_settings",
|
|
19
23
|
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def __getattr__(name: str) -> type:
|
|
27
|
+
"""Lazy import computer tools."""
|
|
28
|
+
if name == "AnthropicComputerTool":
|
|
29
|
+
from .anthropic import AnthropicComputerTool
|
|
30
|
+
|
|
31
|
+
return AnthropicComputerTool
|
|
32
|
+
elif name == "GeminiComputerTool":
|
|
33
|
+
from .gemini import GeminiComputerTool
|
|
34
|
+
|
|
35
|
+
return GeminiComputerTool
|
|
36
|
+
elif name == "HudComputerTool":
|
|
37
|
+
from .hud import HudComputerTool
|
|
38
|
+
|
|
39
|
+
return HudComputerTool
|
|
40
|
+
elif name == "OpenAIComputerTool":
|
|
41
|
+
from .openai import OpenAIComputerTool
|
|
42
|
+
|
|
43
|
+
return OpenAIComputerTool
|
|
44
|
+
elif name == "QwenComputerTool":
|
|
45
|
+
from .qwen import QwenComputerTool
|
|
46
|
+
|
|
47
|
+
return QwenComputerTool
|
|
48
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
hud/tools/shell.py
CHANGED
|
@@ -82,10 +82,10 @@ class _BashSession:
|
|
|
82
82
|
await asyncio.sleep(0)
|
|
83
83
|
return
|
|
84
84
|
|
|
85
|
-
# preexec_fn and user demotion only available on Unix
|
|
85
|
+
# preexec_fn and user demotion only available on Unix when running as root
|
|
86
86
|
preexec_fn = None
|
|
87
|
-
if sys.platform != "win32":
|
|
88
|
-
|
|
87
|
+
if sys.platform != "win32" and os.getuid() == 0:
|
|
88
|
+
# Only demote when running as root (e.g., inside Docker containers)
|
|
89
89
|
def demote() -> None:
|
|
90
90
|
# This only runs in the child process (Unix only)
|
|
91
91
|
os.setsid() # type: ignore[attr-defined]
|