hud-python 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +7 -0
- hud/agents/base.py +42 -10
- hud/agents/claude.py +24 -14
- hud/agents/grounded_openai.py +280 -0
- hud/agents/tests/test_client.py +11 -27
- hud/agents/tests/test_grounded_openai_agent.py +155 -0
- hud/cli/__init__.py +50 -20
- hud/cli/build.py +3 -44
- hud/cli/eval.py +25 -6
- hud/cli/init.py +4 -4
- hud/cli/push.py +3 -1
- hud/cli/tests/test_push.py +6 -6
- hud/cli/utils/interactive.py +1 -1
- hud/clients/__init__.py +3 -2
- hud/clients/base.py +20 -9
- hud/clients/mcp_use.py +44 -22
- hud/datasets/task.py +6 -2
- hud/native/__init__.py +6 -0
- hud/native/comparator.py +546 -0
- hud/native/tests/__init__.py +1 -0
- hud/native/tests/test_comparator.py +539 -0
- hud/native/tests/test_native_init.py +79 -0
- hud/otel/instrumentation.py +0 -2
- hud/server/server.py +9 -2
- hud/settings.py +6 -0
- hud/shared/exceptions.py +204 -31
- hud/shared/hints.py +177 -0
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +385 -144
- hud/tools/__init__.py +2 -0
- hud/tools/executors/tests/test_base_executor.py +1 -1
- hud/tools/executors/xdo.py +1 -1
- hud/tools/grounding/__init__.py +13 -0
- hud/tools/grounding/config.py +54 -0
- hud/tools/grounding/grounded_tool.py +314 -0
- hud/tools/grounding/grounder.py +301 -0
- hud/tools/grounding/tests/__init__.py +1 -0
- hud/tools/grounding/tests/test_grounded_tool.py +196 -0
- hud/tools/submit.py +66 -0
- hud/tools/tests/test_playwright_tool.py +1 -1
- hud/tools/tests/test_tools_init.py +1 -1
- hud/tools/tests/test_utils.py +2 -2
- hud/types.py +33 -5
- hud/utils/agent_factories.py +86 -0
- hud/utils/design.py +57 -0
- hud/utils/mcp.py +6 -0
- hud/utils/pretty_errors.py +68 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/METADATA +2 -4
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/RECORD +54 -37
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/WHEEL +0 -0
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import mcp.types as types
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from hud.tools.grounding.grounded_tool import GroundedComputerTool
|
|
10
|
+
from hud.types import MCPToolCall, MCPToolResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class FakeResult:
    """Plain data holder with the three fields of a tool-call result."""

    # Content blocks carried by the result.
    content: list[types.ContentBlock]
    # Failure flag; a result counts as successful unless this is set.
    isError: bool = False
    # Optional structured payload; absent by default.
    structuredContent: dict | None = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FakeMCPClient:
    """In-memory MCP client double that records every tool call it receives."""

    _initialized: bool

    def __init__(self) -> None:
        # Start disconnected, with an empty log of (tool name, arguments) pairs.
        self._initialized = False
        self.calls: list[tuple[str, dict[str, Any]]] = []

    @property
    def mcp_config(self) -> dict[str, dict[str, Any]]:
        """Return a fixed configuration containing one server named "test"."""
        server_entry: dict[str, Any] = {"command": "echo", "args": ["test"]}
        return {"test": server_entry}

    @property
    def is_connected(self) -> bool:
        """Report whether `initialize` has run more recently than `shutdown`."""
        return self._initialized

    async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
        """Mark the client as connected; the supplied config is not used."""
        self._initialized = True

    async def list_tools(self) -> list[types.Tool]:
        """Return a single tool named "computer"."""
        computer_tool = types.Tool(name="computer", description="Test tool", inputSchema={})
        return [computer_tool]

    async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
        """Log the call and answer with a successful result containing the text "ok"."""
        tool_name = tool_call.name
        # Missing/empty arguments are recorded as an empty dict.
        tool_arguments = tool_call.arguments or {}
        self.calls.append((tool_name, tool_arguments))
        ok_block = types.TextContent(text="ok", type="text")
        return MCPToolResult(content=[ok_block], isError=False)

    async def shutdown(self) -> None:
        """Mark the client as disconnected."""
        self._initialized = False
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class FakeGrounder:
    """Grounder double that returns preset coordinates and logs each request."""

    def __init__(self, coords: tuple[int, int] | None = (10, 20)) -> None:
        # Log of (first 10 characters of the image, instruction) per request.
        self.calls: list[tuple[str, str]] = []
        # Value handed back by every predict_click call; None is allowed.
        self.coords = coords

    async def predict_click(
        self, *, image_b64: str, instruction: str, max_retries: int = 3
    ) -> tuple[int, int] | None:
        """Record the request and return the preset coordinates.

        `max_retries` is accepted but not used by this fake.
        """
        image_prefix = image_b64[:10]
        self.calls.append((image_prefix, instruction))
        return self.coords
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _png_b64() -> str:
    """Return the base64 text of a minimal 1x1 transparent PNG image."""
    leading = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGMAAQAABQAB"
    trailing = "J2n0mQAAAABJRU5ErkJggg=="
    return leading + trailing
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@pytest.mark.asyncio
async def test_click_action_grounds_and_calls_mcp() -> None:
    """A described click is grounded once and sent to MCP with the resolved x/y."""
    mcp = FakeMCPClient()
    fake_grounder = FakeGrounder(coords=(123, 456))
    grounded = GroundedComputerTool(grounder=fake_grounder, mcp_client=mcp)  # type: ignore

    click_request = {
        "action": "click",
        "element_description": "red button",
        "screenshot_b64": _png_b64(),
        "button": "left",
    }
    result = await grounded(**click_request)

    assert isinstance(result, list)
    # Exactly one grounding request was made.
    assert len(fake_grounder.calls) == 1
    # The "computer" tool received the coordinates supplied by the grounder.
    expected_arguments = {"action": "click", "x": 123, "y": 456, "button": "left"}
    assert mcp.calls == [("computer", expected_arguments)]
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@pytest.mark.asyncio
async def test_move_and_scroll_require_element_description_and_screenshot() -> None:
    """Move and scroll are expected to fail when the description or screenshot is absent."""
    mcp = FakeMCPClient()
    fake_grounder = FakeGrounder(coords=(5, 6))
    grounded = GroundedComputerTool(grounder=fake_grounder, mcp_client=mcp)  # type: ignore

    # A move without element_description must be rejected.
    with pytest.raises(Exception) as missing_description:
        await grounded(action="move", screenshot_b64=_png_b64())
    assert "element_description is required" in str(missing_description.value)

    # A scroll without any screenshot must be rejected.
    with pytest.raises(Exception) as missing_screenshot:
        await grounded(action="scroll", element_description="list", scroll_y=100)
    assert "No screenshot available" in str(missing_screenshot.value)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@pytest.mark.asyncio
async def test_drag_grounds_both_points_and_calls_mcp() -> None:
    """A described drag grounds its start and end and sends a two-point path to MCP."""
    mcp = FakeMCPClient()
    fake_grounder = FakeGrounder(coords=(10, 20))
    grounded = GroundedComputerTool(grounder=fake_grounder, mcp_client=mcp)  # type: ignore

    drag_request = {
        "action": "drag",
        "start_element_description": "source",
        "end_element_description": "target",
        "screenshot_b64": _png_b64(),
        "button": "left",
    }
    await grounded(**drag_request)

    # One grounding request each for the start and the end element.
    assert len(fake_grounder.calls) == 2

    # The fake grounder always answers (10, 20), so both path points are equal.
    tool_name, tool_arguments = mcp.calls[0]
    assert tool_name == "computer"
    assert tool_arguments["action"] == "drag"
    assert tool_arguments["button"] == "left"
    assert tool_arguments["path"] == [(10, 20), (10, 20)]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@pytest.mark.asyncio
async def test_drag_requires_both_descriptions_and_screenshot() -> None:
    """Drag is expected to fail without both element descriptions or without a screenshot."""
    mcp = FakeMCPClient()
    fake_grounder = FakeGrounder()
    grounded = GroundedComputerTool(grounder=fake_grounder, mcp_client=mcp)  # type: ignore

    # Only the start description is given.
    with pytest.raises(Exception) as missing_end:
        await grounded(action="drag", start_element_description="a", screenshot_b64=_png_b64())
    assert "start_element_description and end_element_description" in str(missing_end.value)

    # Both descriptions are given, but no screenshot.
    with pytest.raises(Exception) as missing_screenshot:
        await grounded(
            action="drag",
            start_element_description="a",
            end_element_description="b",
        )
    assert "No screenshot available" in str(missing_screenshot.value)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@pytest.mark.asyncio
async def test_direct_actions_bypass_grounding_and_call_mcp() -> None:
    """Actions that carry no element description go to MCP without using the grounder."""
    mcp = FakeMCPClient()
    fake_grounder = FakeGrounder()
    grounded = GroundedComputerTool(grounder=fake_grounder, mcp_client=mcp)  # type: ignore

    # (action name, extra keyword arguments) for every pass-through action.
    passthrough_cases: list[tuple[str, dict[str, Any]]] = [
        ("screenshot", {}),
        ("type", {"text": "hello"}),
        ("keypress", {"keys": ["ctrl", "a"]}),
        ("wait", {}),
        ("get_current_url", {}),
        ("get_dimensions", {}),
        ("get_environment", {}),
    ]
    for action_name, extra_kwargs in passthrough_cases:
        # Reset the log so index 0 is always the call made in this iteration.
        mcp.calls.clear()
        await grounded(action=action_name, **extra_kwargs)
        assert mcp.calls and mcp.calls[0][0] == "computer"
        assert mcp.calls[0][1]["action"] == action_name
        # The grounder must never have been consulted.
        assert fake_grounder.calls == []
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@pytest.mark.asyncio
async def test_unsupported_action_raises() -> None:
    """An action name the tool does not know is expected to raise "Unsupported action"."""
    mcp = FakeMCPClient()
    fake_grounder = FakeGrounder()
    grounded = GroundedComputerTool(grounder=fake_grounder, mcp_client=mcp)  # type: ignore

    with pytest.raises(Exception) as unsupported:
        await grounded(action="zoom")
    assert "Unsupported action" in str(unsupported.value)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
@pytest.mark.asyncio
async def test_grounding_failure_propagates_as_error() -> None:
    """When the grounder returns no coordinates, the click is expected to raise."""
    mcp = FakeMCPClient()
    # coords=None makes the fake grounder report that nothing was found.
    fake_grounder = FakeGrounder(coords=None)
    grounded = GroundedComputerTool(grounder=fake_grounder, mcp_client=mcp)  # type: ignore

    with pytest.raises(Exception) as not_located:
        await grounded(action="click", element_description="x", screenshot_b64=_png_b64())
    assert "Could not locate element" in str(not_located.value)
|
hud/tools/submit.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from mcp.types import ContentBlock, TextContent
|
|
6
|
+
|
|
7
|
+
from .response import ResponseTool
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Global submission storage
|
|
13
|
+
_SUBMISSION: str | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def set_submission(value: str | None) -> None:
    """Replace the module-level stored submission with `value`.

    Passing None stores None, i.e. clears any previous submission.
    """
    global _SUBMISSION
    _SUBMISSION = value
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_submission() -> str | None:
    """Return the currently stored module-level submission (None if unset)."""
    return _SUBMISSION
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SubmitTool(ResponseTool):
    """Lifecycle tool to submit the agent's final answer for evaluation.

    The `response` string is stored as the submission and can be read back
    via `get_submission()`.

    NOTE: `messages` is accepted by `__call__` but is currently ignored; only
    `response` is stored and echoed. (Earlier text here claimed that the last
    text block in `messages` overrides `response`; that logic is not active.)
    """

    name: str = "response"
    title: str = "Submit Tool"
    description: str = "Submit the agent's final response for later evaluation"

    async def __call__(
        self, response: str | None = None, messages: list[ContentBlock] | None = None
    ) -> list[ContentBlock]:
        """Store `response` as the submission and echo it back.

        Args:
            response: The final answer text. Stored as-is, so None (or omitting
                it) resets the stored submission to None.
            messages: Currently unused.

        Returns:
            A list with one text block containing `response`, or an empty list
            when `response` is None or empty.
        """
        set_submission(response)

        # Echo back what we stored
        blocks: list[ContentBlock] = []
        if response:
            blocks.append(TextContent(text=response, type="text"))
        return blocks
|
|
@@ -52,7 +52,7 @@ class TestPlaywrightTool:
|
|
|
52
52
|
assert any(isinstance(b, TextContent) for b in blocks)
|
|
53
53
|
# The actual call includes wait_until parameter with a Field object
|
|
54
54
|
mock_page.goto.assert_called_once()
|
|
55
|
-
args,
|
|
55
|
+
args, _kwargs = mock_page.goto.call_args
|
|
56
56
|
assert args[0] == "https://example.com"
|
|
57
57
|
mock_ensure.assert_called_once()
|
|
58
58
|
|
|
@@ -33,7 +33,7 @@ class TestToolsInit:
|
|
|
33
33
|
"""Test lazy import with invalid attribute name."""
|
|
34
34
|
import hud.tools as tools_module
|
|
35
35
|
|
|
36
|
-
with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidTool'"):
|
|
36
|
+
with pytest.raises(AttributeError, match=r"module '.*' has no attribute 'InvalidTool'"):
|
|
37
37
|
_ = tools_module.InvalidTool
|
|
38
38
|
|
|
39
39
|
def test_direct_imports_available(self):
|
hud/tools/tests/test_utils.py
CHANGED
|
@@ -58,7 +58,7 @@ class TestRun:
|
|
|
58
58
|
mock_proc.communicate = AsyncMock(return_value=(b"processed", b""))
|
|
59
59
|
|
|
60
60
|
with patch("asyncio.create_subprocess_shell", return_value=mock_proc):
|
|
61
|
-
return_code, stdout,
|
|
61
|
+
return_code, stdout, _stderr = await run("cat", input="test input")
|
|
62
62
|
|
|
63
63
|
assert return_code == 0
|
|
64
64
|
assert stdout == "processed"
|
|
@@ -91,7 +91,7 @@ class TestRun:
|
|
|
91
91
|
):
|
|
92
92
|
mock_wait_for.return_value = (b"done", b"")
|
|
93
93
|
|
|
94
|
-
|
|
94
|
+
_return_code, _stdout, _stderr = await run("sleep 1", timeout=5.0)
|
|
95
95
|
|
|
96
96
|
# Check that wait_for was called with the correct timeout
|
|
97
97
|
mock_wait_for.assert_called_once()
|
hud/types.py
CHANGED
|
@@ -15,7 +15,20 @@ class MCPToolCall(CallToolRequestParams):
|
|
|
15
15
|
id: str = Field(default_factory=lambda: str(uuid.uuid4())) # Unique identifier for reference
|
|
16
16
|
|
|
17
17
|
def __str__(self) -> str:
    """Format the tool call as plain text: ``→ name(args)``.

    Arguments are rendered as compact JSON and cut to at most 60 characters
    (57 characters plus "..." when the JSON is longer). If they cannot be
    serialized to JSON, the first 60 characters of ``str(arguments)`` are used.
    """
    rendered = ""
    if self.arguments:
        try:
            compact = json.dumps(self.arguments, separators=(",", ":"))
        except (TypeError, ValueError):
            rendered = str(self.arguments)[:60]
        else:
            rendered = compact if len(compact) <= 60 else compact[:57] + "..."

    return f"→ {self.name}({rendered})"
|
|
29
|
+
|
|
30
|
+
def __rich__(self) -> str:
    """Return the colour-formatted form of this call produced by the HUD design helper."""
    # Imported inside the method rather than at module top level.
    from hud.utils.design import design as hud_design

    return hud_design.format_tool_call(self.name, self.arguments)
|
|
@@ -24,10 +37,8 @@ class MCPToolCall(CallToolRequestParams):
|
|
|
24
37
|
class MCPToolResult(CallToolResult):
|
|
25
38
|
"""A tool result."""
|
|
26
39
|
|
|
27
|
-
def
|
|
28
|
-
"""
|
|
29
|
-
from hud.utils.design import design
|
|
30
|
-
|
|
40
|
+
def _get_content_summary(self) -> str:
|
|
41
|
+
"""Extract a summary of the content."""
|
|
31
42
|
# Extract content summary
|
|
32
43
|
content_summary = ""
|
|
33
44
|
if self.content:
|
|
@@ -49,6 +60,23 @@ class MCPToolResult(CallToolResult):
|
|
|
49
60
|
except (TypeError, ValueError):
|
|
50
61
|
content_summary = str(self.structuredContent)
|
|
51
62
|
|
|
63
|
+
return content_summary
|
|
64
|
+
|
|
65
|
+
def __str__(self) -> str:
    """Format the tool result as plain text for compatibility.

    The content summary is prefixed with "✗" for error results and "✓" otherwise.
    """
    summary = self._get_content_summary()
    marker = "✗" if self.isError else "✓"
    return f"{marker} {summary}"
|
|
74
|
+
|
|
75
|
+
def __rich__(self) -> str:
    """Return the colour-formatted form of this result produced by the HUD design helper."""
    # Imported inside the method rather than at module top level.
    from hud.utils.design import design as hud_design

    summary = self._get_content_summary()
    return hud_design.format_tool_result(summary, self.isError)
|
|
53
81
|
|
|
54
82
|
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Factory functions for creating agents compatible with run_dataset."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from openai import AsyncOpenAI
|
|
8
|
+
|
|
9
|
+
from hud.agents.grounded_openai import GroundedOpenAIChatAgent
|
|
10
|
+
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
11
|
+
from hud.tools.grounding import GrounderConfig
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def create_openai_agent(**kwargs: Any) -> GenericOpenAIChatAgent:
    """Build a GenericOpenAIChatAgent from flat keyword arguments (run_dataset style).

    ``api_key`` and ``base_url`` are taken out of ``kwargs`` to construct the
    ``AsyncOpenAI`` client; everything else is forwarded to the agent unchanged.

    Args:
        api_key: OpenAI API key (optional; defaults to None)
        base_url: Optional custom API endpoint
        model_name: Model to use (e.g., "gpt-4o-mini"); forwarded to the agent
        **kwargs: Additional arguments passed to GenericOpenAIChatAgent

    Returns:
        Configured GenericOpenAIChatAgent instance

    Example:
        >>> from hud.datasets import run_dataset
        >>> from hud.utils.agent_factories import create_openai_agent
        >>> results = await run_dataset(
        ...     "My Eval",
        ...     "hud-evals/SheetBench-50",
        ...     create_openai_agent,
        ...     {"api_key": "your-key", "model_name": "gpt-4o-mini"},
        ... )
    """
    client = AsyncOpenAI(
        api_key=kwargs.pop("api_key", None),
        base_url=kwargs.pop("base_url", None),
    )
    return GenericOpenAIChatAgent(openai_client=client, **kwargs)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent:
    """Build a GroundedOpenAIChatAgent from flat keyword arguments (run_dataset style).

    The planning-client and grounder settings are taken out of ``kwargs``;
    everything else is forwarded to the agent unchanged.

    Args:
        api_key: OpenAI API key for planning model
        base_url: Optional custom API endpoint for planning model
        model_name: Planning model to use (e.g., "gpt-4o-mini"); forwarded to the agent
        grounder_api_key: API key for grounding model
        grounder_api_base: API base URL for grounding model
            (default: "https://openrouter.ai/api/v1")
        grounder_model: Grounding model to use
            (default: "qwen/qwen-2.5-vl-7b-instruct")
        **kwargs: Additional arguments passed to GroundedOpenAIChatAgent

    Returns:
        Configured GroundedOpenAIChatAgent instance

    Example:
        >>> from hud.datasets import run_dataset
        >>> from hud.utils.agent_factories import create_grounded_agent
        >>> results = await run_dataset(
        ...     "Grounded Eval",
        ...     dataset,
        ...     create_grounded_agent,
        ...     {
        ...         "api_key": "openai-key",
        ...         "grounder_api_key": "openrouter-key",
        ...         "model_name": "gpt-4o-mini",
        ...     },
        ... )
    """
    planner_client = AsyncOpenAI(
        api_key=kwargs.pop("api_key", None),
        base_url=kwargs.pop("base_url", None),
    )

    grounder_settings = {
        "api_base": kwargs.pop("grounder_api_base", "https://openrouter.ai/api/v1"),
        "model": kwargs.pop("grounder_model", "qwen/qwen-2.5-vl-7b-instruct"),
        "api_key": kwargs.pop("grounder_api_key", None),
    }
    config = GrounderConfig(**grounder_settings)

    return GroundedOpenAIChatAgent(
        openai_client=planner_client, grounder_config=config, **kwargs
    )
|
hud/utils/design.py
CHANGED
|
@@ -257,6 +257,63 @@ class HUDDesign:
|
|
|
257
257
|
else:
|
|
258
258
|
console.print(f" [cyan]{command}[/cyan]")
|
|
259
259
|
|
|
260
|
+
# Exception rendering utilities
|
|
261
|
+
def render_support_hint(self, stderr: bool = True) -> None:
    """Emit the standard "open a GitHub issue" message through ``self.info``.

    Args:
        stderr: Passed straight through to ``self.info``.
    """
    lead_in = "If this looks like an issue with the sdk, please make a github issue at "
    issues_url = "https://github.com/hud-evals/hud-python/issues"
    self.info(lead_in + issues_url, stderr=stderr)
|
|
268
|
+
|
|
269
|
+
def render_exception(self, error: BaseException, *, stderr: bool = True) -> None:
    """Render an exception consistently using the HUD design system.

    Output, in order:
    - an error line ``"<ExceptionType>: <message>"``; the message is the
      exception's ``message`` attribute, else ``str(error)``, else the type name
    - for ``HudRequestError`` only: a key/value table with status and response
    - structured hints, when the exception carries a non-empty ``hints`` attribute
    - the standard support hint

    Args:
        error: The exception to render.
        stderr: Forwarded to the error line, the details table and the support hint.
    """
    try:
        from hud.shared.exceptions import HudRequestError as request_error_type  # lazy import
    except Exception:
        # An empty tuple makes the isinstance() check below always False, so
        # the request-specific section is simply skipped if the import fails.
        request_error_type = ()  # type: ignore

    # Header line with the exception type
    type_name = type(error).__name__
    text = getattr(error, "message", "") or str(error) or type_name
    self.error(f"{type_name}: {text}", stderr=stderr)

    # Extra details that only request errors carry
    if isinstance(error, request_error_type):  # type: ignore[arg-type]
        rows: dict[str, str] = {}

        code = getattr(error, "status_code", None)
        if code is not None:
            rows["Status"] = str(code)

        body = getattr(error, "response_text", None)
        if body:
            # Cap very long responses at 500 characters plus an ellipsis.
            head = body[:500]
            rows["Response"] = head + ("..." if len(body) > 500 else "")

        payload = getattr(error, "response_json", None)
        # The JSON form is shown only when no text response was recorded.
        if payload and not rows.get("Response"):
            rows["Response JSON"] = str(payload)

        if rows:
            self.key_value_table(rows, show_header=False, stderr=stderr)

    # Structured hints, when the exception provides any
    structured_hints = getattr(error, "hints", None)
    if structured_hints:
        try:
            from hud.shared.hints import render_hints as show_hints  # lazy import

            show_hints(structured_hints, design=self)
        except Exception as failure:
            # A failure to render hints must not hide the error itself.
            self.debug(f"Failed to render hints: {failure}")

    # Always finish with the standard support hint
    self.render_support_hint(stderr=stderr)
|
|
316
|
+
|
|
260
317
|
@property
|
|
261
318
|
def console(self) -> Console:
|
|
262
319
|
"""Get the stderr console for direct access when needed."""
|
hud/utils/mcp.py
CHANGED
|
@@ -76,4 +76,10 @@ def setup_hud_telemetry(
|
|
|
76
76
|
MCPConfigPatch(headers={"Run-Id": run_id}, meta={"run_id": run_id}),
|
|
77
77
|
)
|
|
78
78
|
|
|
79
|
+
if settings.api_key:
|
|
80
|
+
patch_mcp_config(
|
|
81
|
+
mcp_config,
|
|
82
|
+
MCPConfigPatch(headers={"Authorization": f"Bearer {settings.api_key}"}),
|
|
83
|
+
)
|
|
84
|
+
|
|
79
85
|
return auto_trace_cm
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from hud.utils.design import design
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _render_and_fallback(exc_type: type[BaseException], value: BaseException, tb: Any) -> None:
    """Exception hook: print the default traceback, then a HUD-styled summary.

    The standard ``sys.__excepthook__`` always runs first, so the full
    traceback is never lost. The HUD-formatted rendering is added afterwards
    only when ``value`` is a ``HudException``. Any failure while rendering is
    logged as a warning and otherwise ignored.
    """
    # Full traceback first, via the interpreter's original hook.
    sys.__excepthook__(exc_type, value, tb)

    try:
        from hud.shared.exceptions import HudException  # lazy import

        if not isinstance(value, HudException):
            return

        # Flush so the traceback appears before the formatted block.
        sys.stderr.flush()
        # Blank separator line, then the formatted error.
        design.console.print("")
        design.render_exception(value)
    except Exception:
        logger.warning("Failed to render exception: %s, %s, %s", exc_type, value, tb)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _async_exception_handler(loop: asyncio.AbstractEventLoop, context: dict[str, Any]) -> None:
    """Asyncio exception handler: render via HUD design, then run the default handler.

    If the context carries an exception it is rendered with
    ``design.render_exception``; otherwise a non-empty message is printed as an
    error followed by the support hint. Rendering failures are logged as a
    warning. The loop's default handler is always invoked afterwards.
    """
    failure = context.get("exception")
    text = context.get("message")

    try:
        if failure is None:
            if text:
                design.error(text)
                design.render_support_hint()
        else:
            design.render_exception(failure)
    except Exception:
        logger.warning("Failed to render exception: %s, %s, %s", failure, text, context)

    # Delegate to default handler
    loop.default_exception_handler(context)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def install_pretty_errors() -> None:
    """Install global pretty error handlers for sync and async exceptions.

    ``sys.excepthook`` is always replaced. The asyncio exception handler is
    installed only when this function is called from inside a running event
    loop, because a handler can only be attached to a specific loop. When no
    loop is running the async part is skipped; call this function again from
    within the loop (e.g. at the start of the main coroutine) to enable it.
    """
    sys.excepthook = _render_and_fallback

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running. A brand-new loop must NOT be created here just to
        # receive the handler: such a loop is never made current and never
        # closed, so the handler would have no effect and the loop's resources
        # would leak.
        logger.debug("No running loop, could not set exception handler")
        return

    try:
        loop.set_exception_handler(_async_exception_handler)
    except Exception:
        # Best effort: failing to install the handler must not break startup.
        logger.warning("Could not set exception handler on the running loop")
|
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.20
|
|
3
|
+
Version: 0.4.22
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -38,6 +38,7 @@ Requires-Python: <3.14,>=3.11
|
|
|
38
38
|
Requires-Dist: httpx<1,>=0.23.0
|
|
39
39
|
Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
|
|
40
40
|
Requires-Dist: hud-mcp-python-sdk>=3.13.2
|
|
41
|
+
Requires-Dist: hud-mcp-use-python-sdk>=2.3.16
|
|
41
42
|
Requires-Dist: opentelemetry-api>=1.34.1
|
|
42
43
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
|
|
43
44
|
Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
|
|
@@ -56,7 +57,6 @@ Provides-Extra: agent
|
|
|
56
57
|
Requires-Dist: anthropic; extra == 'agent'
|
|
57
58
|
Requires-Dist: datasets>=2.14.0; extra == 'agent'
|
|
58
59
|
Requires-Dist: dotenv>=0.9.9; extra == 'agent'
|
|
59
|
-
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agent'
|
|
60
60
|
Requires-Dist: ipykernel; extra == 'agent'
|
|
61
61
|
Requires-Dist: ipython<9; extra == 'agent'
|
|
62
62
|
Requires-Dist: jupyter-client; extra == 'agent'
|
|
@@ -70,7 +70,6 @@ Provides-Extra: agents
|
|
|
70
70
|
Requires-Dist: anthropic; extra == 'agents'
|
|
71
71
|
Requires-Dist: datasets>=2.14.0; extra == 'agents'
|
|
72
72
|
Requires-Dist: dotenv>=0.9.9; extra == 'agents'
|
|
73
|
-
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agents'
|
|
74
73
|
Requires-Dist: ipykernel; extra == 'agents'
|
|
75
74
|
Requires-Dist: ipython<9; extra == 'agents'
|
|
76
75
|
Requires-Dist: jupyter-client; extra == 'agents'
|
|
@@ -85,7 +84,6 @@ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
|
|
|
85
84
|
Requires-Dist: anthropic; extra == 'dev'
|
|
86
85
|
Requires-Dist: datasets>=2.14.0; extra == 'dev'
|
|
87
86
|
Requires-Dist: dotenv>=0.9.9; extra == 'dev'
|
|
88
|
-
Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'dev'
|
|
89
87
|
Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
|
|
90
88
|
Requires-Dist: ipykernel; extra == 'dev'
|
|
91
89
|
Requires-Dist: ipython<9; extra == 'dev'
|