hud-python 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (54) hide show
  1. hud/__init__.py +7 -0
  2. hud/agents/base.py +42 -10
  3. hud/agents/claude.py +24 -14
  4. hud/agents/grounded_openai.py +280 -0
  5. hud/agents/tests/test_client.py +11 -27
  6. hud/agents/tests/test_grounded_openai_agent.py +155 -0
  7. hud/cli/__init__.py +50 -20
  8. hud/cli/build.py +3 -44
  9. hud/cli/eval.py +25 -6
  10. hud/cli/init.py +4 -4
  11. hud/cli/push.py +3 -1
  12. hud/cli/tests/test_push.py +6 -6
  13. hud/cli/utils/interactive.py +1 -1
  14. hud/clients/__init__.py +3 -2
  15. hud/clients/base.py +20 -9
  16. hud/clients/mcp_use.py +44 -22
  17. hud/datasets/task.py +6 -2
  18. hud/native/__init__.py +6 -0
  19. hud/native/comparator.py +546 -0
  20. hud/native/tests/__init__.py +1 -0
  21. hud/native/tests/test_comparator.py +539 -0
  22. hud/native/tests/test_native_init.py +79 -0
  23. hud/otel/instrumentation.py +0 -2
  24. hud/server/server.py +9 -2
  25. hud/settings.py +6 -0
  26. hud/shared/exceptions.py +204 -31
  27. hud/shared/hints.py +177 -0
  28. hud/shared/requests.py +15 -3
  29. hud/shared/tests/test_exceptions.py +385 -144
  30. hud/tools/__init__.py +2 -0
  31. hud/tools/executors/tests/test_base_executor.py +1 -1
  32. hud/tools/executors/xdo.py +1 -1
  33. hud/tools/grounding/__init__.py +13 -0
  34. hud/tools/grounding/config.py +54 -0
  35. hud/tools/grounding/grounded_tool.py +314 -0
  36. hud/tools/grounding/grounder.py +301 -0
  37. hud/tools/grounding/tests/__init__.py +1 -0
  38. hud/tools/grounding/tests/test_grounded_tool.py +196 -0
  39. hud/tools/submit.py +66 -0
  40. hud/tools/tests/test_playwright_tool.py +1 -1
  41. hud/tools/tests/test_tools_init.py +1 -1
  42. hud/tools/tests/test_utils.py +2 -2
  43. hud/types.py +33 -5
  44. hud/utils/agent_factories.py +86 -0
  45. hud/utils/design.py +57 -0
  46. hud/utils/mcp.py +6 -0
  47. hud/utils/pretty_errors.py +68 -0
  48. hud/utils/tests/test_version.py +1 -1
  49. hud/version.py +1 -1
  50. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/METADATA +2 -4
  51. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/RECORD +54 -37
  52. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/WHEEL +0 -0
  53. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/entry_points.txt +0 -0
  54. {hud_python-0.4.20.dist-info → hud_python-0.4.22.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,196 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+
6
+ import mcp.types as types
7
+ import pytest
8
+
9
+ from hud.tools.grounding.grounded_tool import GroundedComputerTool
10
+ from hud.types import MCPToolCall, MCPToolResult
11
+
12
+
13
@dataclass
class FakeResult:
    """Plain result container with the same three fields as an MCP tool result.

    Only ``content`` is required; the defaults describe a non-error result
    without any structured payload.
    """

    # Content blocks carried by the result.
    content: list[types.ContentBlock]
    # Error flag; defaults to a successful result.
    isError: bool = False
    # Optional structured payload; absent by default.
    structuredContent: dict | None = None
18
+
19
+
20
class FakeMCPClient:
    """In-memory test double implementing the AgentMCPClient protocol.

    Each ``call_tool`` invocation is recorded in ``calls`` as a
    ``(tool_name, arguments)`` tuple and answered with a fixed, successful
    text result, so tests can assert on exactly what was requested.
    """

    _initialized: bool

    def __init__(self) -> None:
        # Start disconnected with an empty call log.
        self._initialized = False
        self.calls: list[tuple[str, dict[str, Any]]] = []

    @property
    def mcp_config(self) -> dict[str, dict[str, Any]]:
        """Return a static single-server configuration."""
        server_entry = {"command": "echo", "args": ["test"]}
        return {"test": server_entry}

    @property
    def is_connected(self) -> bool:
        """Report whether ``initialize`` has run without a later ``shutdown``."""
        return self._initialized

    async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
        """Mark the client as connected; the supplied config is not used."""
        self._initialized = True

    async def list_tools(self) -> list[types.Tool]:
        """Advertise a single tool named ``computer``."""
        computer_tool = types.Tool(name="computer", description="Test tool", inputSchema={})
        return [computer_tool]

    async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
        """Log the call and reply with a successful ``"ok"`` text block."""
        # Missing/empty arguments are normalised to an empty dict in the log.
        record = (tool_call.name, tool_call.arguments or {})
        self.calls.append(record)
        reply = types.TextContent(text="ok", type="text")
        return MCPToolResult(content=[reply], isError=False)

    async def shutdown(self) -> None:
        """Mark the client as disconnected."""
        self._initialized = False
49
+
50
+
51
class FakeGrounder:
    """Test double for the Grounder interface that returns preset coordinates.

    ``predict_click`` never inspects the image; it records the request and
    hands back whatever ``coords`` was configured (``None`` simulates a
    grounding failure).
    """

    def __init__(self, coords: tuple[int, int] | None = (10, 20)) -> None:
        self.calls: list[tuple[str, str]] = []
        self.coords = coords

    async def predict_click(
        self, *, image_b64: str, instruction: str, max_retries: int = 3
    ) -> tuple[int, int] | None:
        """Record the request and return the configured coordinates."""
        # Only the first ten characters of the image are kept in the log.
        image_prefix = image_b64[:10]
        self.calls.append((image_prefix, instruction))
        return self.coords
63
+
64
+
65
def _png_b64() -> str:
    """Return the base64 text of a tiny PNG used as a stand-in screenshot.

    The original comment describes it as a valid, minimal 1x1 transparent PNG.
    """
    chunks = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGMAAQAABQAB",
        "J2n0mQAAAABJRU5ErkJggg==",
    )
    return "".join(chunks)
71
+
72
+
73
@pytest.mark.asyncio
async def test_click_action_grounds_and_calls_mcp() -> None:
    """A described click is grounded once and forwarded with resolved x/y."""
    fake_client = FakeMCPClient()
    fake_grounder = FakeGrounder(coords=(123, 456))
    grounded_tool = GroundedComputerTool(  # type: ignore
        grounder=fake_grounder, mcp_client=fake_client
    )

    click_request = {
        "action": "click",
        "element_description": "red button",
        "screenshot_b64": _png_b64(),
        "button": "left",
    }
    blocks = await grounded_tool(**click_request)

    assert isinstance(blocks, list)
    # Exactly one grounding request was made.
    assert len(fake_grounder.calls) == 1
    # The MCP call carries the coordinates produced by the grounder.
    expected_call = ("computer", {"action": "click", "x": 123, "y": 456, "button": "left"})
    assert fake_client.calls == [expected_call]
91
+
92
+
93
@pytest.mark.asyncio
async def test_move_and_scroll_require_element_description_and_screenshot() -> None:
    """Move without a description and scroll without a screenshot both fail."""
    fake_client = FakeMCPClient()
    fake_grounder = FakeGrounder(coords=(5, 6))
    grounded_tool = GroundedComputerTool(  # type: ignore
        grounder=fake_grounder, mcp_client=fake_client
    )

    # A move needs an element description to ground against.
    with pytest.raises(Exception) as no_description:
        await grounded_tool(action="move", screenshot_b64=_png_b64())
    assert "element_description is required" in str(no_description.value)

    # A scroll with a description still needs a screenshot.
    with pytest.raises(Exception) as no_screenshot:
        await grounded_tool(action="scroll", element_description="list", scroll_y=100)
    assert "No screenshot available" in str(no_screenshot.value)
108
+
109
+
110
@pytest.mark.asyncio
async def test_drag_grounds_both_points_and_calls_mcp() -> None:
    """A drag grounds its start and end descriptions and sends a two-point path."""
    fake_client = FakeMCPClient()
    fake_grounder = FakeGrounder(coords=(10, 20))
    grounded_tool = GroundedComputerTool(  # type: ignore
        grounder=fake_grounder, mcp_client=fake_client
    )

    drag_request = {
        "action": "drag",
        "start_element_description": "source",
        "end_element_description": "target",
        "screenshot_b64": _png_b64(),
        "button": "left",
    }
    await grounded_tool(**drag_request)

    # One grounding request per endpoint.
    assert len(fake_grounder.calls) == 2
    # The fake grounder returns the same point twice, so both path entries match.
    tool_name, sent_args = fake_client.calls[0]
    assert tool_name == "computer"
    assert sent_args["action"] == "drag"
    assert sent_args["button"] == "left"
    assert sent_args["path"] == [(10, 20), (10, 20)]
132
+
133
+
134
@pytest.mark.asyncio
async def test_drag_requires_both_descriptions_and_screenshot() -> None:
    """A drag fails when the end description or the screenshot is missing."""
    fake_client = FakeMCPClient()
    fake_grounder = FakeGrounder()
    grounded_tool = GroundedComputerTool(  # type: ignore
        grounder=fake_grounder, mcp_client=fake_client
    )

    # Only the start description is supplied.
    with pytest.raises(Exception) as missing_end:
        await grounded_tool(
            action="drag", start_element_description="a", screenshot_b64=_png_b64()
        )
    assert "start_element_description and end_element_description" in str(missing_end.value)

    # Both descriptions are supplied, but there is no screenshot.
    descriptions_only = {
        "action": "drag",
        "start_element_description": "a",
        "end_element_description": "b",
    }
    with pytest.raises(Exception) as missing_screenshot:
        await grounded_tool(**descriptions_only)
    assert "No screenshot available" in str(missing_screenshot.value)
151
+
152
+
153
@pytest.mark.asyncio
async def test_direct_actions_bypass_grounding_and_call_mcp() -> None:
    """Actions without an element description go straight to MCP, ungrounded."""
    fake_client = FakeMCPClient()
    fake_grounder = FakeGrounder()
    grounded_tool = GroundedComputerTool(  # type: ignore
        grounder=fake_grounder, mcp_client=fake_client
    )

    # (action, extra keyword arguments) pairs that should skip the grounder.
    direct_cases = [
        ("screenshot", {}),
        ("type", {"text": "hello"}),
        ("keypress", {"keys": ["ctrl", "a"]}),
        ("wait", {}),
        ("get_current_url", {}),
        ("get_dimensions", {}),
        ("get_environment", {}),
    ]
    for action, extra in direct_cases:
        # Reset the log so index 0 always refers to the call under test.
        fake_client.calls.clear()
        _ = await grounded_tool(action=action, **extra)
        assert fake_client.calls and fake_client.calls[0][0] == "computer"
        assert fake_client.calls[0][1]["action"] == action
        # The grounder must never have been consulted.
        assert fake_grounder.calls == []
175
+
176
+
177
@pytest.mark.asyncio
async def test_unsupported_action_raises() -> None:
    """An unknown action name is rejected with an "Unsupported action" error."""
    fake_client = FakeMCPClient()
    fake_grounder = FakeGrounder()
    grounded_tool = GroundedComputerTool(  # type: ignore
        grounder=fake_grounder, mcp_client=fake_client
    )

    with pytest.raises(Exception) as rejected:
        await grounded_tool(action="zoom")
    assert "Unsupported action" in str(rejected.value)
186
+
187
+
188
@pytest.mark.asyncio
async def test_grounding_failure_propagates_as_error() -> None:
    """When the grounder returns no coordinates, the tool call raises."""
    fake_client = FakeMCPClient()
    # coords=None makes the fake grounder report that nothing was found.
    failing_grounder = FakeGrounder(coords=None)
    grounded_tool = GroundedComputerTool(  # type: ignore
        grounder=failing_grounder, mcp_client=fake_client
    )

    with pytest.raises(Exception) as not_found:
        await grounded_tool(
            action="click", element_description="x", screenshot_b64=_png_b64()
        )
    assert "Could not locate element" in str(not_found.value)
hud/tools/submit.py ADDED
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ from mcp.types import ContentBlock, TextContent
6
+
7
+ from .response import ResponseTool
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ # Global submission storage
13
+ _SUBMISSION: str | None = None
14
+
15
+
16
def set_submission(value: str | None) -> None:
    """Replace the module-level submission with ``value``.

    Passing ``None`` clears any previously stored submission.
    """
    # Rebind the module global rather than creating a local of the same name.
    global _SUBMISSION
    _SUBMISSION = value
19
+
20
+
21
def get_submission() -> str | None:
    """Return the currently stored submission, or ``None`` if nothing is stored."""
    current = _SUBMISSION
    return current
23
+
24
+
25
class SubmitTool(ResponseTool):
    """Lifecycle tool to submit the agent's final answer for evaluation.

    The ``response`` string is stored as-is in module-level state and can be
    read back with ``get_submission()``.

    NOTE: ``messages`` is accepted but currently ignored. An earlier draft let
    the last text block in ``messages`` override ``response``; that logic was
    disabled, so only ``response`` is ever stored.
    """

    name: str = "response"
    title: str = "Submit Tool"
    description: str = "Submit the agent's final response for later evaluation"

    async def __call__(
        self, response: str | None = None, messages: list[ContentBlock] | None = None
    ) -> list[ContentBlock]:
        """Store ``response`` as the submission and echo it back.

        Args:
            response: Final answer text. ``None`` clears the stored submission.
            messages: Unused; kept in the signature (presumably to match the
                ``ResponseTool`` call signature - confirm against the base class).

        Returns:
            A single text block containing ``response``, or an empty list when
            ``response`` is ``None`` or empty.
        """
        set_submission(response)

        # Echo back what was stored; nothing to echo for None / empty string.
        if not response:
            return []
        return [TextContent(text=response, type="text")]
@@ -52,7 +52,7 @@ class TestPlaywrightTool:
52
52
  assert any(isinstance(b, TextContent) for b in blocks)
53
53
  # The actual call includes wait_until parameter with a Field object
54
54
  mock_page.goto.assert_called_once()
55
- args, kwargs = mock_page.goto.call_args
55
+ args, _kwargs = mock_page.goto.call_args
56
56
  assert args[0] == "https://example.com"
57
57
  mock_ensure.assert_called_once()
58
58
 
@@ -33,7 +33,7 @@ class TestToolsInit:
33
33
  """Test lazy import with invalid attribute name."""
34
34
  import hud.tools as tools_module
35
35
 
36
- with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidTool'"):
36
+ with pytest.raises(AttributeError, match=r"module '.*' has no attribute 'InvalidTool'"):
37
37
  _ = tools_module.InvalidTool
38
38
 
39
39
  def test_direct_imports_available(self):
@@ -58,7 +58,7 @@ class TestRun:
58
58
  mock_proc.communicate = AsyncMock(return_value=(b"processed", b""))
59
59
 
60
60
  with patch("asyncio.create_subprocess_shell", return_value=mock_proc):
61
- return_code, stdout, stderr = await run("cat", input="test input")
61
+ return_code, stdout, _stderr = await run("cat", input="test input")
62
62
 
63
63
  assert return_code == 0
64
64
  assert stdout == "processed"
@@ -91,7 +91,7 @@ class TestRun:
91
91
  ):
92
92
  mock_wait_for.return_value = (b"done", b"")
93
93
 
94
- return_code, stdout, stderr = await run("sleep 1", timeout=5.0)
94
+ _return_code, _stdout, _stderr = await run("sleep 1", timeout=5.0)
95
95
 
96
96
  # Check that wait_for was called with the correct timeout
97
97
  mock_wait_for.assert_called_once()
hud/types.py CHANGED
@@ -15,7 +15,20 @@ class MCPToolCall(CallToolRequestParams):
15
15
  id: str = Field(default_factory=lambda: str(uuid.uuid4())) # Unique identifier for reference
16
16
 
17
17
  def __str__(self) -> str:
18
- """Format tool call with Rich markup for HUD design."""
18
+ """Format tool call as plain text."""
19
+ args_str = ""
20
+ if self.arguments:
21
+ try:
22
+ args_str = json.dumps(self.arguments, separators=(",", ":"))
23
+ if len(args_str) > 60:
24
+ args_str = args_str[:57] + "..."
25
+ except (TypeError, ValueError):
26
+ args_str = str(self.arguments)[:60]
27
+
28
+ return f"→ {self.name}({args_str})"
29
+
30
def __rich__(self) -> str:
    """Return the colour-formatted form of this tool call.

    Formatting is delegated to the HUD design helper, which receives the
    tool name and its arguments.
    """
    # Imported lazily, as in the original method.
    from hud.utils.design import design

    formatted = design.format_tool_call(self.name, self.arguments)
    return formatted
@@ -24,10 +37,8 @@ class MCPToolCall(CallToolRequestParams):
24
37
  class MCPToolResult(CallToolResult):
25
38
  """A tool result."""
26
39
 
27
- def __str__(self) -> str:
28
- """Format tool result with Rich markup for HUD design - compact version."""
29
- from hud.utils.design import design
30
-
40
+ def _get_content_summary(self) -> str:
41
+ """Extract a summary of the content."""
31
42
  # Extract content summary
32
43
  content_summary = ""
33
44
  if self.content:
@@ -49,6 +60,23 @@ class MCPToolResult(CallToolResult):
49
60
  except (TypeError, ValueError):
50
61
  content_summary = str(self.structuredContent)
51
62
 
63
+ return content_summary
64
+
65
def __str__(self) -> str:
    """Return a plain-text line: a status symbol followed by the content summary.

    Error results are prefixed with a cross mark, successful ones with a
    check mark.
    """
    summary = self._get_content_summary()
    symbol = "✗" if self.isError else "✓"
    return f"{symbol} {summary}"
74
+
75
def __rich__(self) -> str:
    """Return the colour-formatted form of this tool result.

    The content summary and the error flag are handed to the HUD design
    helper, which produces the final text.
    """
    # Imported lazily, as in the original method.
    from hud.utils.design import design

    summary = self._get_content_summary()
    return design.format_tool_result(summary, self.isError)
53
81
 
54
82
 
@@ -0,0 +1,86 @@
1
+ """Factory functions for creating agents compatible with run_dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from openai import AsyncOpenAI
8
+
9
+ from hud.agents.grounded_openai import GroundedOpenAIChatAgent
10
+ from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
11
+ from hud.tools.grounding import GrounderConfig
12
+
13
+
14
def create_openai_agent(**kwargs: Any) -> GenericOpenAIChatAgent:
    """Build a GenericOpenAIChatAgent from flat keyword arguments.

    Intended as an agent factory for ``run_dataset``: the client-related keys
    are removed from ``kwargs`` and used to construct the ``AsyncOpenAI``
    client; everything left over is forwarded to the agent unchanged.

    Args:
        api_key: OpenAI API key (optional, defaults to ``None``).
        base_url: Custom API endpoint (optional, defaults to ``None``).
        **kwargs: Remaining arguments, e.g. ``model_name``, passed straight
            to ``GenericOpenAIChatAgent``.

    Returns:
        The configured ``GenericOpenAIChatAgent``.

    Example:
        >>> from hud.datasets import run_dataset
        >>> from hud.utils.agent_factories import create_openai_agent
        >>> results = await run_dataset(
        ...     "My Eval",
        ...     "hud-evals/SheetBench-50",
        ...     create_openai_agent,
        ...     {"api_key": "your-key", "model_name": "gpt-4o-mini"},
        ... )
    """
    # Pull the client settings out so they are not forwarded to the agent.
    client_settings = {key: kwargs.pop(key, None) for key in ("api_key", "base_url")}
    client = AsyncOpenAI(**client_settings)
    return GenericOpenAIChatAgent(openai_client=client, **kwargs)
42
+
43
+
44
def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent:
    """Build a GroundedOpenAIChatAgent from flat keyword arguments.

    Intended as an agent factory for ``run_dataset``. Planner-client and
    grounder keys are removed from ``kwargs`` and turned into an
    ``AsyncOpenAI`` client and a ``GrounderConfig``; everything left over is
    forwarded to the agent unchanged.

    Args:
        api_key: API key for the planning model (defaults to ``None``).
        base_url: Custom endpoint for the planning model (defaults to ``None``).
        grounder_api_key: API key for the grounding model (defaults to ``None``).
        grounder_api_base: Base URL for the grounding model
            (defaults to ``https://openrouter.ai/api/v1``).
        grounder_model: Grounding model name
            (defaults to ``qwen/qwen-2.5-vl-7b-instruct``).
        **kwargs: Remaining arguments, e.g. ``model_name``, passed straight
            to ``GroundedOpenAIChatAgent``.

    Returns:
        The configured ``GroundedOpenAIChatAgent``.

    Example:
        >>> from hud.datasets import run_dataset
        >>> from hud.utils.agent_factories import create_grounded_agent
        >>> results = await run_dataset(
        ...     "Grounded Eval",
        ...     dataset,
        ...     create_grounded_agent,
        ...     {
        ...         "api_key": "openai-key",
        ...         "grounder_api_key": "openrouter-key",
        ...         "model_name": "gpt-4o-mini",
        ...     },
        ... )
    """
    # Planner client settings: both default to None when not supplied.
    planner_settings = {key: kwargs.pop(key, None) for key in ("api_key", "base_url")}

    # Grounder settings, each with its own default.
    grounder_key = kwargs.pop("grounder_api_key", None)
    grounder_base = kwargs.pop("grounder_api_base", "https://openrouter.ai/api/v1")
    grounder_name = kwargs.pop("grounder_model", "qwen/qwen-2.5-vl-7b-instruct")

    planner_client = AsyncOpenAI(**planner_settings)
    config = GrounderConfig(api_base=grounder_base, model=grounder_name, api_key=grounder_key)

    return GroundedOpenAIChatAgent(
        openai_client=planner_client, grounder_config=config, **kwargs
    )
hud/utils/design.py CHANGED
@@ -257,6 +257,63 @@ class HUDDesign:
257
257
  else:
258
258
  console.print(f" [cyan]{command}[/cyan]")
259
259
 
260
+ # Exception rendering utilities
261
def render_support_hint(self, stderr: bool = True) -> None:
    """Print the standard "open a GitHub issue" message via ``self.info``.

    Args:
        stderr: Forwarded unchanged to ``self.info``.
    """
    issue_tracker = "https://github.com/hud-evals/hud-python/issues"
    lead_in = "If this looks like an issue with the sdk, please make a github issue at "
    self.info(lead_in + issue_tracker, stderr=stderr)
268
+
269
def render_exception(self, error: BaseException, *, stderr: bool = True) -> None:
    """Render an exception through the HUD design helpers.

    Output, in order:
      1. ``"<ExceptionType>: <message>"`` via ``self.error``.
      2. For ``HudRequestError`` instances, a key/value table with the status
         code and the response text (cut to 500 characters), or the response
         JSON when no response text is shown.
      3. Any structured ``hints`` attached to the exception.
      4. The standard support hint.

    Args:
        error: The exception to display.
        stderr: Forwarded to the output helpers that accept it.
    """
    try:
        from hud.shared.exceptions import HudRequestError  # lazy import
    except Exception:
        # An empty tuple makes the isinstance() check below always False.
        HudRequestError = ()  # type: ignore

    # Headline: prefer a `message` attribute, then str(error), then the type name.
    type_name = type(error).__name__
    headline = getattr(error, "message", "") or str(error) or type_name
    self.error(f"{type_name}: {headline}", stderr=stderr)

    # Extra details that only request errors carry.
    if isinstance(error, HudRequestError):  # type: ignore[arg-type]
        rows: dict[str, str] = {}

        code = getattr(error, "status_code", None)
        if code is not None:
            rows["Status"] = str(code)

        body = getattr(error, "response_text", None)
        if body:
            # Keep very long responses readable.
            head = body[:500]
            rows["Response"] = head + ("..." if len(body) > 500 else "")

        payload = getattr(error, "response_json", None)
        if payload and not rows.get("Response"):
            rows["Response JSON"] = str(payload)

        if rows:
            self.key_value_table(rows, show_header=False, stderr=stderr)

    # Structured hints are optional; a rendering failure is only logged at debug level.
    attached_hints = getattr(error, "hints", None)
    if attached_hints:
        try:
            from hud.shared.hints import render_hints  # lazy import

            render_hints(attached_hints, design=self)
        except Exception as render_error:
            self.debug(f"Failed to render hints: {render_error}")

    self.render_support_hint(stderr=stderr)
316
+
260
317
  @property
261
318
  def console(self) -> Console:
262
319
  """Get the stderr console for direct access when needed."""
hud/utils/mcp.py CHANGED
@@ -76,4 +76,10 @@ def setup_hud_telemetry(
76
76
  MCPConfigPatch(headers={"Run-Id": run_id}, meta={"run_id": run_id}),
77
77
  )
78
78
 
79
+ if settings.api_key:
80
+ patch_mcp_config(
81
+ mcp_config,
82
+ MCPConfigPatch(headers={"Authorization": f"Bearer {settings.api_key}"}),
83
+ )
84
+
79
85
  return auto_trace_cm
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import sys
6
+ from typing import Any
7
+
8
+ from hud.utils.design import design
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def _render_and_fallback(exc_type: type[BaseException], value: BaseException, tb: Any) -> None:
    """Excepthook: print the default traceback, then a HUD-styled summary.

    ``sys.__excepthook__`` always runs first, so the full traceback is never
    lost. The extra formatted rendering is produced only when ``value`` is a
    ``HudException``. Any failure while importing or rendering is logged as a
    warning and not re-raised.
    """
    # The standard traceback comes first.
    sys.__excepthook__(exc_type, value, tb)

    try:
        from hud.shared.exceptions import HudException  # lazy import

        if not isinstance(value, HudException):
            return

        # Flush so the traceback appears before the formatted block.
        sys.stderr.flush()
        # Blank separator line, then the formatted error.
        design.console.print("")
        design.render_exception(value)
    except Exception:
        # Rendering is best-effort: record the failure and carry on.
        logger.warning("Failed to render exception: %s, %s, %s", exc_type, value, tb)
35
+
36
+
37
def _async_exception_handler(loop: asyncio.AbstractEventLoop, context: dict[str, Any]) -> None:
    """asyncio exception handler: HUD-styled output, then the loop's default handler.

    If the context carries an exception it is rendered with
    ``design.render_exception``; otherwise a bare message, when present, is
    printed as an error followed by the support hint. Rendering failures are
    logged as warnings. The loop's default handler is always called afterwards.
    """
    exc = context.get("exception")
    msg = context.get("message")

    try:
        if exc is None:
            if msg:
                design.error(msg)
                design.render_support_hint()
        else:
            design.render_exception(exc)
    except Exception:
        logger.warning("Failed to render exception: %s, %s, %s", exc, msg, context)

    # Keep asyncio's standard reporting.
    loop.default_exception_handler(context)
51
+
52
+
53
def install_pretty_errors() -> None:
    """Install global pretty error handlers for sync and async exceptions.

    ``sys.excepthook`` is always replaced. The asyncio exception handler is
    attached only to the event loop that is running when this function is
    called; if no loop is running, the async handler is not installed.
    """
    sys.excepthook = _render_and_fallback

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running. Do not create one just to hold the handler:
        # asyncio.new_event_loop() does not make the loop current, so it would
        # never run (the handler would never fire) and, being unclosed, it
        # would leak its resources until garbage collected.
        logger.debug("No running event loop; async exception handler not installed")
        return

    try:
        loop.set_exception_handler(_async_exception_handler)
    except Exception:
        logger.warning("Could not set exception handler on the running event loop")
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.4.20"
8
+ assert hud.__version__ == "0.4.22"
hud/version.py CHANGED
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.4.20"
7
+ __version__ = "0.4.22"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.20
3
+ Version: 0.4.22
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -38,6 +38,7 @@ Requires-Python: <3.14,>=3.11
38
38
  Requires-Dist: httpx<1,>=0.23.0
39
39
  Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
40
40
  Requires-Dist: hud-mcp-python-sdk>=3.13.2
41
+ Requires-Dist: hud-mcp-use-python-sdk>=2.3.16
41
42
  Requires-Dist: opentelemetry-api>=1.34.1
42
43
  Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
43
44
  Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
@@ -56,7 +57,6 @@ Provides-Extra: agent
56
57
  Requires-Dist: anthropic; extra == 'agent'
57
58
  Requires-Dist: datasets>=2.14.0; extra == 'agent'
58
59
  Requires-Dist: dotenv>=0.9.9; extra == 'agent'
59
- Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agent'
60
60
  Requires-Dist: ipykernel; extra == 'agent'
61
61
  Requires-Dist: ipython<9; extra == 'agent'
62
62
  Requires-Dist: jupyter-client; extra == 'agent'
@@ -70,7 +70,6 @@ Provides-Extra: agents
70
70
  Requires-Dist: anthropic; extra == 'agents'
71
71
  Requires-Dist: datasets>=2.14.0; extra == 'agents'
72
72
  Requires-Dist: dotenv>=0.9.9; extra == 'agents'
73
- Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agents'
74
73
  Requires-Dist: ipykernel; extra == 'agents'
75
74
  Requires-Dist: ipython<9; extra == 'agents'
76
75
  Requires-Dist: jupyter-client; extra == 'agents'
@@ -85,7 +84,6 @@ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
85
84
  Requires-Dist: anthropic; extra == 'dev'
86
85
  Requires-Dist: datasets>=2.14.0; extra == 'dev'
87
86
  Requires-Dist: dotenv>=0.9.9; extra == 'dev'
88
- Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'dev'
89
87
  Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
90
88
  Requires-Dist: ipykernel; extra == 'dev'
91
89
  Requires-Dist: ipython<9; extra == 'dev'