hud-python 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python has been flagged as potentially problematic; review the release details below before upgrading.

Files changed (63):
  1. hud/__main__.py +8 -0
  2. hud/agents/base.py +7 -8
  3. hud/agents/langchain.py +2 -2
  4. hud/agents/tests/test_openai.py +3 -1
  5. hud/cli/__init__.py +114 -52
  6. hud/cli/build.py +121 -71
  7. hud/cli/debug.py +2 -2
  8. hud/cli/{mcp_server.py → dev.py} +101 -38
  9. hud/cli/eval.py +175 -90
  10. hud/cli/init.py +442 -64
  11. hud/cli/list_func.py +72 -71
  12. hud/cli/pull.py +1 -2
  13. hud/cli/push.py +35 -23
  14. hud/cli/remove.py +35 -41
  15. hud/cli/tests/test_analyze.py +2 -1
  16. hud/cli/tests/test_analyze_metadata.py +42 -49
  17. hud/cli/tests/test_build.py +28 -52
  18. hud/cli/tests/test_cursor.py +1 -1
  19. hud/cli/tests/test_debug.py +1 -1
  20. hud/cli/tests/test_list_func.py +75 -64
  21. hud/cli/tests/test_main_module.py +30 -0
  22. hud/cli/tests/test_mcp_server.py +3 -3
  23. hud/cli/tests/test_pull.py +30 -61
  24. hud/cli/tests/test_push.py +70 -89
  25. hud/cli/tests/test_registry.py +36 -38
  26. hud/cli/tests/test_utils.py +1 -1
  27. hud/cli/utils/__init__.py +1 -0
  28. hud/cli/{docker_utils.py → utils/docker.py} +36 -0
  29. hud/cli/{env_utils.py → utils/environment.py} +7 -7
  30. hud/cli/{interactive.py → utils/interactive.py} +91 -19
  31. hud/cli/{analyze_metadata.py → utils/metadata.py} +12 -8
  32. hud/cli/{registry.py → utils/registry.py} +28 -30
  33. hud/cli/{remote_runner.py → utils/remote_runner.py} +1 -1
  34. hud/cli/utils/runner.py +134 -0
  35. hud/cli/utils/server.py +250 -0
  36. hud/clients/base.py +1 -1
  37. hud/clients/fastmcp.py +5 -13
  38. hud/clients/mcp_use.py +6 -10
  39. hud/server/server.py +35 -5
  40. hud/shared/exceptions.py +11 -0
  41. hud/shared/tests/test_exceptions.py +22 -0
  42. hud/telemetry/tests/__init__.py +0 -0
  43. hud/telemetry/tests/test_replay.py +40 -0
  44. hud/telemetry/tests/test_trace.py +63 -0
  45. hud/tools/base.py +20 -3
  46. hud/tools/computer/hud.py +15 -6
  47. hud/tools/executors/tests/test_base_executor.py +27 -0
  48. hud/tools/response.py +12 -8
  49. hud/tools/tests/test_response.py +60 -0
  50. hud/tools/tests/test_tools_init.py +49 -0
  51. hud/utils/design.py +19 -8
  52. hud/utils/mcp.py +17 -5
  53. hud/utils/tests/test_mcp.py +112 -0
  54. hud/utils/tests/test_version.py +1 -1
  55. hud/version.py +1 -1
  56. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/METADATA +16 -13
  57. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/RECORD +62 -52
  58. hud/cli/runner.py +0 -160
  59. /hud/cli/{cursor.py → utils/cursor.py} +0 -0
  60. /hud/cli/{utils.py → utils/logging.py} +0 -0
  61. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/WHEEL +0 -0
  62. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/entry_points.txt +0 -0
  63. {hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/licenses/LICENSE +0 -0
hud/tools/computer/hud.py CHANGED
@@ -58,20 +58,29 @@ class HudComputerTool(BaseTool):
58
58
  title: Human-readable display name for the tool (auto-generated from class name)
59
59
  description: Tool description (auto-generated from docstring if not provided)
60
60
  """
61
+ # This is the width and height the agent thinks it operates in
62
+ # By default, use subclass's width and height
63
+ # If specifically set to None, use environment width and height
64
+ self.width = width or computer_settings.DISPLAY_WIDTH
65
+ self.height = height or computer_settings.DISPLAY_HEIGHT
66
+
67
+ # Build metadata with resolution info
68
+ meta = {
69
+ "resolution": {
70
+ "width": self.width,
71
+ "height": self.height,
72
+ }
73
+ }
74
+
61
75
  # Initialize base tool with executor as env
62
76
  super().__init__(
63
77
  env=executor,
64
78
  name=name or "computer",
65
79
  title=title or "Computer Control",
66
80
  description=description or "Control computer with mouse, keyboard, and screenshots",
81
+ meta=meta,
67
82
  )
68
83
 
69
- # This is the width and height the agent thinks it operates in
70
- # By default, use subclass's width and height
71
- # If specifically set to None, use environment width and height
72
- self.width = width or computer_settings.DISPLAY_WIDTH
73
- self.height = height or computer_settings.DISPLAY_HEIGHT
74
-
75
84
  # This is the static width and height of the environment screen
76
85
  # And the width and height of the screenshots taken by the tool
77
86
  self.environment_width = computer_settings.DISPLAY_WIDTH
@@ -336,3 +336,30 @@ class TestBaseExecutor:
336
336
 
337
337
  assert result1.base64_image == screenshot1
338
338
  assert result2.base64_image == screenshot1
339
+
340
+
341
+ class TestLazyImports:
342
+ """Tests for lazy import functionality in executors module."""
343
+
344
+ def test_lazy_import_pyautogui_executor(self):
345
+ """Test lazy import of PyAutoGUIExecutor."""
346
+ # This should trigger the __getattr__ function and import PyAutoGUIExecutor
347
+ from hud.tools.executors import PyAutoGUIExecutor
348
+
349
+ # Verify it's imported correctly
350
+ assert PyAutoGUIExecutor.__name__ == "PyAutoGUIExecutor"
351
+
352
+ def test_lazy_import_xdo_executor(self):
353
+ """Test lazy import of XDOExecutor."""
354
+ # This should trigger the __getattr__ function and import XDOExecutor
355
+ from hud.tools.executors import XDOExecutor
356
+
357
+ # Verify it's imported correctly
358
+ assert XDOExecutor.__name__ == "XDOExecutor"
359
+
360
+ def test_lazy_import_invalid_attribute(self):
361
+ """Test lazy import with invalid attribute name."""
362
+ import hud.tools.executors as executors_module
363
+
364
+ with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidExecutor'"):
365
+ _ = executors_module.InvalidExecutor
hud/tools/response.py CHANGED
@@ -12,7 +12,7 @@ if TYPE_CHECKING:
12
12
  class ResponseTool(BaseTool):
13
13
  """
14
14
  Protocol for handling responses within environments.
15
-
15
+
16
16
  This abstract tool defines the interface for response handling in environments.
17
17
  Subclasses should implement the __call__ method to handle responses according
18
18
  to their specific needs.
@@ -36,18 +36,22 @@ class ResponseTool(BaseTool):
36
36
  return blocks
37
37
  """
38
38
 
39
- def __init__(self, name: str, title: str, description: str):
39
+ name: str = "response"
40
+ title: str = "Response Tool"
41
+ description: str = "Send a text response or list of messages to the environment"
42
+
43
+ def __init__(
44
+ self, name: str | None = None, title: str | None = None, description: str | None = None
45
+ ) -> None:
40
46
  super().__init__(
41
- name=name or "response",
42
- title=title or "Response Tool",
43
- description=description or "Send a text response or list of messages to the environment",
47
+ name=name or self.name,
48
+ title=title or self.title,
49
+ description=description or self.description,
44
50
  )
45
51
 
46
52
  @abstractmethod
47
53
  async def __call__(
48
- self,
49
- response: str | None = None,
50
- messages: list[ContentBlock] | None = None
54
+ self, response: str | None = None, messages: list[ContentBlock] | None = None
51
55
  ) -> list[ContentBlock]:
52
56
  """Handle response or messages and return as ContentBlocks.
53
57
 
@@ -0,0 +1,60 @@
1
+ """Tests for ResponseTool class."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from hud.tools.response import ResponseTool
8
+
9
+
10
+ class ConcreteResponseTool(ResponseTool):
11
+ """Concrete implementation for testing."""
12
+
13
+ async def __call__(self, response: str | None = None, messages=None):
14
+ """Concrete implementation."""
15
+ from mcp.types import TextContent
16
+
17
+ return [TextContent(text=response or "test", type="text")]
18
+
19
+
20
+ class TestResponseTool:
21
+ """Tests for ResponseTool abstract class."""
22
+
23
+ def test_init_with_defaults(self):
24
+ """Test initialization with default values."""
25
+ tool = ConcreteResponseTool()
26
+ assert tool.name == "response"
27
+ assert tool.title == "Response Tool"
28
+ assert tool.description == "Send a text response or list of messages to the environment"
29
+
30
+ def test_init_with_custom_values(self):
31
+ """Test initialization with custom values."""
32
+ tool = ConcreteResponseTool(
33
+ name="custom_response", title="Custom Response Tool", description="Custom description"
34
+ )
35
+ assert tool.name == "custom_response"
36
+ assert tool.title == "Custom Response Tool"
37
+ assert tool.description == "Custom description"
38
+
39
+ def test_abstract_method_not_implemented(self):
40
+ """Test that abstract method raises NotImplementedError when not implemented."""
41
+
42
+ # Create a concrete tool to test the abstract method's NotImplementedError
43
+ tool = ConcreteResponseTool()
44
+
45
+ # This should trigger the NotImplementedError in the abstract method
46
+ with pytest.raises(NotImplementedError, match="Subclasses must implement __call__"):
47
+ # Call the parent abstract method directly to hit the raise line
48
+ import asyncio
49
+
50
+ asyncio.run(ResponseTool.__call__(tool, "test")) # type: ignore[attr-defined]
51
+
52
+ @pytest.mark.asyncio
53
+ async def test_concrete_implementation(self):
54
+ """Test that concrete implementation works correctly."""
55
+ tool = ConcreteResponseTool()
56
+ result = await tool("Hello, World!")
57
+
58
+ assert len(result) == 1
59
+ assert result[0].text == "Hello, World!"
60
+ assert result[0].type == "text"
@@ -0,0 +1,49 @@
1
+ """Tests for hud.tools.__init__ module."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+
8
+ class TestToolsInit:
9
+ """Tests for the tools package initialization."""
10
+
11
+ def test_lazy_import_anthropic_computer_tool(self):
12
+ """Test lazy import of AnthropicComputerTool."""
13
+ from hud.tools import AnthropicComputerTool
14
+
15
+ # Verify it's imported correctly
16
+ assert AnthropicComputerTool.__name__ == "AnthropicComputerTool"
17
+
18
+ def test_lazy_import_hud_computer_tool(self):
19
+ """Test lazy import of HudComputerTool."""
20
+ from hud.tools import HudComputerTool
21
+
22
+ # Verify it's imported correctly
23
+ assert HudComputerTool.__name__ == "HudComputerTool"
24
+
25
+ def test_lazy_import_openai_computer_tool(self):
26
+ """Test lazy import of OpenAIComputerTool."""
27
+ from hud.tools import OpenAIComputerTool
28
+
29
+ # Verify it's imported correctly
30
+ assert OpenAIComputerTool.__name__ == "OpenAIComputerTool"
31
+
32
+ def test_lazy_import_invalid_attribute(self):
33
+ """Test lazy import with invalid attribute name."""
34
+ import hud.tools as tools_module
35
+
36
+ with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidTool'"):
37
+ _ = tools_module.InvalidTool
38
+
39
+ def test_direct_imports_available(self):
40
+ """Test that directly imported tools are available."""
41
+ from hud.tools import BaseHub, BaseTool, BashTool, EditTool, PlaywrightTool, ResponseTool
42
+
43
+ # All should be available
44
+ assert BaseHub is not None
45
+ assert BaseTool is not None
46
+ assert BashTool is not None
47
+ assert EditTool is not None
48
+ assert PlaywrightTool is not None
49
+ assert ResponseTool is not None
hud/utils/design.py CHANGED
@@ -93,10 +93,10 @@ class HUDDesign:
93
93
  """
94
94
  console = self._stderr_console if stderr else self._stdout_console
95
95
  console.print(f"[default not bold]{message}[/default not bold]")
96
-
96
+
97
97
  def print(self, message: str, stderr: bool = True) -> None:
98
98
  """Print a message.
99
-
99
+
100
100
  Args:
101
101
  message: The message to print
102
102
  stderr: If True, output to stderr (default), otherwise stdout
@@ -136,7 +136,9 @@ class HUDDesign:
136
136
  console = self._stderr_console if stderr else self._stdout_console
137
137
  console.print(f"[default not bold]{json_str}[/default not bold]")
138
138
 
139
- def key_value_table(self, data: dict[str, str], show_header: bool = False, stderr: bool = True) -> None:
139
+ def key_value_table(
140
+ self, data: dict[str, str], show_header: bool = False, stderr: bool = True
141
+ ) -> None:
140
142
  """Print a key-value table.
141
143
 
142
144
  Args:
@@ -197,7 +199,14 @@ class HUDDesign:
197
199
  console = self._stderr_console if stderr else self._stdout_console
198
200
  console.print(f"\n[yellow]💡 Hint: {hint}[/yellow]")
199
201
 
200
- def status_item(self, label: str, value: str, status: str = "success", primary: bool = False, stderr: bool = True) -> None:
202
+ def status_item(
203
+ self,
204
+ label: str,
205
+ value: str,
206
+ status: str = "success",
207
+ primary: bool = False,
208
+ stderr: bool = True,
209
+ ) -> None:
201
210
  """Print a status item with indicator.
202
211
 
203
212
  Args:
@@ -211,18 +220,20 @@ class HUDDesign:
211
220
  "success": f"[{GREEN} not bold]✓[/{GREEN} not bold]",
212
221
  "error": f"[{RED} not bold]✗[/{RED} not bold]",
213
222
  "warning": "[yellow]⚠[/yellow]",
214
- "info": f"[{DIM}]•[/{DIM}]"
223
+ "info": f"[{DIM}]•[/{DIM}]",
215
224
  }
216
-
225
+
217
226
  indicator = indicators.get(status, indicators["info"])
218
227
  console = self._stderr_console if stderr else self._stdout_console
219
-
228
+
220
229
  if primary:
221
230
  console.print(f"{indicator} {label}: [bold cyan]{value}[/bold cyan]")
222
231
  else:
223
232
  console.print(f"{indicator} {label}: {value}")
224
233
 
225
- def command_example(self, command: str, description: str | None = None, stderr: bool = True) -> None:
234
+ def command_example(
235
+ self, command: str, description: str | None = None, stderr: bool = True
236
+ ) -> None:
226
237
  """Print a command example with cyan highlighting.
227
238
 
228
239
  Args:
hud/utils/mcp.py CHANGED
@@ -23,7 +23,7 @@ def patch_mcp_config(mcp_config: dict[str, dict[str, Any]], patch: MCPConfigPatc
23
23
 
24
24
  for server_cfg in mcp_config.values():
25
25
  url = server_cfg.get("url", "")
26
-
26
+
27
27
  # 1) HTTP header lane (only for hud MCP servers)
28
28
  if hud_mcp_url in url and patch.headers:
29
29
  for key, value in patch.headers.items():
@@ -37,9 +37,11 @@ def patch_mcp_config(mcp_config: dict[str, dict[str, Any]], patch: MCPConfigPatc
37
37
  meta.setdefault(key, value)
38
38
 
39
39
 
40
- def setup_hud_telemetry(mcp_config: dict[str, dict[str, Any]], auto_trace: bool = True) -> Any | None:
40
+ def setup_hud_telemetry(
41
+ mcp_config: dict[str, dict[str, Any]], auto_trace: bool = True
42
+ ) -> Any | None:
41
43
  """Setup telemetry for hud servers.
42
-
44
+
43
45
  Returns:
44
46
  The auto-created trace context manager if one was created, None otherwise.
45
47
  Caller is responsible for exiting the context manager.
@@ -47,12 +49,22 @@ def setup_hud_telemetry(mcp_config: dict[str, dict[str, Any]], auto_trace: bool
47
49
  if not mcp_config:
48
50
  raise ValueError("Please run initialize() before setting up client-side telemetry")
49
51
 
52
+ # Check if there are any HUD servers to setup telemetry for
53
+ hud_mcp_url = settings.hud_mcp_url
54
+ has_hud_servers = any(
55
+ hud_mcp_url in server_cfg.get("url", "") for server_cfg in mcp_config.values()
56
+ )
57
+
58
+ # If no HUD servers, no need for telemetry setup
59
+ if not has_hud_servers:
60
+ return None
61
+
50
62
  from hud.otel import get_current_task_run_id
51
63
  from hud.telemetry import trace
52
64
 
53
65
  run_id = get_current_task_run_id()
54
66
  auto_trace_cm = None
55
-
67
+
56
68
  if not run_id and auto_trace:
57
69
  auto_trace_cm = trace("My Trace")
58
70
  run_id = auto_trace_cm.__enter__()
@@ -63,5 +75,5 @@ def setup_hud_telemetry(mcp_config: dict[str, dict[str, Any]], auto_trace: bool
63
75
  mcp_config,
64
76
  MCPConfigPatch(headers={"Run-Id": run_id}, meta={"run_id": run_id}),
65
77
  )
66
-
78
+
67
79
  return auto_trace_cm
@@ -0,0 +1,112 @@
1
+ """Tests for MCP utility functions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
8
+
9
+
10
+ class TestPatchMCPConfig:
11
+ """Tests for patch_mcp_config function."""
12
+
13
+ def test_patch_headers_for_hud_servers(self):
14
+ """Test patching headers for HUD MCP servers."""
15
+ from hud.settings import get_settings
16
+
17
+ settings = get_settings()
18
+
19
+ # Create an MCP config with a HUD server URL
20
+ mcp_config = {"test_server": {"url": f"{settings.hud_mcp_url}/test"}}
21
+
22
+ # Create patch with headers
23
+ patch = MCPConfigPatch(headers={"X-Test-Header": "test-value"}, meta=None)
24
+
25
+ # Apply patch
26
+ patch_mcp_config(mcp_config, patch)
27
+
28
+ # Verify headers were added
29
+ assert "headers" in mcp_config["test_server"]
30
+ assert mcp_config["test_server"]["headers"]["X-Test-Header"] == "test-value" # type: ignore[index]
31
+
32
+ def test_patch_headers_preserves_existing(self):
33
+ """Test that existing headers are preserved."""
34
+ from hud.settings import get_settings
35
+
36
+ settings = get_settings()
37
+
38
+ # Create config with existing headers
39
+ mcp_config = {
40
+ "test_server": {
41
+ "url": f"{settings.hud_mcp_url}/test",
42
+ "headers": {"Existing-Header": "existing-value"},
43
+ }
44
+ }
45
+
46
+ patch = MCPConfigPatch(
47
+ headers={"X-Test-Header": "test-value", "Existing-Header": "new-value"},
48
+ meta=None,
49
+ )
50
+
51
+ patch_mcp_config(mcp_config, patch)
52
+
53
+ # Existing header should be preserved, new one added
54
+ assert mcp_config["test_server"]["headers"]["Existing-Header"] == "existing-value"
55
+ assert mcp_config["test_server"]["headers"]["X-Test-Header"] == "test-value"
56
+
57
+ def test_patch_meta_for_all_servers(self):
58
+ """Test patching metadata for all servers."""
59
+ mcp_config = {
60
+ "server1": {"url": "http://example.com"},
61
+ "server2": {"url": "http://other.com"},
62
+ }
63
+
64
+ patch = MCPConfigPatch(headers=None, meta={"test_key": "test_value"})
65
+
66
+ patch_mcp_config(mcp_config, patch)
67
+
68
+ # Meta should be added to both servers
69
+ assert mcp_config["server1"]["meta"]["test_key"] == "test_value" # type: ignore[index]
70
+ assert mcp_config["server2"]["meta"]["test_key"] == "test_value" # type: ignore[index]
71
+
72
+ def test_patch_meta_preserves_existing(self):
73
+ """Test that existing meta is preserved."""
74
+ mcp_config = {
75
+ "test_server": {"url": "http://example.com", "meta": {"existing_key": "existing_value"}}
76
+ }
77
+
78
+ patch = MCPConfigPatch(
79
+ headers=None,
80
+ meta={"test_key": "test_value", "existing_key": "new_value"},
81
+ )
82
+
83
+ patch_mcp_config(mcp_config, patch)
84
+
85
+ # Existing meta should be preserved, new one added
86
+ assert mcp_config["test_server"]["meta"]["existing_key"] == "existing_value"
87
+ assert mcp_config["test_server"]["meta"]["test_key"] == "test_value"
88
+
89
+
90
+ class TestSetupHUDTelemetry:
91
+ """Tests for setup_hud_telemetry function."""
92
+
93
+ def test_empty_config_raises_error(self):
94
+ """Test that empty config raises ValueError."""
95
+ with pytest.raises(
96
+ ValueError, match="Please run initialize\\(\\) before setting up client-side telemetry"
97
+ ):
98
+ setup_hud_telemetry({})
99
+
100
+ def test_none_config_raises_error(self):
101
+ """Test that None config raises ValueError."""
102
+ with pytest.raises(
103
+ ValueError, match="Please run initialize\\(\\) before setting up client-side telemetry"
104
+ ):
105
+ setup_hud_telemetry(None) # type: ignore[arg-type]
106
+
107
+ def test_valid_config_returns_none_when_no_hud_servers(self):
108
+ """Test that valid config with no HUD servers returns None."""
109
+ mcp_config = {"test_server": {"url": "http://example.com"}}
110
+
111
+ result = setup_hud_telemetry(mcp_config)
112
+ assert result is None
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.4.11"
8
+ assert hud.__version__ == "0.4.13"
hud/version.py CHANGED
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.4.11"
7
+ __version__ = "0.4.13"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.11
3
+ Version: 0.4.13
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -35,10 +35,9 @@ Classifier: Programming Language :: Python :: 3.11
35
35
  Classifier: Programming Language :: Python :: 3.12
36
36
  Classifier: Programming Language :: Python :: 3.13
37
37
  Requires-Python: <3.14,>=3.11
38
- Requires-Dist: fastmcp>=2.11.2
39
38
  Requires-Dist: httpx<1,>=0.23.0
40
- Requires-Dist: hud-mcp-python-sdk>=0.1.0
41
- Requires-Dist: mcp>=1.13.1
39
+ Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
40
+ Requires-Dist: hud-mcp-python-sdk>=3.13.2
42
41
  Requires-Dist: opentelemetry-api>=1.34.1
43
42
  Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
44
43
  Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
@@ -56,7 +55,11 @@ Provides-Extra: agent
56
55
  Requires-Dist: anthropic; extra == 'agent'
57
56
  Requires-Dist: datasets>=2.14.0; extra == 'agent'
58
57
  Requires-Dist: dotenv>=0.9.9; extra == 'agent'
59
- Requires-Dist: hud-mcp-use-python-sdk>=0.1.0; extra == 'agent'
58
+ Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agent'
59
+ Requires-Dist: ipykernel; extra == 'agent'
60
+ Requires-Dist: ipython<9; extra == 'agent'
61
+ Requires-Dist: jupyter-client; extra == 'agent'
62
+ Requires-Dist: jupyter-core; extra == 'agent'
60
63
  Requires-Dist: langchain; extra == 'agent'
61
64
  Requires-Dist: langchain-anthropic; extra == 'agent'
62
65
  Requires-Dist: langchain-openai; extra == 'agent'
@@ -66,7 +69,11 @@ Provides-Extra: agents
66
69
  Requires-Dist: anthropic; extra == 'agents'
67
70
  Requires-Dist: datasets>=2.14.0; extra == 'agents'
68
71
  Requires-Dist: dotenv>=0.9.9; extra == 'agents'
69
- Requires-Dist: hud-mcp-use-python-sdk>=0.1.0; extra == 'agents'
72
+ Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agents'
73
+ Requires-Dist: ipykernel; extra == 'agents'
74
+ Requires-Dist: ipython<9; extra == 'agents'
75
+ Requires-Dist: jupyter-client; extra == 'agents'
76
+ Requires-Dist: jupyter-core; extra == 'agents'
70
77
  Requires-Dist: langchain; extra == 'agents'
71
78
  Requires-Dist: langchain-anthropic; extra == 'agents'
72
79
  Requires-Dist: langchain-openai; extra == 'agents'
@@ -77,7 +84,7 @@ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
77
84
  Requires-Dist: anthropic; extra == 'dev'
78
85
  Requires-Dist: datasets>=2.14.0; extra == 'dev'
79
86
  Requires-Dist: dotenv>=0.9.9; extra == 'dev'
80
- Requires-Dist: hud-mcp-use-python-sdk>=0.1.0; extra == 'dev'
87
+ Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'dev'
81
88
  Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
82
89
  Requires-Dist: ipykernel; extra == 'dev'
83
90
  Requires-Dist: ipython<9; extra == 'dev'
@@ -233,7 +240,7 @@ Any hud MCP environment and evaluation works with our RL pipeline. Even our remo
233
240
 
234
241
  This is Claude Computer Use running on our proprietary financial analyst benchmark [SheetBench-50](https://huggingface.co/datasets/hud-evals/SheetBench-50):
235
242
 
236
- ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/l/text-2048/docs/src/images/trace_sheet.gif)
243
+ ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
237
244
 
238
245
  > [See this trace on _app.hud.so_](https://app.hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
239
246
 
@@ -385,7 +392,7 @@ result = await ClaudeAgent().run({ # See all agents: https://docs.hud.so/refere
385
392
 
386
393
  All leaderboards are publicly available on [app.hud.so/leaderboards](https://app.hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
387
394
 
388
- ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/l/text-2048/docs/src/images/leaderboards_2.png)
395
+ ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
389
396
 
390
397
  We highly suggest running 3-5 evaluations per dataset for the most consistent results across multiple jobs.
391
398
 
@@ -430,10 +437,6 @@ graph LR
430
437
  Trace --> Dashboard
431
438
  AnyMCP -->|"MCP"| API
432
439
 
433
- style Dashboard fill:#e0e7ff,stroke:#6366f1,stroke-width:2px
434
- style SDK fill:#fef3c7,stroke:#f59e0b,stroke-width:2px
435
- style RemoteEnv fill:#d1fae5,stroke:#10b981,stroke-width:2px
436
- style AnyMCP fill:#fce7f3,stroke:#ec4899,stroke-width:2px,stroke-dasharray: 5 5
437
440
  ```
438
441
 
439
442
  ## CLI reference