hud-python 0.2.10__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (86) hide show
  1. hud/__init__.py +20 -8
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +15 -3
  6. hud/env/environment.py +10 -7
  7. hud/env/local_docker_client.py +29 -7
  8. hud/env/remote_client.py +1 -1
  9. hud/env/remote_docker_client.py +2 -2
  10. hud/exceptions.py +2 -1
  11. hud/gym.py +0 -9
  12. hud/mcp/__init__.py +17 -0
  13. hud/mcp/base.py +631 -0
  14. hud/mcp/claude.py +321 -0
  15. hud/mcp/client.py +312 -0
  16. hud/mcp/langchain.py +250 -0
  17. hud/mcp/openai.py +334 -0
  18. hud/mcp/tests/__init__.py +1 -0
  19. hud/mcp/tests/test_base.py +512 -0
  20. hud/mcp/tests/test_claude.py +294 -0
  21. hud/mcp/tests/test_client.py +324 -0
  22. hud/mcp/tests/test_openai.py +238 -0
  23. hud/settings.py +20 -2
  24. hud/task.py +5 -88
  25. hud/taskset.py +2 -23
  26. hud/telemetry/__init__.py +16 -7
  27. hud/telemetry/_trace.py +246 -72
  28. hud/telemetry/context.py +88 -27
  29. hud/telemetry/exporter.py +171 -11
  30. hud/telemetry/instrumentation/mcp.py +174 -410
  31. hud/telemetry/job.py +141 -0
  32. hud/telemetry/mcp_models.py +13 -74
  33. hud/telemetry/tests/test_context.py +9 -6
  34. hud/telemetry/tests/test_trace.py +120 -78
  35. hud/tools/__init__.py +34 -0
  36. hud/tools/base.py +65 -0
  37. hud/tools/bash.py +137 -0
  38. hud/tools/computer/__init__.py +13 -0
  39. hud/tools/computer/anthropic.py +411 -0
  40. hud/tools/computer/hud.py +315 -0
  41. hud/tools/computer/openai.py +283 -0
  42. hud/tools/edit.py +290 -0
  43. hud/tools/executors/__init__.py +30 -0
  44. hud/tools/executors/base.py +331 -0
  45. hud/tools/executors/pyautogui.py +619 -0
  46. hud/tools/executors/tests/__init__.py +1 -0
  47. hud/tools/executors/tests/test_base_executor.py +338 -0
  48. hud/tools/executors/tests/test_pyautogui_executor.py +165 -0
  49. hud/tools/executors/xdo.py +503 -0
  50. hud/tools/helper/README.md +56 -0
  51. hud/tools/helper/__init__.py +9 -0
  52. hud/tools/helper/mcp_server.py +78 -0
  53. hud/tools/helper/server_initialization.py +115 -0
  54. hud/tools/helper/utils.py +58 -0
  55. hud/tools/playwright_tool.py +379 -0
  56. hud/tools/tests/__init__.py +3 -0
  57. hud/tools/tests/test_bash.py +152 -0
  58. hud/tools/tests/test_computer.py +52 -0
  59. hud/tools/tests/test_computer_actions.py +34 -0
  60. hud/tools/tests/test_edit.py +240 -0
  61. hud/tools/tests/test_init.py +27 -0
  62. hud/tools/tests/test_playwright_tool.py +183 -0
  63. hud/tools/tests/test_tools.py +157 -0
  64. hud/tools/tests/test_utils.py +156 -0
  65. hud/tools/utils.py +50 -0
  66. hud/trajectory.py +5 -1
  67. hud/types.py +10 -1
  68. hud/utils/tests/test_init.py +21 -0
  69. hud/utils/tests/test_version.py +1 -1
  70. hud/version.py +1 -1
  71. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/METADATA +27 -18
  72. hud_python-0.3.1.dist-info/RECORD +119 -0
  73. hud/evaluators/__init__.py +0 -9
  74. hud/evaluators/base.py +0 -32
  75. hud/evaluators/inspect.py +0 -24
  76. hud/evaluators/judge.py +0 -189
  77. hud/evaluators/match.py +0 -156
  78. hud/evaluators/remote.py +0 -65
  79. hud/evaluators/tests/__init__.py +0 -0
  80. hud/evaluators/tests/test_inspect.py +0 -12
  81. hud/evaluators/tests/test_judge.py +0 -231
  82. hud/evaluators/tests/test_match.py +0 -115
  83. hud/evaluators/tests/test_remote.py +0 -98
  84. hud_python-0.2.10.dist-info/RECORD +0 -85
  85. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  86. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,157 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import inspect
5
+ import sys
6
+
7
+ import pytest
8
+ from mcp.types import ImageContent, TextContent
9
+
10
+ from hud.tools.bash import BashTool
11
+ from hud.tools.computer.hud import HudComputerTool
12
+ from hud.tools.edit import EditTool
13
+ from hud.tools.helper import register_instance_tool
14
+
15
+
16
@pytest.mark.asyncio
async def test_bash_tool_echo():
    """Check that calling BashTool returns the output produced by a stubbed session."""
    from hud.tools.base import ToolResult

    class _StubSession:
        """Stand-in for the private session object so no subprocess is spawned."""

        async def start(self):
            return None

        async def run(self, cmd: str):
            return ToolResult(output=f"mocked: {cmd}")

    bash = BashTool()
    # Replace the private session so the call below is served by the stub.
    bash._session = _StubSession()  # type: ignore[attr-defined]

    outcome = await bash(command="echo hello")
    assert outcome.output == "mocked: echo hello"
34
+
35
+
36
@pytest.mark.asyncio
async def test_bash_tool_restart_and_no_command():
    """Check the restart message and that a call without a command raises ToolError.

    ``_BashSession.start`` is replaced while the test runs so no real shell is
    launched. The original method is put back in a ``finally`` block so the
    patch cannot leak into other tests running in the same process.
    """
    from types import SimpleNamespace

    import hud.tools.bash as bash_mod
    from hud.tools.base import ToolError, ToolResult

    tool = BashTool()

    class _FakeSession:
        async def run(self, cmd: str):
            return ToolResult(output="ran")

        async def start(self):
            return None

        def stop(self):
            return None

    tool._session = _FakeSession()  # type: ignore[attr-defined]

    # Replacement for _BashSession.start that avoids launching a real shell.
    async def _dummy_start(self):
        self._started = True
        # minimal fake process attributes used later
        self._process = SimpleNamespace(returncode=None)

    original_start = bash_mod._BashSession.start
    bash_mod._BashSession.start = _dummy_start  # type: ignore[assignment]
    try:
        # restart=True returns system message
        res = await tool(command="ignored", restart=True)
        assert res.system == "tool has been restarted."

        # Calling without command raises ToolError
        with pytest.raises(ToolError):
            await tool()
    finally:
        # Undo the class-level patch even when an assertion above fails.
        bash_mod._BashSession.start = original_start  # type: ignore[assignment]
73
+
74
+
75
@pytest.mark.asyncio
@pytest.mark.skipif(sys.platform == "win32", reason="EditTool uses Unix commands")
async def test_edit_tool_flow(tmp_path):
    """Drive EditTool through create, view, str_replace and insert on one temp file."""
    target = str(tmp_path / "demo.txt")
    editor = EditTool()

    created = await editor(command="create", path=target, file_text="hello\nworld\n")
    assert "File created" in (created.output or "")

    viewed = await editor(command="view", path=target)
    assert "hello" in (viewed.output or "")

    replaced = await editor(command="str_replace", path=target, old_str="world", new_str="earth")
    assert "has been edited" in (replaced.output or "")

    inserted = await editor(command="insert", path=target, insert_line=1, new_str="first line\n")
    # Only the truthiness of the insert result is checked.
    assert inserted
97
+
98
+
99
@pytest.mark.asyncio
async def test_base_executor_simulation():
    """Check BaseExecutor's execute() output marker and that screenshot() yields a non-empty str."""
    from hud.tools.executors.base import BaseExecutor

    # Deliberately not named `exec`, which would shadow the builtin.
    executor = BaseExecutor()

    res = await executor.execute("echo test")
    assert "SIMULATED" in (res.output or "")

    shot = await executor.screenshot()
    # Separate asserts so a failure shows which property was violated.
    assert isinstance(shot, str)
    assert len(shot) > 0
108
+
109
+
110
@pytest.mark.asyncio
@pytest.mark.skipif(sys.platform == "win32", reason="EditTool uses Unix commands")
async def test_edit_tool_view(tmp_path):
    """Check that the view command returns the contents of an existing file."""
    sample = tmp_path / "sample.txt"
    sample.write_text("Sample content\n")

    viewed = await EditTool()(command="view", path=str(sample))

    assert viewed.output is not None
    assert "Sample content" in viewed.output
121
+
122
+
123
@pytest.mark.asyncio
async def test_computer_tool_screenshot():
    """Check that a screenshot action returns a non-empty list of MCP content blocks."""
    computer = HudComputerTool()
    content = await computer(action="screenshot")

    # Something must come back, and it must not be empty.
    assert content is not None
    assert len(content) > 0

    # Each block may be either an image or a text block.
    for block in content:
        assert isinstance(block, (ImageContent, TextContent))
132
+
133
+
134
def test_register_instance_tool_signature():
    """The registered wrapper must expose exactly the instance's keyword parameters."""
    from mcp.server.fastmcp import FastMCP

    class Dummy:
        async def __call__(self, *, x: int, y: str) -> str:
            return f"{x}-{y}"

    server = FastMCP("test")
    registered = register_instance_tool(server, "dummy", Dummy())

    exposed = [param.name for param in inspect.signature(registered).parameters.values()]
    assert exposed == ["x", "y"], "*args/**kwargs should be stripped"
149
+
150
+
151
def test_build_server_subset():
    """build_server must register only the tools it was asked for."""
    from hud.tools.helper.mcp_server import build_server

    server = build_server(["bash"])
    registered = asyncio.run(server.list_tools())

    assert [tool.name for tool in registered] == ["bash"]
@@ -0,0 +1,156 @@
1
+ """Tests for tools utils."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from unittest.mock import AsyncMock, patch
7
+
8
+ import pytest
9
+
10
+ from hud.tools.utils import maybe_truncate, run
11
+
12
+
13
class TestRun:
    """Tests for the run function."""

    @staticmethod
    def _fake_proc(returncode: int, stdout: bytes, stderr: bytes = b"") -> AsyncMock:
        """Build a subprocess stand-in whose communicate() yields the given streams."""
        proc = AsyncMock()
        proc.returncode = returncode
        proc.communicate = AsyncMock(return_value=(stdout, stderr))
        return proc

    @pytest.mark.asyncio
    async def test_run_string_command_success(self):
        """A string command goes through the shell spawner and its output is decoded."""
        proc = self._fake_proc(0, b"output")

        with patch("asyncio.create_subprocess_shell", return_value=proc) as spawn:
            code, out, err = await run("echo test")

        assert (code, out, err) == (0, "output", "")
        spawn.assert_called_once()

    @pytest.mark.asyncio
    async def test_run_list_command_success(self):
        """A list command goes through the exec spawner with the argv unpacked."""
        proc = self._fake_proc(0, b"hello world")

        with patch("asyncio.create_subprocess_exec", return_value=proc) as spawn:
            code, out, err = await run(["echo", "hello", "world"])

        assert (code, out, err) == (0, "hello world", "")
        spawn.assert_called_once_with(
            "echo",
            "hello",
            "world",
            stdin=None,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )

    @pytest.mark.asyncio
    async def test_run_with_input(self):
        """Text passed as input is encoded and handed to communicate()."""
        proc = self._fake_proc(0, b"processed")

        with patch("asyncio.create_subprocess_shell", return_value=proc):
            code, out, _err = await run("cat", input="test input")

        assert code == 0
        assert out == "processed"
        proc.communicate.assert_called_once_with(input=b"test input")

    @pytest.mark.asyncio
    async def test_run_with_error(self):
        """A non-zero return code and stderr text are passed through to the caller."""
        proc = self._fake_proc(1, b"", b"error message")

        with patch("asyncio.create_subprocess_shell", return_value=proc):
            code, out, err = await run("false")

        assert (code, out, err) == (1, "", "error message")

    @pytest.mark.asyncio
    async def test_run_with_timeout(self):
        """A custom timeout is forwarded to asyncio.wait_for."""
        proc = self._fake_proc(0, b"done")

        with (
            patch("asyncio.create_subprocess_shell", return_value=proc),
            patch("asyncio.wait_for") as waiter,
        ):
            waiter.return_value = (b"done", b"")

            _code, _out, _err = await run("sleep 1", timeout=5.0)

        # wait_for must have been invoked exactly once with the requested timeout
        waiter.assert_called_once()
        assert waiter.call_args[1]["timeout"] == 5.0

    @pytest.mark.asyncio
    async def test_run_timeout_exception(self):
        """A timeout raised by wait_for propagates out of run()."""
        proc = AsyncMock()

        with (
            patch("asyncio.create_subprocess_shell", return_value=proc),
            patch("asyncio.wait_for", side_effect=TimeoutError()),
            pytest.raises(asyncio.TimeoutError),
        ):
            await run("sleep infinity", timeout=0.1)
111
+
112
+
113
class TestMaybeTruncate:
    """Tests for the maybe_truncate function."""

    # Marker the tests expect at the end of truncated output.
    SUFFIX = "... (truncated)"

    def test_maybe_truncate_short_text(self):
        """Text under the limit comes back unchanged."""
        short = "This is a short text"
        assert maybe_truncate(short) == short

    def test_maybe_truncate_long_text_default(self):
        """Text over the default limit is cut to 20480 characters plus the suffix."""
        long_text = "x" * 30000  # Much longer than default limit
        shortened = maybe_truncate(long_text)

        assert len(shortened) < len(long_text)
        assert shortened.endswith(self.SUFFIX)
        assert len(shortened) == 20480 + len(self.SUFFIX)

    def test_maybe_truncate_custom_limit(self):
        """A custom max_length controls where the cut happens."""
        alphabet = "abcdefghijklmnopqrstuvwxyz"
        assert maybe_truncate(alphabet, max_length=10) == "abcdefghij... (truncated)"

    def test_maybe_truncate_exact_limit(self):
        """Text whose length equals the limit is not truncated."""
        exact = "x" * 100
        assert maybe_truncate(exact, max_length=100) == exact

    def test_maybe_truncate_empty_string(self):
        """The empty string is returned as-is."""
        assert maybe_truncate("") == ""

    def test_maybe_truncate_unicode(self):
        """Truncation also works on non-ASCII text."""
        emoji = "🎉" * 5000
        shortened = maybe_truncate(emoji, max_length=10)

        assert len(shortened) > 10  # Because of "... (truncated)" suffix
        assert shortened.endswith(self.SUFFIX)
hud/tools/utils.py ADDED
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import subprocess
5
+
6
# Default timeout for running commands
DEFAULT_TIMEOUT = 10.0


async def run(
    command: str | list[str],
    input: str | None = None,
    timeout: float | None = DEFAULT_TIMEOUT,  # noqa: ASYNC109
) -> tuple[int, str, str]:
    """
    Run a command asynchronously and return the result.

    A string command is run through the shell; a list is executed directly
    with its items as argv. If the command does not finish within ``timeout``
    the child process is killed and reaped before the timeout error is
    re-raised, so a timed-out command is not left running.

    Args:
        command: Command to run (string or list of strings)
        input: Optional input to send to stdin. A falsy value (``None`` or
            ``""``) means no stdin pipe is created.
        timeout: Timeout in seconds; ``None`` waits indefinitely

    Returns:
        Tuple of (return_code, stdout, stderr). A return code of ``None`` is
        reported as 0.

    Raises:
        asyncio.TimeoutError: If the command does not complete within ``timeout``.
    """
    stdin = subprocess.PIPE if input else None
    if isinstance(command, str):
        proc = await asyncio.create_subprocess_shell(
            command,
            stdin=stdin,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    else:
        proc = await asyncio.create_subprocess_exec(
            *command,
            stdin=stdin,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=input.encode() if input else None), timeout=timeout
        )
    except asyncio.TimeoutError:
        # wait_for only cancels communicate(); the child itself keeps running
        # unless it is terminated explicitly.
        try:
            proc.kill()
        except ProcessLookupError:
            # The process exited between the timeout firing and the kill.
            pass
        # Reap the child so it does not linger as a zombie.
        await proc.wait()
        raise

    return proc.returncode or 0, stdout.decode(), stderr.decode()
46
+
47
+
48
def maybe_truncate(text: str, max_length: int = 2048 * 10) -> str:
    """Return *text* cut to ``max_length`` characters plus a marker when it is longer.

    Text whose length is at most ``max_length`` is returned unchanged.
    """
    if len(text) > max_length:
        return text[:max_length] + "... (truncated)"
    return text
hud/trajectory.py CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
3
3
 
4
4
  import datetime
5
5
 
6
- from IPython.display import HTML, Markdown, display
7
6
  from pydantic import BaseModel, Field
8
7
 
9
8
  from .adapters.common.types import LogType
@@ -30,6 +29,11 @@ class Trajectory(BaseModel):
30
29
  trajectory: list[TrajectoryStep] = Field(default_factory=list)
31
30
 
32
31
  def display(self) -> None:
32
+ try:
33
+ from IPython.display import HTML, Markdown, display
34
+ except ImportError:
35
+ raise ImportError("IPython is required for trajectory display") from None
36
+
33
37
  trajectory_start_timestamp_str = self.trajectory[0].start_timestamp
34
38
  t_start_dt = (
35
39
  datetime.datetime.fromisoformat(trajectory_start_timestamp_str.replace("Z", "+00:00"))
hud/types.py CHANGED
@@ -33,6 +33,15 @@ class CustomGym(BaseModel):
33
33
  host_config: dict[str, Any] | None = None
34
34
 
35
35
 
36
class MCPConfig(BaseModel):
    """
    MCP config for the environment.

    Accepted as one of the alternatives of the ``Gym`` type alias in this module.
    """

    # Tag field: the only permitted value is the literal string "mcp".
    type: Literal["mcp"] = "mcp"
    # Required mapping of configuration values; its schema is not defined here.
    config: dict[str, Any]
43
+
44
+
36
45
  class EnvironmentStatus(str, enum.Enum):
37
46
  """
38
47
  Status of the environment.
@@ -54,7 +63,7 @@ class EnvironmentStatus(str, enum.Enum):
54
63
  ServerGym: TypeAlias = Literal["qa", "hud-browser", "OSWorld-Ubuntu", "docker"]
55
64
 
56
65
  # Gyms can be either custom or server-side
57
- Gym: TypeAlias = CustomGym | ServerGym
66
+ Gym: TypeAlias = CustomGym | MCPConfig | ServerGym
58
67
 
59
68
 
60
69
  # Metadata keys for the environment.
@@ -0,0 +1,21 @@
1
+ """Test utils package imports."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
def test_utils_imports():
    """Check that the utils package and each of its submodules can be imported."""
    import hud.utils

    # The package itself must resolve to a module object.
    assert hud.utils is not None

    # Importing the submodules is the real check; a missing one raises here.
    from hud.utils import agent, common, config, misc, progress, telemetry

    for submodule in (agent, common, config, misc, progress, telemetry):
        assert submodule is not None
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.2.10"
8
+ assert hud.__version__ == "0.3.1"
hud/version.py CHANGED
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.2.10"
7
+ __version__ = "0.3.1"
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.10
4
- Summary: SDK for the HUD evaluation platform.
5
- Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
- Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
7
- Project-URL: Documentation, https://hud.so
3
+ Version: 0.3.1
4
+ Summary: SDK for the HUD platform.
5
+ Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
+ Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
7
+ Project-URL: Documentation, https://docs.hud.so
8
8
  Author-email: HUD SDK <founders@hud.so>
9
9
  License: MIT License
10
10
 
@@ -31,30 +31,26 @@ License-File: LICENSE
31
31
  Classifier: Development Status :: 4 - Beta
32
32
  Classifier: Intended Audience :: Developers
33
33
  Classifier: Programming Language :: Python :: 3
34
- Classifier: Programming Language :: Python :: 3.10
35
34
  Classifier: Programming Language :: Python :: 3.11
36
35
  Classifier: Programming Language :: Python :: 3.12
37
36
  Classifier: Programming Language :: Python :: 3.13
38
- Requires-Python: <3.14,>=3.10
39
- Requires-Dist: aiodocker>=0.24.0
37
+ Requires-Python: <3.14,>=3.11
40
38
  Requires-Dist: anthropic
39
+ Requires-Dist: datasets>=4.0.0
41
40
  Requires-Dist: dotenv>=0.9.9
42
41
  Requires-Dist: httpx<1,>=0.23.0
43
- Requires-Dist: inspect-ai>=0.3.80
44
- Requires-Dist: ipykernel
45
42
  Requires-Dist: langchain
43
+ Requires-Dist: langchain-anthropic
46
44
  Requires-Dist: langchain-openai
47
- Requires-Dist: mcp
48
- Requires-Dist: numpy
45
+ Requires-Dist: mcp-use>=1.3.7
46
+ Requires-Dist: mcp==1.12.2
49
47
  Requires-Dist: openai
50
48
  Requires-Dist: pathspec>=0.12.1
51
- Requires-Dist: pillow>=11.1.0
52
49
  Requires-Dist: pydantic-settings<3,>=2
53
50
  Requires-Dist: pydantic<3,>=2
54
- Requires-Dist: textdistance<5,>=4.5.0
55
- Requires-Dist: toml>=0.10.2
56
51
  Requires-Dist: wrapt>=1.14.0
57
52
  Provides-Extra: dev
53
+ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
58
54
  Requires-Dist: anthropic; extra == 'dev'
59
55
  Requires-Dist: dotenv; extra == 'dev'
60
56
  Requires-Dist: ipykernel; extra == 'dev'
@@ -62,16 +58,29 @@ Requires-Dist: ipython<9; extra == 'dev'
62
58
  Requires-Dist: jupyter-client; extra == 'dev'
63
59
  Requires-Dist: jupyter-core; extra == 'dev'
64
60
  Requires-Dist: openai; extra == 'dev'
61
+ Requires-Dist: pillow>=11.1.0; extra == 'dev'
62
+ Requires-Dist: playwright; extra == 'dev'
63
+ Requires-Dist: pyautogui>=0.9.54; extra == 'dev'
65
64
  Requires-Dist: pyright==1.1.401; extra == 'dev'
66
65
  Requires-Dist: pytest-asyncio; extra == 'dev'
67
66
  Requires-Dist: pytest-cov; extra == 'dev'
68
67
  Requires-Dist: pytest-mock; extra == 'dev'
69
68
  Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
70
69
  Requires-Dist: ruff==0.11.8; extra == 'dev'
70
+ Requires-Dist: toml>=0.10.2; extra == 'dev'
71
+ Provides-Extra: v2
72
+ Requires-Dist: aiodocker>=0.24.0; extra == 'v2'
73
+ Requires-Dist: inspect-ai>=0.3.80; extra == 'v2'
74
+ Requires-Dist: ipykernel; extra == 'v2'
75
+ Requires-Dist: numpy; extra == 'v2'
76
+ Requires-Dist: pillow>=11.1.0; extra == 'v2'
77
+ Requires-Dist: pyautogui>=0.9.54; extra == 'v2'
78
+ Requires-Dist: textdistance<5,>=4.5.0; extra == 'v2'
79
+ Requires-Dist: toml>=0.10.2; extra == 'v2'
71
80
  Description-Content-Type: text/markdown
72
81
 
73
82
  <div align="left">
74
- <img src="https://raw.githubusercontent.com/hud-evals/hud-sdk/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 20px;"/>
83
+ <img src="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 20px;"/>
75
84
  </div>
76
85
 
77
86
  <h3>
@@ -85,7 +94,7 @@ Evaluate your Computer Use AI agents across web browsers, desktop environments,
85
94
  We're here to help with eval strategies, custom environments, or improving your agent architecture!
86
95
 
87
96
 
88
- > **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-sdk/issues), as the SDK is still evolving!
97
+ > **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-python/issues), as the SDK is still evolving!
89
98
 
90
99
  [![PyPI version](https://img.shields.io/pypi/v/hud-python)](https://pypi.org/project/hud-python/)
91
100
 
@@ -269,7 +278,7 @@ If you use this SDK in your research, please cite it as follows:
269
278
  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
270
279
  title = {{HUD: An Evaluation Platform for Agents}},
271
280
  date = {2025-04},
272
- url = {https://github.com/hud-evals/hud-sdk},
281
+ url = {https://github.com/hud-evals/hud-python},
273
282
  langid = {en}
274
283
  }
275
284
  ```
@@ -0,0 +1,119 @@
1
+ hud/__init__.py,sha256=j5Zzth7_M-5DU_KJT2ZV9OfikD2aE6lzyiZA4OrLzi8,1578
2
+ hud/datasets.py,sha256=UZCzzXREbPhlw2ZdUFZ8EDz0lErWEeBPOPQxH71p6EA,6196
3
+ hud/exceptions.py,sha256=Xna_pdEK_ESwkcffsRmT5GXq4xSHLV5cu7Qu3MjstSE,5516
4
+ hud/gym.py,sha256=-hp5HdPBWf6-j0CgSoX_f2CTLssf1Wo5UhfyrnPbvkc,4774
5
+ hud/job.py,sha256=0vWbr3E5bYstVRzXS_6l-57JGUFcrZpmFrNkOSQ8Aa0,26969
6
+ hud/settings.py,sha256=KPzeF9OUecApYH8YYMW-8vIRhFP_6htzzZvC4RCUARc,2183
7
+ hud/task.py,sha256=Lhr-pHJEXGKUPxaT4UrJRvC3KwZcl2szM_sEM1l6xmU,5418
8
+ hud/taskset.py,sha256=QjHbcxSy7h7fmtzRHW1ewxtOIydtH7ZotttDoiABTEY,6573
9
+ hud/trajectory.py,sha256=LBVkFz6U_rmyooCZHN81tdOx0Z7DuAgzf0KQLejc4Fo,3937
10
+ hud/types.py,sha256=h7fUowbdyGF4Fg8TUnvCFoa2fflRRPi6xx7YgpBwFis,3109
11
+ hud/version.py,sha256=F0AzPXU574XP1t-vO3NkNHIuwe8Z3Q9rbT90d2vq07Q,104
12
+ hud/adapters/__init__.py,sha256=zz24KdC_e9TJPgWo6y57_8SzevEE5ak4Cm6tXzMxwRk,266
13
+ hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
14
+ hud/adapters/claude/adapter.py,sha256=vCpotJ5gzQs4PP2iCXVavIcyG8c_4m1P6fuXStwUxSo,6675
15
+ hud/adapters/claude/tests/__init__.py,sha256=9GZj0rz4tTkiPnLfxTmyBPr-s8UZc3gph6WH8fs8T34,39
16
+ hud/adapters/claude/tests/test_adapter.py,sha256=cAdHEoqLngLiV7QwlWJ0KuNgb1vNv9WZTPQMnxhMDKI,18319
17
+ hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
18
+ hud/adapters/common/adapter.py,sha256=fTpw7wA501nxM3ufl6WMWq4Nc3vXlUeBGS7WgvZVFjU,6180
19
+ hud/adapters/common/types.py,sha256=6frue7_gZlSYtOHhF2tFHqzjltzzHsTVs6-H-jQwZ4Y,9955
20
+ hud/adapters/common/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ hud/adapters/common/tests/test_adapter.py,sha256=7QRpQPGM1PlMi8RcqJAT4ruGvLT9TgGmc9R5tzncN1M,8965
22
+ hud/adapters/operator/__init__.py,sha256=31vTRs268_TOLd-TeQRKau5bDYy78wxCNpJFhD5_l8U,104
23
+ hud/adapters/operator/adapter.py,sha256=Uz4Sr73T57B7v4RRP0uaibHI17N2hBx6Z9YYjgJCUXA,3732
24
+ hud/adapters/operator/tests/__init__.py,sha256=yTsDVusVXZBQL6DnXpLgKQCBRuOYUAVQ8Blk_k5GETk,41
25
+ hud/adapters/operator/tests/test_adapter.py,sha256=4RAXwyxAtkh-1Mlt1zJayRkcv3LWaPNEhDVTpwOZd4A,12942
26
+ hud/agent/__init__.py,sha256=_OxMG3UW1vXSuixdpo09b1jexfWcUbfK44zto8t6_LE,453
27
+ hud/agent/base.py,sha256=hC3mVUMAWo5HHF2b576ScA9UQzsAzcCfPU9S8mDWthA,4080
28
+ hud/agent/claude.py,sha256=FBSKCxICO6XXYCuIrerVL89bVJ-5JxrZJBDeZgzAdJI,9886
29
+ hud/agent/claude_plays_pokemon.py,sha256=4TPibnTFhTb24ISRKAU3pA4waIcISTfZLOdfBMIMqxE,10085
30
+ hud/agent/langchain.py,sha256=H55JNHcGkdl-LVzZEqOFRkuuFEO0D8MI1jCNz9deoko,9012
31
+ hud/agent/operator.py,sha256=kntMOsdL5tzaGVSnzbGvFD2PMLzW2DEB2wEqN_LArQw,10500
32
+ hud/agent/misc/__init__.py,sha256=-ftYH1T5r7fXKKra6d8jXYmUz9KOTmYwBrPJU-V3S7g,71
33
+ hud/agent/misc/response_agent.py,sha256=3PPsZqNAyUo2ouSV0ylGQj9fJqojfSB2roq2DadUdG0,3048
34
+ hud/agent/tests/__init__.py,sha256=HbAW7FvSvzzKPU5LpveZceU8XTcDkRe1Bmte3OGi2f0,29
35
+ hud/agent/tests/test_base.py,sha256=MAHx4QWsX4y4jXDoA1sxWw8uFvL7lIzGlXrnHfOTmkw,8511
36
+ hud/env/__init__.py,sha256=wVEesXMXM5hcNXQHt0-PN4-9RnE69DEnQENS7uJSv_Y,266
37
+ hud/env/client.py,sha256=brhfLkWGSuvxl3vqGMCQT-vXfj8rUbJMhE3zJg9WMDA,869
38
+ hud/env/docker_client.py,sha256=55PTFansUDzsRMT_43eSTVO9rb_wzl_s4aBpBqmMeXk,11749
39
+ hud/env/environment.py,sha256=wjMBwGs5qkkXsVlXR_Z2QPZi4cwXE82ckdzRgHiXPjw,17019
40
+ hud/env/local_docker_client.py,sha256=IIuPSV_KJsfCONJAIVkgq_2zgUJl-FE4e5tDkkbRp0Y,12442
41
+ hud/env/remote_client.py,sha256=tP5Gn1YtYgsjdXA4vM4FibAAHnR-9OOH4GrTog97cf8,6670
42
+ hud/env/remote_docker_client.py,sha256=sBoOz3cq9HMgVvX8qCYEhRLvdswMZLG9G4Ybc60RzDo,9574
43
+ hud/mcp/__init__.py,sha256=VBAZfpD8Ww59CkWb4CB0rGhNGqJYtc5y1gWZWHDaViQ,371
44
+ hud/mcp/base.py,sha256=H4CRVGG4aEXAk_qRk3iOi-KLf8AVuffmoXPTaSXD4_0,24376
45
+ hud/mcp/claude.py,sha256=XxXHjNnBvrS2Y98m0xTfFjZYgACCoFVTiNd01neffbM,12034
46
+ hud/mcp/client.py,sha256=qrmpk2syjJ56y-09Dg44RVjUCFfmf5bPXaQSY-2ih-k,11494
47
+ hud/mcp/langchain.py,sha256=hbKSCSQBf4W_pPpGEdy_KNoPA-T7Bsn_BLIDxaLzvVU,9251
48
+ hud/mcp/openai.py,sha256=tpYK4ixLWqxAUXatXhoIZUXMlK1oP8TUZjnkSxBQVMc,13244
49
+ hud/mcp/tests/__init__.py,sha256=W-O-_4i34d9TTyEHV-O_q1Ai1gLhzwDaaPo02_TWQIY,34
50
+ hud/mcp/tests/test_base.py,sha256=lrRZoyDN7T67kOfv1A5WESaSHsYCaodD2CJnFli-4A4,19125
51
+ hud/mcp/tests/test_claude.py,sha256=kGDThen8ij9QWx_YH3P9UvLlra1ueEMgA_clQ1q60II,11312
52
+ hud/mcp/tests/test_client.py,sha256=ffxKzLmY75v-9l3aceUkn7aTdoO3j6deA4KBE3l9gaQ,11975
53
+ hud/mcp/tests/test_openai.py,sha256=AhnBT_y-zMykQyJARDwKWiQWJsBGwNIlH6fGAzhJh88,9091
54
+ hud/server/__init__.py,sha256=IPxPCqtPLguryN-nBq78Sakypw2bRiE2iHv3SXG8YRk,139
55
+ hud/server/requests.py,sha256=AnFW4ELojjvfF6xjS2no6_fg4Rph2aR2hjPzYTede0Q,8841
56
+ hud/server/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
+ hud/server/tests/test_requests.py,sha256=63YCbykcib5MxKxm-OgHJPLX3QC7hmgIwnWaYukVM6s,9077
58
+ hud/telemetry/__init__.py,sha256=qSQhbXYy7c_sG7KhVr-5eiCmeREj6GQ2cijhbIR2-Z4,717
59
+ hud/telemetry/_trace.py,sha256=Di2zKByHaljL6H4VkA-Gh_085jRJQw2VTiMOHX_FKp0,11433
60
+ hud/telemetry/context.py,sha256=qwCdUQ3UX_Y_zfIHSAQ1cdJNv-VLh5y8ovXfLpjHKVY,7492
61
+ hud/telemetry/exporter.py,sha256=10NwliO35J0fStvspgzb93N5MTko3pYNJe0fuTs-gPQ,23225
62
+ hud/telemetry/job.py,sha256=eyjr7Ha2ijM0MIF5f0d1xFOScFUdFIqlmO8GzQZoAJc,4905
63
+ hud/telemetry/mcp_models.py,sha256=0FQZoXtKOKeUsc2L61UbANpUDC7VNL842R2YFR61UBQ,8980
64
+ hud/telemetry/instrumentation/__init__.py,sha256=vHmSqaJMMehgRNn6EN2SMoYDD12rSHkLeVmj7Uy1my0,88
65
+ hud/telemetry/instrumentation/mcp.py,sha256=RbEaqmp8QHj1XqpIzwDSE8gH2cN5UjaBTouRxiPWxmc,9339
66
+ hud/telemetry/instrumentation/registry.py,sha256=UVaSsEA693lvKYd5R3n3ve6GcAB1fwqubRwIVeZiNmo,1821
67
+ hud/telemetry/tests/__init__.py,sha256=QMN8OzfrBUDbQESwrwHCqXLdDwCjYWX8BJcpeLUJfqA,33
68
+ hud/telemetry/tests/test_context.py,sha256=RdtjYHsyvlkKoTQxk0VezaAISEoVQReYqQiqK3jgFLQ,6746
69
+ hud/telemetry/tests/test_trace.py,sha256=mCm5AH-NpuDVvRG-CZhMMqHiJ4dahvcy9KHmWmo6o3A,12494
70
+ hud/tools/__init__.py,sha256=T4PnE5nuBCXsTKXUYBHmaF1Ojc6D5vAa6wA2cFWJfTc,986
71
+ hud/tools/base.py,sha256=lmd7N7IccIWrPpA0NZundIglFTTiLFW9VP_PJI2EXug,2069
72
+ hud/tools/bash.py,sha256=o841_HF1NJFfUWLOVUw9s0iB4BoIxhA-8vMasJOhZ70,4319
73
+ hud/tools/edit.py,sha256=9vJ2XSnWOPViujQbZZuDjLahvzxoPHyAeXxgKfpUDHo,11796
74
+ hud/tools/playwright_tool.py,sha256=IQT1hk5U4H8BI988iZq0B2oS_fbgkaX01Z-ZXL4r71o,13724
75
+ hud/tools/utils.py,sha256=bfVyYMcBOJvr1QdptCjVb6jaHVGIL5WUxmY59kzMekQ,1447
76
+ hud/tools/computer/__init__.py,sha256=ehKY7u0_4cZ9h7YQlOQjbKPWfd5LhQq8ZQn2w2-l2mY,302
77
+ hud/tools/computer/anthropic.py,sha256=M-djQmd0vPZm95FDszaMh4wSaLFPhlcCUb-JkSuflnU,16104
78
+ hud/tools/computer/hud.py,sha256=xyFYLqVoLsps0Dbs9kAfg941kXLnMHx7SL8a2skhjHw,13351
79
+ hud/tools/computer/openai.py,sha256=pcMGfoT6O8Rh9IrW_H1Mw2cIwk-FzCswrgjW19piRU8,10538
80
+ hud/tools/executors/__init__.py,sha256=jHxfus9SLhkL6YGtebR5RyKYyVAix3yu5EkUp2Q27Kg,732
81
+ hud/tools/executors/base.py,sha256=4h04Byt4ktaNk_aLOOI798pkMCLiqA7pE2PoaEn_hfg,11647
82
+ hud/tools/executors/pyautogui.py,sha256=Kc2OcFw-sEuRBRFtO1ZrWeHs1p-p5FtEpESkzpRhOHk,22098
83
+ hud/tools/executors/xdo.py,sha256=C6ecIVPUba7c6vKpgIcNxKcc698hwelQjj4YYUxT2_4,17751
84
+ hud/tools/executors/tests/__init__.py,sha256=opFpGSH6cEqIZgt9izXd3Yt85pC7xkxiYmOZQTHf4AY,32
85
+ hud/tools/executors/tests/test_base_executor.py,sha256=dvpKHCIjrBhT6E2U3hsjAwuivCAYXplvd08EHN6cxTI,12306
86
+ hud/tools/executors/tests/test_pyautogui_executor.py,sha256=br-wVvXnRx9G6X0yJ_xeKZf2xl8o4LCnYLeaIbkpuzY,6608
87
+ hud/tools/helper/README.md,sha256=GDS-K-wMnDO3-gtWjisgk5153zBmU29XSrs2ZhlOWQY,1727
88
+ hud/tools/helper/__init__.py,sha256=VqgQkY-y9h-WnGXZRK387fSr1BzrOQoAy3975WDAs4c,209
89
+ hud/tools/helper/mcp_server.py,sha256=t8UaGq91hDKef6zO3ApnJydwcKEqgLF6RdDcJ1GmfEA,2248
90
+ hud/tools/helper/server_initialization.py,sha256=j3lymoyXf9nGX907Thf4kxDfkIQ7g4-3yiRvR1Ztqc0,4025
91
+ hud/tools/helper/utils.py,sha256=hfaJX9HX2vmytaIwk_NG-luSXHY4VhrzegELDtx7Lp8,1776
92
+ hud/tools/tests/__init__.py,sha256=eEYYkxX5Hz9woXVOBJ2H2_CQoEih0vH6nRt3sH2Z8v8,49
93
+ hud/tools/tests/test_bash.py,sha256=LV3LjijwkQqxuxIXFSepD2x3sYoY4uhdw8EBv4JOyLU,4847
94
+ hud/tools/tests/test_computer.py,sha256=HxYHxKJ0eWyZzC3abzviFBU-auc8x6Sh2ciR_uVXMXw,1595
95
+ hud/tools/tests/test_computer_actions.py,sha256=YtUNFL7anhpXrcvg8EoUY1CqIV-TAAyaNFLZO9CiJ40,1194
96
+ hud/tools/tests/test_edit.py,sha256=_Bfh9Qc_zSYK5vS9kfhm5G9tkVvX1dsEIFqE3jkeSv0,8527
97
+ hud/tools/tests/test_init.py,sha256=PD_SS6X6SPhEjStJqYxdJRtsa7RbL6cTokAGIn5bWhA,702
98
+ hud/tools/tests/test_playwright_tool.py,sha256=1qED_NF2QXUZmBRbWSmcKImMLUQ3m5CbA_9tLUiaxTQ,6696
99
+ hud/tools/tests/test_tools.py,sha256=KgSPgdqldpifbHeQHBFdYJVf3boWbvK6LRRRORPfTOg,4595
100
+ hud/tools/tests/test_utils.py,sha256=oYxEnLpSA5sEeYFGUTj74QRNv0AHP3AjmYYHXgIW0BY,5496
101
+ hud/utils/__init__.py,sha256=oSl_gGoS272X2VFnBYX8hLxcP2xgGoBYQXAuLhtQgw8,260
102
+ hud/utils/agent.py,sha256=CpNgjKWMaNqo-EATH_vfJHIN53rEkZngm2LXfUFlldQ,1225
103
+ hud/utils/common.py,sha256=_3HNmSOsHWyexP6iXTuU2wMx3Fafeg5hZU3VXBmv0Ag,7780
104
+ hud/utils/config.py,sha256=L_sSYtEaOap-Gnb2iLPJPQc2rteyt6mjOdJUrktmFwM,4020
105
+ hud/utils/misc.py,sha256=CfOv_ftLty1iEo3Rxyz4AD4nmaBkhCJVO_W-FlcyDgI,1481
106
+ hud/utils/progress.py,sha256=suikwFM8sdSfkV10nAOEaInDhG4XKgOSvFePg4jSj1A,5927
107
+ hud/utils/telemetry.py,sha256=hrVIx2rUjSGyy9IVxTZ_3Jii83PiHjyFRd5ls2whimM,1863
108
+ hud/utils/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
+ hud/utils/tests/test_common.py,sha256=KqDSMf7gWf1oYCiQ_BXsnvW1wUmyzbOzAT-HNoF7txs,9443
110
+ hud/utils/tests/test_config.py,sha256=dPlXYWuMrxX-NOYbf0vdJ27TJpfacKG8eiKOSGOcfDU,4079
111
+ hud/utils/tests/test_init.py,sha256=UxlNTwjlSE2q3M0R86EmMYmmXmbRvzZaC-S2av26QXI,529
112
+ hud/utils/tests/test_progress.py,sha256=QunwDgi_heQXhDgmC25zgjr-sFUu5FdJ_1aYigMKeIc,6351
113
+ hud/utils/tests/test_telemetry.py,sha256=t0An1RTBaE0dZVEpF4uwuq5k1R-PXFR5k4u71h60tx8,1224
114
+ hud/utils/tests/test_version.py,sha256=E3nH6EanHfkbILycFXBLt8KxHF2wXah-SjJS4199ZmA,159
115
+ hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
116
+ hud_python-0.3.1.dist-info/METADATA,sha256=qJBkPKsjYPHijWfbwTX81aEITSUtnk_zP9Twu5juQ80,10249
117
+ hud_python-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
118
+ hud_python-0.3.1.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
119
+ hud_python-0.3.1.dist-info/RECORD,,