hud-python 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +22 -2
- hud/adapters/claude/adapter.py +9 -2
- hud/adapters/claude/tests/__init__.py +1 -0
- hud/adapters/claude/tests/test_adapter.py +519 -0
- hud/adapters/common/types.py +5 -1
- hud/adapters/operator/adapter.py +4 -0
- hud/adapters/operator/tests/__init__.py +1 -0
- hud/adapters/operator/tests/test_adapter.py +370 -0
- hud/agent/__init__.py +4 -0
- hud/agent/base.py +18 -2
- hud/agent/claude.py +20 -17
- hud/agent/claude_plays_pokemon.py +282 -0
- hud/agent/langchain.py +12 -7
- hud/agent/misc/__init__.py +3 -0
- hud/agent/misc/response_agent.py +80 -0
- hud/agent/operator.py +27 -19
- hud/agent/tests/__init__.py +1 -0
- hud/agent/tests/test_base.py +202 -0
- hud/env/docker_client.py +28 -18
- hud/env/environment.py +33 -17
- hud/env/local_docker_client.py +83 -42
- hud/env/remote_client.py +1 -3
- hud/env/remote_docker_client.py +72 -15
- hud/exceptions.py +12 -0
- hud/gym.py +71 -53
- hud/job.py +52 -7
- hud/settings.py +6 -0
- hud/task.py +45 -33
- hud/taskset.py +44 -4
- hud/telemetry/__init__.py +21 -0
- hud/telemetry/_trace.py +173 -0
- hud/telemetry/context.py +193 -0
- hud/telemetry/exporter.py +417 -0
- hud/telemetry/instrumentation/__init__.py +3 -0
- hud/telemetry/instrumentation/mcp.py +498 -0
- hud/telemetry/instrumentation/registry.py +59 -0
- hud/telemetry/mcp_models.py +331 -0
- hud/telemetry/tests/__init__.py +1 -0
- hud/telemetry/tests/test_context.py +203 -0
- hud/telemetry/tests/test_trace.py +270 -0
- hud/types.py +10 -26
- hud/utils/common.py +22 -2
- hud/utils/misc.py +53 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +7 -0
- {hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/METADATA +90 -22
- hud_python-0.2.5.dist-info/RECORD +84 -0
- hud_python-0.2.3.dist-info/RECORD +0 -62
- {hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/WHEEL +0 -0
- {hud_python-0.2.3.dist-info → hud_python-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import uuid
|
|
5
|
+
from unittest.mock import MagicMock
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from hud.telemetry._trace import (
|
|
10
|
+
init_telemetry,
|
|
11
|
+
register_trace,
|
|
12
|
+
trace,
|
|
13
|
+
)
|
|
14
|
+
from hud.telemetry.context import get_current_task_run_id as actual_get_current_task_run_id
|
|
15
|
+
from hud.telemetry.context import is_root_trace as actual_is_root_trace
|
|
16
|
+
from hud.telemetry.context import reset_context
|
|
17
|
+
from hud.telemetry.context import set_current_task_run_id as actual_set_current_task_run_id
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@pytest.fixture(autouse=True)
|
|
21
|
+
def reset_telemetry_context_fixture():
|
|
22
|
+
"""Ensures telemetry context is reset before and after each test in this file."""
|
|
23
|
+
reset_context()
|
|
24
|
+
yield
|
|
25
|
+
reset_context()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TestInitTelemetry:
|
|
29
|
+
"""Test telemetry initialization."""
|
|
30
|
+
|
|
31
|
+
def test_init_telemetry(self, mocker):
|
|
32
|
+
"""Test telemetry initialization calls registry.install_all."""
|
|
33
|
+
mock_registry = mocker.patch("hud.telemetry._trace.registry", autospec=True)
|
|
34
|
+
init_telemetry()
|
|
35
|
+
mock_registry.install_all.assert_called_once()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class TestTrace:
|
|
39
|
+
"""Test the trace context manager."""
|
|
40
|
+
|
|
41
|
+
def test_trace_basic(self, mocker):
|
|
42
|
+
"""Test basic trace functionality and context setting."""
|
|
43
|
+
mock_flush = mocker.patch(
|
|
44
|
+
"hud.telemetry._trace.flush_buffer", return_value=[], autospec=True
|
|
45
|
+
)
|
|
46
|
+
mock_submit_loop = mocker.patch(
|
|
47
|
+
"hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
initial_root_state = actual_is_root_trace.get()
|
|
51
|
+
|
|
52
|
+
with trace() as task_run_id:
|
|
53
|
+
assert isinstance(task_run_id, str)
|
|
54
|
+
uuid.UUID(task_run_id)
|
|
55
|
+
assert actual_get_current_task_run_id() == task_run_id
|
|
56
|
+
assert actual_is_root_trace.get() is True
|
|
57
|
+
|
|
58
|
+
assert actual_get_current_task_run_id() is None
|
|
59
|
+
assert actual_is_root_trace.get() == initial_root_state
|
|
60
|
+
mock_flush.assert_called_once()
|
|
61
|
+
mock_submit_loop.assert_not_called()
|
|
62
|
+
|
|
63
|
+
def test_trace_with_name_and_attributes(self, mocker):
|
|
64
|
+
"""Test trace with name and attributes, checking they are passed on."""
|
|
65
|
+
mock_mcp_calls = [MagicMock()]
|
|
66
|
+
mock_flush = mocker.patch(
|
|
67
|
+
"hud.telemetry._trace.flush_buffer", return_value=mock_mcp_calls, autospec=True
|
|
68
|
+
)
|
|
69
|
+
mock_submit_loop = mocker.patch(
|
|
70
|
+
"hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
trace_name = "test_trace_with_data"
|
|
74
|
+
attrs = {"key": "value", "number": 42}
|
|
75
|
+
|
|
76
|
+
with trace(name=trace_name, attributes=attrs) as task_run_id:
|
|
77
|
+
assert isinstance(task_run_id, str)
|
|
78
|
+
|
|
79
|
+
mock_flush.assert_called_once()
|
|
80
|
+
mock_submit_loop.assert_called_once()
|
|
81
|
+
|
|
82
|
+
@pytest.mark.asyncio
|
|
83
|
+
async def test_trace_with_mcp_calls_exports(self, mocker):
|
|
84
|
+
"""Test trace with MCP calls exports telemetry with correct data."""
|
|
85
|
+
mock_mcp_calls = [MagicMock(), MagicMock()]
|
|
86
|
+
mock_flush = mocker.patch(
|
|
87
|
+
"hud.telemetry._trace.flush_buffer", return_value=mock_mcp_calls, autospec=True
|
|
88
|
+
)
|
|
89
|
+
mock_submit_loop = mocker.patch(
|
|
90
|
+
"hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
async def mock_export(*args, **kwargs):
|
|
94
|
+
return None
|
|
95
|
+
|
|
96
|
+
mock_export_actual_coro = mocker.patch(
|
|
97
|
+
"hud.telemetry._trace.exporter.export_telemetry",
|
|
98
|
+
side_effect=mock_export,
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
test_attrs = {"custom_attr": "test_val"}
|
|
102
|
+
test_name = "mcp_export_test"
|
|
103
|
+
|
|
104
|
+
with trace(name=test_name, attributes=test_attrs) as task_run_id:
|
|
105
|
+
pass
|
|
106
|
+
|
|
107
|
+
mock_flush.assert_called_once()
|
|
108
|
+
mock_submit_loop.assert_called_once()
|
|
109
|
+
|
|
110
|
+
mock_export_actual_coro.assert_called_once()
|
|
111
|
+
args, kwargs = mock_export_actual_coro.call_args
|
|
112
|
+
assert kwargs["task_run_id"] == task_run_id
|
|
113
|
+
assert kwargs["mcp_calls"] == mock_mcp_calls
|
|
114
|
+
assert kwargs["trace_attributes"]["trace_name"] == test_name
|
|
115
|
+
assert kwargs["trace_attributes"]["custom_attr"] == "test_val"
|
|
116
|
+
assert "start_time" in kwargs["trace_attributes"]
|
|
117
|
+
assert "end_time" in kwargs["trace_attributes"]
|
|
118
|
+
assert "duration" in kwargs["trace_attributes"]
|
|
119
|
+
assert kwargs["trace_attributes"]["is_root"] is True
|
|
120
|
+
|
|
121
|
+
def test_trace_nested(self, mocker):
|
|
122
|
+
"""Test nested traces, verifying context restoration and root trace logic."""
|
|
123
|
+
actual_set_current_task_run_id(None)
|
|
124
|
+
actual_is_root_trace.set(False)
|
|
125
|
+
|
|
126
|
+
mock_flush_internal = mocker.patch(
|
|
127
|
+
"hud.telemetry._trace.flush_buffer", return_value=[], autospec=True
|
|
128
|
+
)
|
|
129
|
+
mock_submit_loop_internal = mocker.patch(
|
|
130
|
+
"hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
assert actual_get_current_task_run_id() is None
|
|
134
|
+
assert actual_is_root_trace.get() is False
|
|
135
|
+
|
|
136
|
+
with trace(name="outer") as outer_id:
|
|
137
|
+
assert actual_get_current_task_run_id() == outer_id
|
|
138
|
+
assert actual_is_root_trace.get() is True
|
|
139
|
+
with trace(name="inner") as inner_id:
|
|
140
|
+
assert actual_get_current_task_run_id() == inner_id
|
|
141
|
+
assert actual_is_root_trace.get() is False
|
|
142
|
+
assert outer_id != inner_id
|
|
143
|
+
assert actual_get_current_task_run_id() == outer_id
|
|
144
|
+
assert actual_is_root_trace.get() is True
|
|
145
|
+
|
|
146
|
+
assert actual_get_current_task_run_id() is None
|
|
147
|
+
assert actual_is_root_trace.get() is False
|
|
148
|
+
assert mock_flush_internal.call_count == 2
|
|
149
|
+
mock_submit_loop_internal.assert_not_called()
|
|
150
|
+
|
|
151
|
+
def test_trace_exception_handling(self, mocker):
|
|
152
|
+
"""Test trace handles exceptions properly and restores context."""
|
|
153
|
+
initial_task_id_before_trace = "pre_existing_id_123"
|
|
154
|
+
initial_root_state_before_trace = True
|
|
155
|
+
actual_set_current_task_run_id(initial_task_id_before_trace)
|
|
156
|
+
actual_is_root_trace.set(initial_root_state_before_trace)
|
|
157
|
+
|
|
158
|
+
mock_flush = mocker.patch(
|
|
159
|
+
"hud.telemetry._trace.flush_buffer", return_value=[], autospec=True
|
|
160
|
+
)
|
|
161
|
+
mock_submit_loop = mocker.patch(
|
|
162
|
+
"hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
with pytest.raises(ValueError, match="Test exception"), trace(name="trace_with_exception"):
|
|
166
|
+
assert actual_get_current_task_run_id() != initial_task_id_before_trace
|
|
167
|
+
assert actual_is_root_trace.get() is False
|
|
168
|
+
raise ValueError("Test exception")
|
|
169
|
+
|
|
170
|
+
mock_flush.assert_called_once()
|
|
171
|
+
assert actual_get_current_task_run_id() == initial_task_id_before_trace
|
|
172
|
+
assert actual_is_root_trace.get() == initial_root_state_before_trace
|
|
173
|
+
mock_submit_loop.assert_not_called()
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class TestRegisterTrace:
|
|
177
|
+
"""Test the register_trace decorator."""
|
|
178
|
+
|
|
179
|
+
def test_register_trace_sync_function(self, mocker):
|
|
180
|
+
mock_trace_context_manager = mocker.patch("hud.telemetry._trace.trace", autospec=True)
|
|
181
|
+
mock_trace_context_manager.return_value.__enter__.return_value = "mocked_task_id"
|
|
182
|
+
mock_trace_context_manager.return_value.__exit__.return_value = None
|
|
183
|
+
|
|
184
|
+
@register_trace(name="test_func_sync")
|
|
185
|
+
def sync_function(x, y):
|
|
186
|
+
return x + y
|
|
187
|
+
|
|
188
|
+
result = sync_function(1, 2)
|
|
189
|
+
assert result == 3
|
|
190
|
+
mock_trace_context_manager.assert_called_once_with(name="test_func_sync", attributes=None)
|
|
191
|
+
|
|
192
|
+
def test_register_trace_async_function(self, mocker):
|
|
193
|
+
mock_trace_context_manager = mocker.patch("hud.telemetry._trace.trace", autospec=True)
|
|
194
|
+
mock_trace_context_manager.return_value.__enter__.return_value = "mocked_task_id"
|
|
195
|
+
mock_trace_context_manager.return_value.__exit__.return_value = None
|
|
196
|
+
|
|
197
|
+
@register_trace(name="test_func_async")
|
|
198
|
+
async def async_function(x, y):
|
|
199
|
+
return x + y
|
|
200
|
+
|
|
201
|
+
async def run_test():
|
|
202
|
+
result = await async_function(1, 2)
|
|
203
|
+
assert result == 3
|
|
204
|
+
mock_trace_context_manager.assert_called_once_with(
|
|
205
|
+
name="test_func_async", attributes=None
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
asyncio.run(run_test())
|
|
209
|
+
|
|
210
|
+
def test_register_trace_with_attributes(self, mocker):
|
|
211
|
+
"""Test register_trace with attributes."""
|
|
212
|
+
mock_trace_context_manager = mocker.patch("hud.telemetry._trace.trace", autospec=True)
|
|
213
|
+
|
|
214
|
+
class _MockTraceContextManager:
|
|
215
|
+
def __enter__(self):
|
|
216
|
+
return "task_id"
|
|
217
|
+
|
|
218
|
+
def __exit__(self, exc_type, exc_value, traceback):
|
|
219
|
+
return None
|
|
220
|
+
|
|
221
|
+
mock_trace_context_manager.return_value = _MockTraceContextManager()
|
|
222
|
+
|
|
223
|
+
attrs = {"operation": "add"}
|
|
224
|
+
|
|
225
|
+
@register_trace(name="test_func", attributes=attrs)
|
|
226
|
+
def func_with_attrs(x):
|
|
227
|
+
return x * 2
|
|
228
|
+
|
|
229
|
+
result = func_with_attrs(5)
|
|
230
|
+
assert result == 10
|
|
231
|
+
mock_trace_context_manager.assert_called_once_with(name="test_func", attributes=attrs)
|
|
232
|
+
|
|
233
|
+
def test_register_trace_without_name(self, mocker):
|
|
234
|
+
"""Test register_trace uses function name when name not provided."""
|
|
235
|
+
mock_trace_context_manager = mocker.patch("hud.telemetry._trace.trace", autospec=True)
|
|
236
|
+
mock_trace_context_manager.return_value.__enter__.return_value = "task_id"
|
|
237
|
+
mock_trace_context_manager.return_value.__exit__.return_value = None
|
|
238
|
+
|
|
239
|
+
@register_trace()
|
|
240
|
+
def my_function():
|
|
241
|
+
return "result"
|
|
242
|
+
|
|
243
|
+
result = my_function()
|
|
244
|
+
assert result == "result"
|
|
245
|
+
mock_trace_context_manager.assert_called_once_with(name="my_function", attributes=None)
|
|
246
|
+
|
|
247
|
+
def test_register_trace_preserves_function_metadata(self):
|
|
248
|
+
"""Test register_trace preserves original function metadata."""
|
|
249
|
+
|
|
250
|
+
@register_trace(name="test")
|
|
251
|
+
def original_function():
|
|
252
|
+
"""Original docstring."""
|
|
253
|
+
|
|
254
|
+
assert original_function.__name__ == "original_function"
|
|
255
|
+
assert original_function.__doc__ == "Original docstring."
|
|
256
|
+
|
|
257
|
+
def test_register_trace_exception_propagation(self, mocker):
|
|
258
|
+
"""Test register_trace propagates exceptions."""
|
|
259
|
+
mock_trace_context_manager = mocker.patch("hud.telemetry._trace.trace", autospec=True)
|
|
260
|
+
mock_trace_context_manager.return_value.__enter__.return_value = "task_id"
|
|
261
|
+
mock_trace_context_manager.return_value.__exit__.return_value = None
|
|
262
|
+
|
|
263
|
+
@register_trace()
|
|
264
|
+
def failing_function():
|
|
265
|
+
raise RuntimeError("Test error")
|
|
266
|
+
|
|
267
|
+
with pytest.raises(RuntimeError, match="Test error"):
|
|
268
|
+
failing_function()
|
|
269
|
+
|
|
270
|
+
mock_trace_context_manager.assert_called_once()
|
hud/types.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import enum
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import
|
|
5
|
+
from typing import Literal, TypeAlias
|
|
6
6
|
|
|
7
7
|
from pydantic import BaseModel
|
|
8
8
|
|
|
@@ -12,38 +12,22 @@ class CustomGym(BaseModel):
|
|
|
12
12
|
Public environment specification with a dockerfile and controller.
|
|
13
13
|
|
|
14
14
|
If the location is remote, the env will be created on the server.
|
|
15
|
-
If the location is
|
|
15
|
+
If the location is local, the env will be created locally via docker.
|
|
16
16
|
|
|
17
17
|
The dockerfile can be specified directly or automatically found in the controller_source_dir.
|
|
18
18
|
If neither is provided, an error will be raised during validation.
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
21
|
type: Literal["public"] = "public"
|
|
22
|
-
dockerfile: str | None = None
|
|
23
22
|
location: Literal["local", "remote"]
|
|
24
|
-
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
#
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# Convert string path to Path object if needed
|
|
33
|
-
if isinstance(self.controller_source_dir, str):
|
|
34
|
-
self.controller_source_dir = Path(self.controller_source_dir)
|
|
35
|
-
|
|
36
|
-
if self.dockerfile is None:
|
|
37
|
-
if self.controller_source_dir is None:
|
|
38
|
-
raise ValueError("Either dockerfile or controller_source_dir must be provided")
|
|
39
|
-
|
|
40
|
-
# Look for Dockerfile in the controller_source_dir
|
|
41
|
-
dockerfile_path = self.controller_source_dir / "Dockerfile"
|
|
42
|
-
if not dockerfile_path.exists():
|
|
43
|
-
raise ValueError(f"Dockerfile not found in {self.controller_source_dir}")
|
|
44
|
-
|
|
45
|
-
# Read the Dockerfile content
|
|
46
|
-
self.dockerfile = dockerfile_path.read_text()
|
|
23
|
+
# A. If path, then it is a docker build context on the local computer.
|
|
24
|
+
# If the location is local, docker build will be used to create the image.
|
|
25
|
+
# If the location is remote, we will build the image remotely.
|
|
26
|
+
# The controller will be automatically installed and kept in sync with local changes
|
|
27
|
+
# as long as a pyproject.toml is present at the root of the folder.
|
|
28
|
+
# B. If string, then it is the uri of the docker image to use.
|
|
29
|
+
# The controller must already be installed in the image.
|
|
30
|
+
image_or_build_context: str | Path
|
|
47
31
|
|
|
48
32
|
|
|
49
33
|
class EnvironmentStatus(str, enum.Enum):
|
hud/utils/common.py
CHANGED
|
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import io
|
|
4
4
|
import logging
|
|
5
5
|
import tarfile
|
|
6
|
+
import zipfile
|
|
6
7
|
from typing import TYPE_CHECKING, Any, TypedDict
|
|
7
8
|
|
|
8
9
|
from pydantic import BaseModel
|
|
@@ -22,6 +23,7 @@ class FunctionConfig(BaseModel):
|
|
|
22
23
|
args: list[Any] # Must be json serializable
|
|
23
24
|
|
|
24
25
|
id: str | None = None # Optional id for remote execution
|
|
26
|
+
metadata: dict[str, Any] | None = None # Optional metadata for telemetry
|
|
25
27
|
|
|
26
28
|
def __len__(self) -> int:
|
|
27
29
|
return len(self.args)
|
|
@@ -33,11 +35,12 @@ class FunctionConfig(BaseModel):
|
|
|
33
35
|
return iter(self.args)
|
|
34
36
|
|
|
35
37
|
def __str__(self) -> str:
|
|
36
|
-
return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
|
|
38
|
+
return f"FC: {self.function}: {', '.join(str(arg) for arg in self.args)} ({self.metadata})"
|
|
37
39
|
|
|
38
40
|
|
|
39
41
|
# Type alias for the shorthand config, which just converts to function name and args
|
|
40
|
-
|
|
42
|
+
BasicType = str | int | float | bool | None
|
|
43
|
+
ShorthandConfig = tuple[BasicType | dict[str, Any] | list[BasicType] | list[dict[str, Any]], ...]
|
|
41
44
|
|
|
42
45
|
# Type alias for multiple config formats
|
|
43
46
|
FunctionConfigs = (
|
|
@@ -62,6 +65,11 @@ class Observation(BaseModel):
|
|
|
62
65
|
screenshot: str | None = None # base64 string png
|
|
63
66
|
text: str | None = None
|
|
64
67
|
|
|
68
|
+
def __str__(self) -> str:
|
|
69
|
+
return f"""Observation(screenshot={
|
|
70
|
+
self.screenshot[:100] if self.screenshot else "None"
|
|
71
|
+
}..., text={self.text}...)"""
|
|
72
|
+
|
|
65
73
|
|
|
66
74
|
class ExecuteResult(TypedDict):
|
|
67
75
|
"""
|
|
@@ -107,6 +115,18 @@ def directory_to_tar_bytes(directory_path: Path) -> bytes:
|
|
|
107
115
|
return output.getvalue()
|
|
108
116
|
|
|
109
117
|
|
|
118
|
+
def directory_to_zip_bytes(context_dir: Path) -> bytes:
|
|
119
|
+
"""Zip a directory and return the zip archive as bytes."""
|
|
120
|
+
output = io.BytesIO()
|
|
121
|
+
with zipfile.ZipFile(output, "w", zipfile.ZIP_DEFLATED) as zipf:
|
|
122
|
+
for file_path in context_dir.rglob("*"):
|
|
123
|
+
if file_path.is_file():
|
|
124
|
+
rel_path = file_path.relative_to(context_dir)
|
|
125
|
+
logger.debug("Adding %s to zip archive", rel_path)
|
|
126
|
+
zipf.write(str(file_path), arcname=str(rel_path))
|
|
127
|
+
return output.getvalue()
|
|
128
|
+
|
|
129
|
+
|
|
110
130
|
async def get_gym_id(gym_name_or_id: str) -> str:
|
|
111
131
|
"""
|
|
112
132
|
Get the gym ID for a given gym name or ID.
|
hud/utils/misc.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
from hud.server import make_request
|
|
7
|
+
from hud.settings import settings
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from hud.env.environment import Environment # Import Environment for type hinting
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def upload_env_telemetry(
|
|
16
|
+
environment: Environment,
|
|
17
|
+
results: Any,
|
|
18
|
+
api_key: str | None = None,
|
|
19
|
+
) -> None:
|
|
20
|
+
"""
|
|
21
|
+
Sends telemetry data (results from a cloud runner) to the HUD telemetry upload endpoint.
|
|
22
|
+
"""
|
|
23
|
+
environment_id = environment.client.env_id # type: ignore
|
|
24
|
+
|
|
25
|
+
if not api_key:
|
|
26
|
+
api_key = settings.api_key
|
|
27
|
+
|
|
28
|
+
if not api_key:
|
|
29
|
+
raise ValueError("API key must be provided either as an argument or set in hud.settings.")
|
|
30
|
+
|
|
31
|
+
endpoint_url = f"{settings.base_url}/v2/environments/{environment_id}/telemetry-upload"
|
|
32
|
+
|
|
33
|
+
request_payload = {
|
|
34
|
+
"results": {
|
|
35
|
+
"steps": results,
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
logger.debug("Sending telemetry to %s for env_id: %s", endpoint_url, environment_id)
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
await make_request(
|
|
43
|
+
method="POST",
|
|
44
|
+
url=endpoint_url,
|
|
45
|
+
json=request_payload,
|
|
46
|
+
api_key=api_key,
|
|
47
|
+
)
|
|
48
|
+
logger.info("Successfully uploaded telemetry for environment_id: %s", environment_id)
|
|
49
|
+
except Exception as e:
|
|
50
|
+
logger.error(
|
|
51
|
+
"Failed to upload telemetry for environment_id: %s. Error: %s", environment_id, e
|
|
52
|
+
)
|
|
53
|
+
raise
|
hud/utils/tests/test_version.py
CHANGED
hud/version.py
ADDED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
|
|
@@ -38,11 +38,13 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
38
38
|
Requires-Python: <3.14,>=3.10
|
|
39
39
|
Requires-Dist: aiodocker>=0.24.0
|
|
40
40
|
Requires-Dist: anthropic
|
|
41
|
+
Requires-Dist: dotenv>=0.9.9
|
|
41
42
|
Requires-Dist: httpx<1,>=0.23.0
|
|
42
43
|
Requires-Dist: inspect-ai>=0.3.80
|
|
43
44
|
Requires-Dist: ipykernel
|
|
44
45
|
Requires-Dist: langchain
|
|
45
46
|
Requires-Dist: langchain-openai
|
|
47
|
+
Requires-Dist: mcp
|
|
46
48
|
Requires-Dist: numpy
|
|
47
49
|
Requires-Dist: openai
|
|
48
50
|
Requires-Dist: pillow>=11.1.0
|
|
@@ -50,6 +52,7 @@ Requires-Dist: pydantic-settings<3,>=2
|
|
|
50
52
|
Requires-Dist: pydantic<3,>=2
|
|
51
53
|
Requires-Dist: textdistance<5,>=4.5.0
|
|
52
54
|
Requires-Dist: toml>=0.10.2
|
|
55
|
+
Requires-Dist: wrapt>=1.14.0
|
|
53
56
|
Provides-Extra: dev
|
|
54
57
|
Requires-Dist: anthropic; extra == 'dev'
|
|
55
58
|
Requires-Dist: dotenv; extra == 'dev'
|
|
@@ -66,15 +69,68 @@ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
|
|
|
66
69
|
Requires-Dist: ruff==0.11.8; extra == 'dev'
|
|
67
70
|
Description-Content-Type: text/markdown
|
|
68
71
|
|
|
69
|
-
|
|
72
|
+
<div align="left">
|
|
73
|
+
<img src="https://raw.githubusercontent.com/hud-evals/hud-sdk/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 20px;"/>
|
|
74
|
+
</div>
|
|
75
|
+
|
|
76
|
+
<h3>
|
|
77
|
+
Create, evaluate, and improve AI agents across web browsers, desktop environments, and custom scenarios.
|
|
78
|
+
</h3>
|
|
79
|
+
|
|
80
|
+
> ### 🚀 Are you a startup building agents?
|
|
81
|
+
>
|
|
82
|
+
> [📅 Hop on a call ](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
|
|
83
|
+
>
|
|
84
|
+
> We're here to help with eval strategies, custom environments, or improving your agent architecture!
|
|
70
85
|
|
|
71
|
-
A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
|
|
72
86
|
|
|
73
87
|
> **Early Release Notice**: This SDK is currently in early release status. The API is evolving and may change in future releases as we gather feedback and improve functionality.
|
|
74
88
|
|
|
75
89
|
[](https://pypi.org/project/hud-python/)
|
|
76
90
|
|
|
77
|
-
|
|
91
|
+
## ✨ What You Can Do
|
|
92
|
+
|
|
93
|
+
**Evaluate Existing Benchmarks**
|
|
94
|
+
```python
|
|
95
|
+
from hud import load_taskset, run_job, ClaudeAgent
|
|
96
|
+
|
|
97
|
+
taskset = await load_taskset("WebVoyager") # or GAIA, OSWorld-Ubuntu, Mind2Web
|
|
98
|
+
job = await run_job(ClaudeAgent, taskset, "my-evaluation")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Create Custom Tasks**
|
|
102
|
+
```python
|
|
103
|
+
from hud.task import Task
|
|
104
|
+
|
|
105
|
+
task = Task(
|
|
106
|
+
prompt="Find and book the cheapest flight from NYC to Paris",
|
|
107
|
+
gym="hud-browser",
|
|
108
|
+
setup=("goto", "https://kayak.com"),
|
|
109
|
+
evaluate=("page_contains", "confirmation")
|
|
110
|
+
)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
**Build Custom Environments**
|
|
114
|
+
```python
|
|
115
|
+
from hud.types import CustomGym
|
|
116
|
+
|
|
117
|
+
# Launch any website as an environment
|
|
118
|
+
custom_gym = CustomGym(
|
|
119
|
+
image_or_build_context="nginx:alpine",
|
|
120
|
+
location="local"
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# Or create complex Docker environments - see environments/ folder for examples
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
**Trace Tool Calls Alongside HUD Environments (or Independently)**
|
|
127
|
+
```python
|
|
128
|
+
import hud
|
|
129
|
+
|
|
130
|
+
with hud.trace("my-agent-run"):
|
|
131
|
+
# Your agent code here - MCP calls automatically captured
|
|
132
|
+
result = await agent.run(task)
|
|
133
|
+
```
|
|
78
134
|
|
|
79
135
|
## API Key Setup
|
|
80
136
|
|
|
@@ -119,7 +175,7 @@ async def main():
|
|
|
119
175
|
# Create environment using the gym module
|
|
120
176
|
env = await gym.make(task)
|
|
121
177
|
|
|
122
|
-
# Initialize
|
|
178
|
+
# Initialize Claude agent (API key is loaded automatically)
|
|
123
179
|
agent = ClaudeAgent()
|
|
124
180
|
|
|
125
181
|
# Agent loop with predict and step functions
|
|
@@ -137,7 +193,6 @@ async def main():
|
|
|
137
193
|
|
|
138
194
|
if __name__ == "__main__":
|
|
139
195
|
asyncio.run(main())
|
|
140
|
-
|
|
141
196
|
```
|
|
142
197
|
|
|
143
198
|
Alternatively, run a full evaluation set via the ```run_job``` command:
|
|
@@ -145,32 +200,45 @@ Alternatively, run a full evaluation set via the ```run_job``` command:
|
|
|
145
200
|
```python
|
|
146
201
|
from hud import load_taskset, run_job, ClaudeAgent
|
|
147
202
|
|
|
148
|
-
#
|
|
203
|
+
# Load a benchmark
|
|
149
204
|
taskset = await load_taskset("GAIA")
|
|
150
205
|
|
|
151
|
-
#
|
|
206
|
+
# Evaluate
|
|
152
207
|
job = await run_job(ClaudeAgent, taskset, "test-gaia-job")
|
|
153
208
|
|
|
154
|
-
#
|
|
209
|
+
# Get results OR view them in app.hud.so
|
|
155
210
|
print(await job.get_analytics())
|
|
156
211
|
```
|
|
157
212
|
|
|
213
|
+
## Ready-to-Use TaskSets
|
|
214
|
+
|
|
215
|
+
- **WebVoyager** - Web navigation and interaction
|
|
216
|
+
- **Mind2Web** - Complex web application tasks
|
|
217
|
+
- **GAIA** - Question answering and reasoning
|
|
218
|
+
- **OSWorld-Ubuntu** - Desktop interaction
|
|
219
|
+
- **hud-samples** - Getting started examples
|
|
220
|
+
|
|
221
|
+
## Community
|
|
222
|
+
|
|
223
|
+
**Contributing Custom Environments**
|
|
224
|
+
|
|
225
|
+
Add your environment to the `environments/` folder and submit a PR! Examples:
|
|
226
|
+
- `environments/novnc_ubuntu/` - Ubuntu with VNC access
|
|
227
|
+
- `environments/pokemon_controller/` - Pokemon emulator environment (In Development)
|
|
228
|
+
- `environments/qa_controller/` - Lightweight app sample
|
|
229
|
+
|
|
230
|
+
See [Custom Environments Guide](https://docs.hud.so/environment-creation) for details.
|
|
231
|
+
|
|
158
232
|
## Documentation Sections
|
|
159
233
|
|
|
160
234
|
Explore the core concepts and features of the SDK:
|
|
161
235
|
|
|
162
|
-
* **[
|
|
163
|
-
* **[Environments](https://
|
|
164
|
-
* **[Agents](https://
|
|
165
|
-
* **[
|
|
166
|
-
* **[
|
|
167
|
-
* **[
|
|
168
|
-
* **Advanced Topics**:
|
|
169
|
-
* **[CLA Action Details](https://documentation.hud.so/advanced/cla-details)**: Explore the standardized action format.
|
|
170
|
-
* **[Custom Environments](https://documentation.hud.so/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
|
|
171
|
-
* **[Advanced Environment Control](https://documentation.hud.so/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
|
|
172
|
-
|
|
173
|
-
* **[Full API Reference](https://documentation.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
|
|
236
|
+
* **[Task Creation](https://docs.hud.so/task-creation)**: Build custom evaluation scenarios with setup and evaluation criteria.
|
|
237
|
+
* **[Environments](https://docs.hud.so/environments/browser)**: Understand browser environments and create custom Docker-based environments.
|
|
238
|
+
* **[Agents](https://docs.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
|
|
239
|
+
* **[Jobs](https://docs.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
|
|
240
|
+
* **[MCP Telemetry](https://docs.hud.so/telemetry/mcp)**: Automatic tracing of Model Context Protocol interactions.
|
|
241
|
+
* **[Full API Reference](https://docs.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
|
|
174
242
|
|
|
175
243
|
## [Examples](examples/)
|
|
176
244
|
|
|
@@ -183,7 +251,7 @@ We recommend you first take a look at the example notebooks showing how to use t
|
|
|
183
251
|
|
|
184
252
|
## Documentation
|
|
185
253
|
|
|
186
|
-
For comprehensive guides, examples, and API reference, visit [our docs](https://
|
|
254
|
+
For comprehensive guides, examples, and API reference, visit [our docs](https://docs.hud.so/introduction)
|
|
187
255
|
|
|
188
256
|
## License
|
|
189
257
|
|