hud-python 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +7 -4
- hud/adapters/common/adapter.py +14 -3
- hud/adapters/common/tests/test_adapter.py +16 -4
- hud/datasets.py +188 -0
- hud/env/docker_client.py +14 -2
- hud/env/local_docker_client.py +28 -6
- hud/gym.py +0 -9
- hud/{mcp_agent → mcp}/__init__.py +2 -0
- hud/mcp/base.py +631 -0
- hud/{mcp_agent → mcp}/claude.py +52 -47
- hud/mcp/client.py +312 -0
- hud/{mcp_agent → mcp}/langchain.py +52 -33
- hud/{mcp_agent → mcp}/openai.py +56 -40
- hud/{mcp_agent → mcp}/tests/test_base.py +129 -54
- hud/mcp/tests/test_claude.py +294 -0
- hud/mcp/tests/test_client.py +324 -0
- hud/mcp/tests/test_openai.py +238 -0
- hud/settings.py +6 -0
- hud/task.py +1 -88
- hud/taskset.py +2 -23
- hud/telemetry/__init__.py +5 -0
- hud/telemetry/_trace.py +180 -17
- hud/telemetry/context.py +79 -0
- hud/telemetry/exporter.py +165 -6
- hud/telemetry/job.py +141 -0
- hud/telemetry/tests/test_trace.py +36 -25
- hud/tools/__init__.py +14 -1
- hud/tools/executors/__init__.py +19 -2
- hud/tools/executors/pyautogui.py +84 -50
- hud/tools/executors/tests/test_pyautogui_executor.py +4 -1
- hud/tools/playwright_tool.py +73 -67
- hud/tools/tests/test_edit.py +8 -1
- hud/tools/tests/test_tools.py +3 -0
- hud/trajectory.py +5 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/METADATA +20 -14
- {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/RECORD +41 -46
- hud/evaluators/__init__.py +0 -9
- hud/evaluators/base.py +0 -32
- hud/evaluators/inspect.py +0 -24
- hud/evaluators/judge.py +0 -189
- hud/evaluators/match.py +0 -156
- hud/evaluators/remote.py +0 -65
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +0 -12
- hud/evaluators/tests/test_judge.py +0 -231
- hud/evaluators/tests/test_match.py +0 -115
- hud/evaluators/tests/test_remote.py +0 -98
- hud/mcp_agent/base.py +0 -723
- /hud/{mcp_agent → mcp}/tests/__init__.py +0 -0
- {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
- {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/job.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Job context manager for grouping related traces."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import sys
|
|
7
|
+
import uuid
|
|
8
|
+
from contextlib import contextmanager
|
|
9
|
+
from contextvars import ContextVar
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
from hud.telemetry.exporter import JobStatus, submit_to_worker_loop, update_job_status
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from collections.abc import Generator
|
|
16
|
+
from typing import Self
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("hud.telemetry")
|
|
19
|
+
|
|
20
|
+
# Context variables for current job
|
|
21
|
+
current_job_id: ContextVar[str | None] = ContextVar("current_job_id", default=None)
|
|
22
|
+
current_job_name: ContextVar[str | None] = ContextVar("current_job_name", default=None)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class JobContext:
|
|
26
|
+
"""Context manager for grouping traces under a job."""
|
|
27
|
+
|
|
28
|
+
def __init__(
|
|
29
|
+
self, name: str, taskset_name: str | None = None, metadata: dict[str, Any] | None = None
|
|
30
|
+
) -> None:
|
|
31
|
+
self.id = str(uuid.uuid4())
|
|
32
|
+
self.name = name
|
|
33
|
+
self.metadata = metadata or {}
|
|
34
|
+
self.taskset_name: str | None = taskset_name
|
|
35
|
+
|
|
36
|
+
def __enter__(self) -> Self:
|
|
37
|
+
# Auto-detect dataset
|
|
38
|
+
if self.taskset_name is None:
|
|
39
|
+
self._detect_dataset()
|
|
40
|
+
|
|
41
|
+
# Set context variables
|
|
42
|
+
current_job_id.set(self.id)
|
|
43
|
+
current_job_name.set(self.name)
|
|
44
|
+
|
|
45
|
+
# Send initial status
|
|
46
|
+
job_metadata = {**self.metadata}
|
|
47
|
+
coro = update_job_status(
|
|
48
|
+
self.id, JobStatus.RUNNING, metadata=job_metadata, taskset_name=self.taskset_name
|
|
49
|
+
)
|
|
50
|
+
submit_to_worker_loop(coro)
|
|
51
|
+
|
|
52
|
+
logger.info("Started job %s (ID: %s)", self.name, self.id)
|
|
53
|
+
return self
|
|
54
|
+
|
|
55
|
+
def __exit__(
|
|
56
|
+
self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: object
|
|
57
|
+
) -> None:
|
|
58
|
+
# Determine final status
|
|
59
|
+
if exc_type is not None:
|
|
60
|
+
# Job failed with exception
|
|
61
|
+
error_msg = f"{exc_type.__name__}: {exc_val}"
|
|
62
|
+
coro = update_job_status(
|
|
63
|
+
self.id, JobStatus.ERROR, error_message=error_msg, taskset_name=self.taskset_name
|
|
64
|
+
)
|
|
65
|
+
else:
|
|
66
|
+
# Job completed successfully
|
|
67
|
+
coro = update_job_status(self.id, JobStatus.COMPLETED, taskset_name=self.taskset_name)
|
|
68
|
+
|
|
69
|
+
submit_to_worker_loop(coro)
|
|
70
|
+
|
|
71
|
+
# Clear context
|
|
72
|
+
current_job_id.set(None)
|
|
73
|
+
current_job_name.set(None)
|
|
74
|
+
|
|
75
|
+
status = "failed" if exc_type else "completed"
|
|
76
|
+
logger.info("Job %s %s", self.name, status)
|
|
77
|
+
|
|
78
|
+
def _detect_dataset(self) -> None:
|
|
79
|
+
"""Auto-detect HuggingFace dataset in parent scope."""
|
|
80
|
+
try:
|
|
81
|
+
# Check frames 2 and 3 (with statement and parent scope)
|
|
82
|
+
for frame_depth in [2, 3]:
|
|
83
|
+
try:
|
|
84
|
+
frame = sys._getframe(frame_depth)
|
|
85
|
+
|
|
86
|
+
# Search for Dataset objects
|
|
87
|
+
for var_value in frame.f_locals.values():
|
|
88
|
+
if hasattr(var_value, "info") and hasattr(var_value.info, "builder_name"):
|
|
89
|
+
self.taskset_name = var_value.info.builder_name
|
|
90
|
+
logger.debug(
|
|
91
|
+
"Auto-detected dataset at frame %d: %s",
|
|
92
|
+
frame_depth,
|
|
93
|
+
self.taskset_name,
|
|
94
|
+
)
|
|
95
|
+
return
|
|
96
|
+
elif hasattr(var_value, "builder_name"):
|
|
97
|
+
# Older dataset format
|
|
98
|
+
self.taskset_name = var_value.builder_name
|
|
99
|
+
logger.debug(
|
|
100
|
+
"Auto-detected dataset at frame %d: %s",
|
|
101
|
+
frame_depth,
|
|
102
|
+
self.taskset_name,
|
|
103
|
+
)
|
|
104
|
+
return
|
|
105
|
+
except ValueError:
|
|
106
|
+
# Frame doesn't exist
|
|
107
|
+
continue
|
|
108
|
+
except Exception as e:
|
|
109
|
+
logger.debug("Dataset auto-detection failed: %s", e)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@contextmanager
|
|
113
|
+
def job(
|
|
114
|
+
name: str, taskset_name: str | None = None, metadata: dict[str, Any] | None = None
|
|
115
|
+
) -> Generator[JobContext, None, None]:
|
|
116
|
+
"""
|
|
117
|
+
Create a job context for grouping related traces.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
name: Name for the job
|
|
121
|
+
metadata: Optional metadata to include with the job
|
|
122
|
+
|
|
123
|
+
Example:
|
|
124
|
+
with hud.job("evaluation_run") as job:
|
|
125
|
+
for task in tasks:
|
|
126
|
+
with hud.trace(f"task_{task.id}"):
|
|
127
|
+
# Trace automatically includes job_id
|
|
128
|
+
result = await agent.run(task)
|
|
129
|
+
"""
|
|
130
|
+
with JobContext(name, taskset_name, metadata) as ctx:
|
|
131
|
+
yield ctx
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def get_current_job_id() -> str | None:
|
|
135
|
+
"""Get the current job ID if inside a job context."""
|
|
136
|
+
return current_job_id.get()
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def get_current_job_name() -> str | None:
|
|
140
|
+
"""Get the current job name if inside a job context."""
|
|
141
|
+
return current_job_name.get()
|
|
@@ -48,7 +48,7 @@ class TestTrace:
|
|
|
48
48
|
"hud.telemetry._trace.flush_buffer", return_value=[], autospec=True
|
|
49
49
|
)
|
|
50
50
|
mock_submit_loop = mocker.patch(
|
|
51
|
-
"hud.telemetry.
|
|
51
|
+
"hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
52
52
|
)
|
|
53
53
|
|
|
54
54
|
initial_root_state = actual_is_root_trace.get()
|
|
@@ -62,7 +62,8 @@ class TestTrace:
|
|
|
62
62
|
assert actual_get_current_task_run_id() is None
|
|
63
63
|
assert actual_is_root_trace.get() == initial_root_state
|
|
64
64
|
mock_flush.assert_called_once()
|
|
65
|
-
|
|
65
|
+
# submit_to_worker_loop is now called for status updates
|
|
66
|
+
assert mock_submit_loop.call_count == 2 # INITIALIZING and COMPLETED
|
|
66
67
|
|
|
67
68
|
def test_trace_with_name_and_attributes(self, mocker):
|
|
68
69
|
"""Test trace with name and attributes, checking they are passed on."""
|
|
@@ -71,7 +72,7 @@ class TestTrace:
|
|
|
71
72
|
"hud.telemetry._trace.flush_buffer", return_value=mock_mcp_calls, autospec=True
|
|
72
73
|
)
|
|
73
74
|
mock_submit_loop = mocker.patch(
|
|
74
|
-
"hud.telemetry.
|
|
75
|
+
"hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
75
76
|
)
|
|
76
77
|
|
|
77
78
|
trace_name = "test_trace_with_data"
|
|
@@ -81,7 +82,8 @@ class TestTrace:
|
|
|
81
82
|
assert isinstance(task_run_id, str)
|
|
82
83
|
|
|
83
84
|
mock_flush.assert_called_once()
|
|
84
|
-
|
|
85
|
+
# submit_to_worker_loop is now called for status updates
|
|
86
|
+
assert mock_submit_loop.call_count == 2 # INITIALIZING and COMPLETED
|
|
85
87
|
|
|
86
88
|
@pytest.mark.asyncio
|
|
87
89
|
async def test_trace_with_mcp_calls_exports(self, mocker):
|
|
@@ -91,14 +93,14 @@ class TestTrace:
|
|
|
91
93
|
"hud.telemetry._trace.flush_buffer", return_value=mock_mcp_calls, autospec=True
|
|
92
94
|
)
|
|
93
95
|
mock_submit_loop = mocker.patch(
|
|
94
|
-
"hud.telemetry.
|
|
96
|
+
"hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
95
97
|
)
|
|
96
98
|
|
|
97
99
|
async def mock_export(*args, **kwargs):
|
|
98
100
|
return None
|
|
99
101
|
|
|
100
|
-
|
|
101
|
-
"hud.telemetry.
|
|
102
|
+
mocker.patch(
|
|
103
|
+
"hud.telemetry.exporter.export_telemetry",
|
|
102
104
|
side_effect=mock_export,
|
|
103
105
|
)
|
|
104
106
|
|
|
@@ -109,16 +111,14 @@ class TestTrace:
|
|
|
109
111
|
pass
|
|
110
112
|
|
|
111
113
|
mock_flush.assert_called_once()
|
|
112
|
-
|
|
114
|
+
# submit_to_worker_loop is now called for status updates and export
|
|
115
|
+
# The exact count may vary depending on whether export_incremental is called
|
|
116
|
+
assert mock_submit_loop.call_count >= 2 # At least INITIALIZING and COMPLETED
|
|
113
117
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
assert
|
|
118
|
-
assert kwargs["trace_attributes"]["trace_name"] == test_name
|
|
119
|
-
assert kwargs["trace_attributes"]["custom_attr"] == "test_val"
|
|
120
|
-
assert "duration_seconds" in kwargs["trace_attributes"]
|
|
121
|
-
assert kwargs["trace_attributes"]["is_root_trace"] is True
|
|
118
|
+
# With the new export flow, export_telemetry is submitted to worker loop
|
|
119
|
+
# so we can't directly assert on it being called synchronously
|
|
120
|
+
# Instead, verify that the trace completed successfully
|
|
121
|
+
assert task_run_id is not None
|
|
122
122
|
|
|
123
123
|
def test_trace_nested(self, mocker):
|
|
124
124
|
"""Test nested traces, verifying context restoration and root trace logic."""
|
|
@@ -129,7 +129,7 @@ class TestTrace:
|
|
|
129
129
|
"hud.telemetry._trace.flush_buffer", return_value=[], autospec=True
|
|
130
130
|
)
|
|
131
131
|
mock_submit_loop_internal = mocker.patch(
|
|
132
|
-
"hud.telemetry.
|
|
132
|
+
"hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
133
133
|
)
|
|
134
134
|
|
|
135
135
|
assert actual_get_current_task_run_id() is None
|
|
@@ -148,7 +148,8 @@ class TestTrace:
|
|
|
148
148
|
assert actual_get_current_task_run_id() is None
|
|
149
149
|
assert actual_is_root_trace.get() is False
|
|
150
150
|
assert mock_flush_internal.call_count == 2
|
|
151
|
-
|
|
151
|
+
# submit_to_worker_loop is now called for status updates
|
|
152
|
+
assert mock_submit_loop_internal.call_count == 2 # Only outer trace sends status updates
|
|
152
153
|
|
|
153
154
|
def test_trace_exception_handling(self, mocker):
|
|
154
155
|
"""Test trace handles exceptions properly and restores context."""
|
|
@@ -161,7 +162,7 @@ class TestTrace:
|
|
|
161
162
|
"hud.telemetry._trace.flush_buffer", return_value=[], autospec=True
|
|
162
163
|
)
|
|
163
164
|
mock_submit_loop = mocker.patch(
|
|
164
|
-
"hud.telemetry.
|
|
165
|
+
"hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
|
|
165
166
|
)
|
|
166
167
|
|
|
167
168
|
with (
|
|
@@ -191,7 +192,7 @@ class TestTraceSync:
|
|
|
191
192
|
with trace(name="test_sync") as task_run_id:
|
|
192
193
|
assert task_run_id == "test-task-id"
|
|
193
194
|
|
|
194
|
-
mock_trace_open.assert_called_once_with(name="test_sync", attributes=None)
|
|
195
|
+
mock_trace_open.assert_called_once_with(name="test_sync", agent_model=None, attributes=None)
|
|
195
196
|
mock_flush.assert_called_once()
|
|
196
197
|
|
|
197
198
|
def test_trace_sync_with_attributes(self, mocker):
|
|
@@ -205,7 +206,9 @@ class TestTraceSync:
|
|
|
205
206
|
with trace(name="test_sync", attributes=attrs):
|
|
206
207
|
pass
|
|
207
208
|
|
|
208
|
-
mock_trace_open.assert_called_once_with(
|
|
209
|
+
mock_trace_open.assert_called_once_with(
|
|
210
|
+
name="test_sync", agent_model=None, attributes=attrs
|
|
211
|
+
)
|
|
209
212
|
mock_flush.assert_called_once()
|
|
210
213
|
|
|
211
214
|
|
|
@@ -224,7 +227,9 @@ class TestTraceDecorator:
|
|
|
224
227
|
|
|
225
228
|
result = sync_function(1, 2)
|
|
226
229
|
assert result == 3
|
|
227
|
-
mock_trace_open.assert_called_once_with(
|
|
230
|
+
mock_trace_open.assert_called_once_with(
|
|
231
|
+
name="test_func_sync", agent_model=None, attributes=None
|
|
232
|
+
)
|
|
228
233
|
|
|
229
234
|
def test_trace_decorator_async_function(self, mocker):
|
|
230
235
|
"""Test trace_decorator on asynchronous functions."""
|
|
@@ -239,7 +244,9 @@ class TestTraceDecorator:
|
|
|
239
244
|
async def run_test():
|
|
240
245
|
result = await async_function(1, 2)
|
|
241
246
|
assert result == 3
|
|
242
|
-
mock_trace_open.assert_called_once_with(
|
|
247
|
+
mock_trace_open.assert_called_once_with(
|
|
248
|
+
name="test_func_async", agent_model=None, attributes=None
|
|
249
|
+
)
|
|
243
250
|
|
|
244
251
|
asyncio.run(run_test())
|
|
245
252
|
|
|
@@ -257,7 +264,9 @@ class TestTraceDecorator:
|
|
|
257
264
|
|
|
258
265
|
result = func_with_attrs(5)
|
|
259
266
|
assert result == 10
|
|
260
|
-
mock_trace_open.assert_called_once_with(
|
|
267
|
+
mock_trace_open.assert_called_once_with(
|
|
268
|
+
name="test_func", agent_model=None, attributes=attrs
|
|
269
|
+
)
|
|
261
270
|
|
|
262
271
|
def test_trace_decorator_without_name(self, mocker):
|
|
263
272
|
"""Test trace_decorator uses module.function name when name not provided."""
|
|
@@ -273,7 +282,9 @@ class TestTraceDecorator:
|
|
|
273
282
|
assert result == "result"
|
|
274
283
|
# Should use module.function name
|
|
275
284
|
expected_name = f"{my_function.__module__}.my_function"
|
|
276
|
-
mock_trace_open.assert_called_once_with(
|
|
285
|
+
mock_trace_open.assert_called_once_with(
|
|
286
|
+
name=expected_name, agent_model=None, attributes=None
|
|
287
|
+
)
|
|
277
288
|
|
|
278
289
|
def test_trace_decorator_preserves_function_metadata(self):
|
|
279
290
|
"""Test trace_decorator preserves original function metadata."""
|
hud/tools/__init__.py
CHANGED
|
@@ -2,12 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
5
7
|
from .base import ToolError, ToolResult, tool_result_to_content_blocks
|
|
6
8
|
from .bash import BashTool
|
|
7
|
-
from .computer import AnthropicComputerTool, HudComputerTool, OpenAIComputerTool
|
|
8
9
|
from .edit import EditTool
|
|
9
10
|
from .playwright_tool import PlaywrightTool
|
|
10
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from .computer import AnthropicComputerTool, HudComputerTool, OpenAIComputerTool
|
|
14
|
+
|
|
11
15
|
__all__ = [
|
|
12
16
|
"AnthropicComputerTool",
|
|
13
17
|
"BashTool",
|
|
@@ -19,3 +23,12 @@ __all__ = [
|
|
|
19
23
|
"ToolResult",
|
|
20
24
|
"tool_result_to_content_blocks",
|
|
21
25
|
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def __getattr__(name: str) -> Any:
|
|
29
|
+
"""Lazy import computer tools to avoid importing pyautogui unless needed."""
|
|
30
|
+
if name in ("AnthropicComputerTool", "HudComputerTool", "OpenAIComputerTool"):
|
|
31
|
+
from . import computer
|
|
32
|
+
|
|
33
|
+
return getattr(computer, name)
|
|
34
|
+
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
|
hud/tools/executors/__init__.py
CHANGED
|
@@ -2,12 +2,29 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
5
7
|
from .base import BaseExecutor
|
|
6
|
-
|
|
7
|
-
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from .pyautogui import PyAutoGUIExecutor
|
|
11
|
+
from .xdo import XDOExecutor
|
|
8
12
|
|
|
9
13
|
__all__ = [
|
|
10
14
|
"BaseExecutor",
|
|
11
15
|
"PyAutoGUIExecutor",
|
|
12
16
|
"XDOExecutor",
|
|
13
17
|
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def __getattr__(name: str) -> Any:
|
|
21
|
+
"""Lazy import executors to avoid importing pyautogui unless needed."""
|
|
22
|
+
if name == "PyAutoGUIExecutor":
|
|
23
|
+
from .pyautogui import PyAutoGUIExecutor
|
|
24
|
+
|
|
25
|
+
return PyAutoGUIExecutor
|
|
26
|
+
elif name == "XDOExecutor":
|
|
27
|
+
from .xdo import XDOExecutor
|
|
28
|
+
|
|
29
|
+
return XDOExecutor
|
|
30
|
+
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
|