hud-python 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (53) hide show
  1. hud/__init__.py +7 -4
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +14 -2
  6. hud/env/local_docker_client.py +28 -6
  7. hud/gym.py +0 -9
  8. hud/{mcp_agent → mcp}/__init__.py +2 -0
  9. hud/mcp/base.py +631 -0
  10. hud/{mcp_agent → mcp}/claude.py +52 -47
  11. hud/mcp/client.py +312 -0
  12. hud/{mcp_agent → mcp}/langchain.py +52 -33
  13. hud/{mcp_agent → mcp}/openai.py +56 -40
  14. hud/{mcp_agent → mcp}/tests/test_base.py +129 -54
  15. hud/mcp/tests/test_claude.py +294 -0
  16. hud/mcp/tests/test_client.py +324 -0
  17. hud/mcp/tests/test_openai.py +238 -0
  18. hud/settings.py +6 -0
  19. hud/task.py +1 -88
  20. hud/taskset.py +2 -23
  21. hud/telemetry/__init__.py +5 -0
  22. hud/telemetry/_trace.py +180 -17
  23. hud/telemetry/context.py +79 -0
  24. hud/telemetry/exporter.py +165 -6
  25. hud/telemetry/job.py +141 -0
  26. hud/telemetry/tests/test_trace.py +36 -25
  27. hud/tools/__init__.py +14 -1
  28. hud/tools/executors/__init__.py +19 -2
  29. hud/tools/executors/pyautogui.py +84 -50
  30. hud/tools/executors/tests/test_pyautogui_executor.py +4 -1
  31. hud/tools/playwright_tool.py +73 -67
  32. hud/tools/tests/test_edit.py +8 -1
  33. hud/tools/tests/test_tools.py +3 -0
  34. hud/trajectory.py +5 -1
  35. hud/utils/tests/test_version.py +1 -1
  36. hud/version.py +1 -1
  37. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/METADATA +20 -14
  38. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/RECORD +41 -46
  39. hud/evaluators/__init__.py +0 -9
  40. hud/evaluators/base.py +0 -32
  41. hud/evaluators/inspect.py +0 -24
  42. hud/evaluators/judge.py +0 -189
  43. hud/evaluators/match.py +0 -156
  44. hud/evaluators/remote.py +0 -65
  45. hud/evaluators/tests/__init__.py +0 -0
  46. hud/evaluators/tests/test_inspect.py +0 -12
  47. hud/evaluators/tests/test_judge.py +0 -231
  48. hud/evaluators/tests/test_match.py +0 -115
  49. hud/evaluators/tests/test_remote.py +0 -98
  50. hud/mcp_agent/base.py +0 -723
  51. /hud/{mcp_agent → mcp}/tests/__init__.py +0 -0
  52. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  53. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/job.py ADDED
@@ -0,0 +1,141 @@
1
+ """Job context manager for grouping related traces."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import sys
7
+ import uuid
8
+ from contextlib import contextmanager
9
+ from contextvars import ContextVar
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ from hud.telemetry.exporter import JobStatus, submit_to_worker_loop, update_job_status
13
+
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Generator
16
+ from typing import Self
17
+
18
+ logger = logging.getLogger("hud.telemetry")
19
+
20
+ # Context variables for current job
21
+ current_job_id: ContextVar[str | None] = ContextVar("current_job_id", default=None)
22
+ current_job_name: ContextVar[str | None] = ContextVar("current_job_name", default=None)
23
+
24
+
25
class JobContext:
    """Context manager that groups traces under a single job.

    On entry the job's ID and name are published through the module-level
    ``current_job_id`` / ``current_job_name`` context variables and a
    ``JobStatus.RUNNING`` update is submitted to the worker loop. On exit a
    ``JobStatus.COMPLETED`` or ``JobStatus.ERROR`` update is submitted and the
    context variables are restored to the values they held before entry, so
    that nested jobs do not wipe out the enclosing job's context.

    Args:
        name: Human-readable name for the job.
        taskset_name: Optional taskset/dataset name. When ``None``, entry
            attempts a best-effort auto-detection (see ``_detect_dataset``).
        metadata: Optional metadata sent with the initial status update.
    """

    def __init__(
        self, name: str, taskset_name: str | None = None, metadata: dict[str, Any] | None = None
    ) -> None:
        self.id = str(uuid.uuid4())
        self.name = name
        self.metadata = metadata or {}
        self.taskset_name: str | None = taskset_name
        # (id_token, name_token) returned by ContextVar.set() in __enter__;
        # None while the context is not active.
        self._reset_tokens: tuple[Any, Any] | None = None

    def __enter__(self) -> Self:
        """Activate the job: publish its ID/name and report it as running."""
        # Auto-detect dataset
        if self.taskset_name is None:
            self._detect_dataset()

        # Set context variables, remembering the tokens so that __exit__ can
        # restore whatever was active before (e.g. an enclosing job).
        self._reset_tokens = (
            current_job_id.set(self.id),
            current_job_name.set(self.name),
        )

        # Send initial status (copy the metadata so later mutation of
        # self.metadata does not affect the payload already handed off).
        job_metadata = {**self.metadata}
        coro = update_job_status(
            self.id, JobStatus.RUNNING, metadata=job_metadata, taskset_name=self.taskset_name
        )
        submit_to_worker_loop(coro)

        logger.info("Started job %s (ID: %s)", self.name, self.id)
        return self

    def __exit__(
        self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: object
    ) -> None:
        """Report the final job status and restore the previous job context.

        Returns ``None``, so an exception raised inside the ``with`` body is
        never suppressed.
        """
        try:
            if exc_type is not None:
                # Job failed with exception
                error_msg = f"{exc_type.__name__}: {exc_val}"
                coro = update_job_status(
                    self.id,
                    JobStatus.ERROR,
                    error_message=error_msg,
                    taskset_name=self.taskset_name,
                )
            else:
                # Job completed successfully
                coro = update_job_status(
                    self.id, JobStatus.COMPLETED, taskset_name=self.taskset_name
                )

            submit_to_worker_loop(coro)
        finally:
            # Always leave the context variables consistent, even if the
            # status update could not be created or submitted.
            self._restore_context()

        status = "failed" if exc_type else "completed"
        logger.info("Job %s %s", self.name, status)

    def _restore_context(self) -> None:
        """Restore the job context variables to their pre-``__enter__`` values.

        Falls back to clearing both variables when the saved tokens are
        missing or can no longer be used.
        """
        tokens, self._reset_tokens = self._reset_tokens, None
        if tokens is not None:
            id_token, name_token = tokens
            try:
                current_job_id.reset(id_token)
                current_job_name.reset(name_token)
            except (ValueError, RuntimeError):
                # ValueError: token was created in a different Context;
                # RuntimeError: token was already used. Fall through to the
                # plain "clear" behaviour below.
                pass
            else:
                return

        current_job_id.set(None)
        current_job_name.set(None)

    def _detect_dataset(self) -> None:
        """Best-effort auto-detection of a HuggingFace-style dataset name.

        Scans the local variables of the frames 2 and 3 levels above this
        method and stores the first ``builder_name`` found in
        ``self.taskset_name`` (read from ``obj.info.builder_name`` or, failing
        that, from ``obj.builder_name``). Failures are logged at debug level
        and otherwise ignored; ``self.taskset_name`` is left untouched when
        nothing is found.
        """
        # NOTE(review): when this context is entered through the `job()`
        # generator wrapper, contextlib adds frames between this method and
        # the user's scope -- confirm that depths 2-3 reach the intended frame.
        try:
            # Check frames 2 and 3 (with statement and parent scope)
            for frame_depth in (2, 3):
                try:
                    frame = sys._getframe(frame_depth)

                    # Search for Dataset objects
                    for var_value in frame.f_locals.values():
                        if hasattr(var_value, "info") and hasattr(
                            var_value.info, "builder_name"
                        ):
                            detected = var_value.info.builder_name
                        elif hasattr(var_value, "builder_name"):
                            # Older dataset format
                            detected = var_value.builder_name
                        else:
                            continue

                        self.taskset_name = detected
                        logger.debug(
                            "Auto-detected dataset at frame %d: %s",
                            frame_depth,
                            self.taskset_name,
                        )
                        return
                except ValueError:
                    # Frame doesn't exist
                    continue
        except Exception as e:
            logger.debug("Dataset auto-detection failed: %s", e)
110
+
111
+
112
@contextmanager
def job(
    name: str, taskset_name: str | None = None, metadata: dict[str, Any] | None = None
) -> Generator[JobContext, None, None]:
    """
    Open a job scope so that traces created inside it are grouped together.

    This is a thin generator wrapper around ``JobContext``: the context is
    entered before the body runs and exited when the body finishes or raises.

    Args:
        name: Name for the job
        taskset_name: Optional taskset/dataset name; when omitted,
            ``JobContext`` attempts to auto-detect one on entry
        metadata: Optional metadata to include with the job

    Yields:
        The active ``JobContext``.

    Example:
        with hud.job("evaluation_run") as job:
            for task in tasks:
                with hud.trace(f"task_{task.id}"):
                    # Trace automatically includes job_id
                    result = await agent.run(task)
    """
    with JobContext(name, taskset_name, metadata) as active_job:
        yield active_job
132
+
133
+
134
def get_current_job_id() -> str | None:
    """Return the ID of the job active in the current context, or None if there is none."""
    active_id = current_job_id.get()
    return active_id
137
+
138
+
139
def get_current_job_name() -> str | None:
    """Return the name of the job active in the current context, or None if there is none."""
    active_name = current_job_name.get()
    return active_name
@@ -48,7 +48,7 @@ class TestTrace:
48
48
  "hud.telemetry._trace.flush_buffer", return_value=[], autospec=True
49
49
  )
50
50
  mock_submit_loop = mocker.patch(
51
- "hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
51
+ "hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
52
52
  )
53
53
 
54
54
  initial_root_state = actual_is_root_trace.get()
@@ -62,7 +62,8 @@ class TestTrace:
62
62
  assert actual_get_current_task_run_id() is None
63
63
  assert actual_is_root_trace.get() == initial_root_state
64
64
  mock_flush.assert_called_once()
65
- mock_submit_loop.assert_not_called()
65
+ # submit_to_worker_loop is now called for status updates
66
+ assert mock_submit_loop.call_count == 2 # INITIALIZING and COMPLETED
66
67
 
67
68
  def test_trace_with_name_and_attributes(self, mocker):
68
69
  """Test trace with name and attributes, checking they are passed on."""
@@ -71,7 +72,7 @@ class TestTrace:
71
72
  "hud.telemetry._trace.flush_buffer", return_value=mock_mcp_calls, autospec=True
72
73
  )
73
74
  mock_submit_loop = mocker.patch(
74
- "hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
75
+ "hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
75
76
  )
76
77
 
77
78
  trace_name = "test_trace_with_data"
@@ -81,7 +82,8 @@ class TestTrace:
81
82
  assert isinstance(task_run_id, str)
82
83
 
83
84
  mock_flush.assert_called_once()
84
- mock_submit_loop.assert_called_once()
85
+ # submit_to_worker_loop is now called for status updates
86
+ assert mock_submit_loop.call_count == 2 # INITIALIZING and COMPLETED
85
87
 
86
88
  @pytest.mark.asyncio
87
89
  async def test_trace_with_mcp_calls_exports(self, mocker):
@@ -91,14 +93,14 @@ class TestTrace:
91
93
  "hud.telemetry._trace.flush_buffer", return_value=mock_mcp_calls, autospec=True
92
94
  )
93
95
  mock_submit_loop = mocker.patch(
94
- "hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
96
+ "hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
95
97
  )
96
98
 
97
99
  async def mock_export(*args, **kwargs):
98
100
  return None
99
101
 
100
- mock_export_actual_coro = mocker.patch(
101
- "hud.telemetry._trace.exporter.export_telemetry",
102
+ mocker.patch(
103
+ "hud.telemetry.exporter.export_telemetry",
102
104
  side_effect=mock_export,
103
105
  )
104
106
 
@@ -109,16 +111,14 @@ class TestTrace:
109
111
  pass
110
112
 
111
113
  mock_flush.assert_called_once()
112
- mock_submit_loop.assert_called_once()
114
+ # submit_to_worker_loop is now called for status updates and export
115
+ # The exact count may vary depending on whether export_incremental is called
116
+ assert mock_submit_loop.call_count >= 2 # At least INITIALIZING and COMPLETED
113
117
 
114
- mock_export_actual_coro.assert_called_once()
115
- args, kwargs = mock_export_actual_coro.call_args
116
- assert kwargs["task_run_id"] == task_run_id
117
- assert kwargs["mcp_calls"] == mock_mcp_calls
118
- assert kwargs["trace_attributes"]["trace_name"] == test_name
119
- assert kwargs["trace_attributes"]["custom_attr"] == "test_val"
120
- assert "duration_seconds" in kwargs["trace_attributes"]
121
- assert kwargs["trace_attributes"]["is_root_trace"] is True
118
+ # With the new export flow, export_telemetry is submitted to worker loop
119
+ # so we can't directly assert on it being called synchronously
120
+ # Instead, verify that the trace completed successfully
121
+ assert task_run_id is not None
122
122
 
123
123
  def test_trace_nested(self, mocker):
124
124
  """Test nested traces, verifying context restoration and root trace logic."""
@@ -129,7 +129,7 @@ class TestTrace:
129
129
  "hud.telemetry._trace.flush_buffer", return_value=[], autospec=True
130
130
  )
131
131
  mock_submit_loop_internal = mocker.patch(
132
- "hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
132
+ "hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
133
133
  )
134
134
 
135
135
  assert actual_get_current_task_run_id() is None
@@ -148,7 +148,8 @@ class TestTrace:
148
148
  assert actual_get_current_task_run_id() is None
149
149
  assert actual_is_root_trace.get() is False
150
150
  assert mock_flush_internal.call_count == 2
151
- mock_submit_loop_internal.assert_not_called()
151
+ # submit_to_worker_loop is now called for status updates
152
+ assert mock_submit_loop_internal.call_count == 2 # Only outer trace sends status updates
152
153
 
153
154
  def test_trace_exception_handling(self, mocker):
154
155
  """Test trace handles exceptions properly and restores context."""
@@ -161,7 +162,7 @@ class TestTrace:
161
162
  "hud.telemetry._trace.flush_buffer", return_value=[], autospec=True
162
163
  )
163
164
  mock_submit_loop = mocker.patch(
164
- "hud.telemetry._trace.submit_to_worker_loop", return_value=MagicMock(), autospec=True
165
+ "hud.telemetry.exporter.submit_to_worker_loop", return_value=MagicMock(), autospec=True
165
166
  )
166
167
 
167
168
  with (
@@ -191,7 +192,7 @@ class TestTraceSync:
191
192
  with trace(name="test_sync") as task_run_id:
192
193
  assert task_run_id == "test-task-id"
193
194
 
194
- mock_trace_open.assert_called_once_with(name="test_sync", attributes=None)
195
+ mock_trace_open.assert_called_once_with(name="test_sync", agent_model=None, attributes=None)
195
196
  mock_flush.assert_called_once()
196
197
 
197
198
  def test_trace_sync_with_attributes(self, mocker):
@@ -205,7 +206,9 @@ class TestTraceSync:
205
206
  with trace(name="test_sync", attributes=attrs):
206
207
  pass
207
208
 
208
- mock_trace_open.assert_called_once_with(name="test_sync", attributes=attrs)
209
+ mock_trace_open.assert_called_once_with(
210
+ name="test_sync", agent_model=None, attributes=attrs
211
+ )
209
212
  mock_flush.assert_called_once()
210
213
 
211
214
 
@@ -224,7 +227,9 @@ class TestTraceDecorator:
224
227
 
225
228
  result = sync_function(1, 2)
226
229
  assert result == 3
227
- mock_trace_open.assert_called_once_with(name="test_func_sync", attributes=None)
230
+ mock_trace_open.assert_called_once_with(
231
+ name="test_func_sync", agent_model=None, attributes=None
232
+ )
228
233
 
229
234
  def test_trace_decorator_async_function(self, mocker):
230
235
  """Test trace_decorator on asynchronous functions."""
@@ -239,7 +244,9 @@ class TestTraceDecorator:
239
244
  async def run_test():
240
245
  result = await async_function(1, 2)
241
246
  assert result == 3
242
- mock_trace_open.assert_called_once_with(name="test_func_async", attributes=None)
247
+ mock_trace_open.assert_called_once_with(
248
+ name="test_func_async", agent_model=None, attributes=None
249
+ )
243
250
 
244
251
  asyncio.run(run_test())
245
252
 
@@ -257,7 +264,9 @@ class TestTraceDecorator:
257
264
 
258
265
  result = func_with_attrs(5)
259
266
  assert result == 10
260
- mock_trace_open.assert_called_once_with(name="test_func", attributes=attrs)
267
+ mock_trace_open.assert_called_once_with(
268
+ name="test_func", agent_model=None, attributes=attrs
269
+ )
261
270
 
262
271
  def test_trace_decorator_without_name(self, mocker):
263
272
  """Test trace_decorator uses module.function name when name not provided."""
@@ -273,7 +282,9 @@ class TestTraceDecorator:
273
282
  assert result == "result"
274
283
  # Should use module.function name
275
284
  expected_name = f"{my_function.__module__}.my_function"
276
- mock_trace_open.assert_called_once_with(name=expected_name, attributes=None)
285
+ mock_trace_open.assert_called_once_with(
286
+ name=expected_name, agent_model=None, attributes=None
287
+ )
277
288
 
278
289
  def test_trace_decorator_preserves_function_metadata(self):
279
290
  """Test trace_decorator preserves original function metadata."""
hud/tools/__init__.py CHANGED
@@ -2,12 +2,16 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from typing import TYPE_CHECKING, Any
6
+
5
7
  from .base import ToolError, ToolResult, tool_result_to_content_blocks
6
8
  from .bash import BashTool
7
- from .computer import AnthropicComputerTool, HudComputerTool, OpenAIComputerTool
8
9
  from .edit import EditTool
9
10
  from .playwright_tool import PlaywrightTool
10
11
 
12
+ if TYPE_CHECKING:
13
+ from .computer import AnthropicComputerTool, HudComputerTool, OpenAIComputerTool
14
+
11
15
  __all__ = [
12
16
  "AnthropicComputerTool",
13
17
  "BashTool",
@@ -19,3 +23,12 @@ __all__ = [
19
23
  "ToolResult",
20
24
  "tool_result_to_content_blocks",
21
25
  ]
26
+
27
+
28
def __getattr__(name: str) -> Any:
    """Resolve the computer tool classes on first access.

    Keeps the ``computer`` submodule (and, per the original note, pyautogui)
    from being imported unless one of its tools is actually requested.
    """
    # Guard clause: anything that is not a lazily exported tool is unknown.
    if name not in ("AnthropicComputerTool", "HudComputerTool", "OpenAIComputerTool"):
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

    from . import computer

    return getattr(computer, name)
@@ -2,12 +2,29 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from typing import TYPE_CHECKING, Any
6
+
5
7
  from .base import BaseExecutor
6
- from .pyautogui import PyAutoGUIExecutor
7
- from .xdo import XDOExecutor
8
+
9
+ if TYPE_CHECKING:
10
+ from .pyautogui import PyAutoGUIExecutor
11
+ from .xdo import XDOExecutor
8
12
 
9
13
  __all__ = [
10
14
  "BaseExecutor",
11
15
  "PyAutoGUIExecutor",
12
16
  "XDOExecutor",
13
17
  ]
18
+
19
+
20
def __getattr__(name: str) -> Any:
    """Resolve the executor classes on first access.

    Defers importing the ``pyautogui`` and ``xdo`` submodules until the
    corresponding executor is actually requested.
    """
    if name == "PyAutoGUIExecutor":
        from .pyautogui import PyAutoGUIExecutor as resolved
    elif name == "XDOExecutor":
        from .xdo import XDOExecutor as resolved
    else:
        # Not one of the lazily exported executors.
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

    return resolved