hud-python 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (53)
  1. hud/__init__.py +7 -4
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +14 -2
  6. hud/env/local_docker_client.py +28 -6
  7. hud/gym.py +0 -9
  8. hud/{mcp_agent → mcp}/__init__.py +2 -0
  9. hud/mcp/base.py +631 -0
  10. hud/{mcp_agent → mcp}/claude.py +52 -47
  11. hud/mcp/client.py +312 -0
  12. hud/{mcp_agent → mcp}/langchain.py +52 -33
  13. hud/{mcp_agent → mcp}/openai.py +56 -40
  14. hud/{mcp_agent → mcp}/tests/test_base.py +129 -54
  15. hud/mcp/tests/test_claude.py +294 -0
  16. hud/mcp/tests/test_client.py +324 -0
  17. hud/mcp/tests/test_openai.py +238 -0
  18. hud/settings.py +6 -0
  19. hud/task.py +1 -88
  20. hud/taskset.py +2 -23
  21. hud/telemetry/__init__.py +5 -0
  22. hud/telemetry/_trace.py +180 -17
  23. hud/telemetry/context.py +79 -0
  24. hud/telemetry/exporter.py +165 -6
  25. hud/telemetry/job.py +141 -0
  26. hud/telemetry/tests/test_trace.py +36 -25
  27. hud/tools/__init__.py +14 -1
  28. hud/tools/executors/__init__.py +19 -2
  29. hud/tools/executors/pyautogui.py +84 -50
  30. hud/tools/executors/tests/test_pyautogui_executor.py +4 -1
  31. hud/tools/playwright_tool.py +73 -67
  32. hud/tools/tests/test_edit.py +8 -1
  33. hud/tools/tests/test_tools.py +3 -0
  34. hud/trajectory.py +5 -1
  35. hud/utils/tests/test_version.py +1 -1
  36. hud/version.py +1 -1
  37. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/METADATA +20 -14
  38. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/RECORD +41 -46
  39. hud/evaluators/__init__.py +0 -9
  40. hud/evaluators/base.py +0 -32
  41. hud/evaluators/inspect.py +0 -24
  42. hud/evaluators/judge.py +0 -189
  43. hud/evaluators/match.py +0 -156
  44. hud/evaluators/remote.py +0 -65
  45. hud/evaluators/tests/__init__.py +0 -0
  46. hud/evaluators/tests/test_inspect.py +0 -12
  47. hud/evaluators/tests/test_judge.py +0 -231
  48. hud/evaluators/tests/test_match.py +0 -115
  49. hud/evaluators/tests/test_remote.py +0 -98
  50. hud/mcp_agent/base.py +0 -723
  51. hud/{mcp_agent → mcp}/tests/__init__.py +0 -0
  52. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  53. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
hud/__init__.py CHANGED
@@ -4,13 +4,13 @@ HUD SDK for interacting with the HUD evaluation platform.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- from . import agent, env, gym, settings, task, taskset, types, utils
7
+ from . import agent, datasets, env, gym, settings, task, taskset, types, utils
8
8
  from .adapters import ResponseAction as Response
9
+ from .datasets import run_dataset, to_taskconfigs
9
10
  from .job import create_job, load_job, run_job
10
- from .job import job as register_job
11
11
  from .task import Task
12
12
  from .taskset import load_taskset
13
- from .telemetry import flush, trace, trace_open
13
+ from .telemetry import flush, job, trace, trace_open # New context-based job
14
14
  from .version import __version__
15
15
 
16
16
 
@@ -42,17 +42,20 @@ __all__ = [
42
42
  "__version__",
43
43
  "agent",
44
44
  "create_job",
45
+ "datasets",
45
46
  "env",
46
47
  "flush",
47
48
  "gym",
48
49
  "init_telemetry",
50
+ "job",
49
51
  "load_job",
50
52
  "load_taskset",
51
- "register_job",
53
+ "run_dataset",
52
54
  "run_job",
53
55
  "settings",
54
56
  "task",
55
57
  "taskset",
58
+ "to_taskconfigs",
56
59
  "trace",
57
60
  "trace_open",
58
61
  "types",
hud/adapters/common/adapter.py CHANGED
@@ -2,16 +2,18 @@ from __future__ import annotations
2
2
 
3
3
  from typing import TYPE_CHECKING, Any, TypeAlias
4
4
 
5
- import numpy as np
6
- from PIL import Image
7
5
  from pydantic import TypeAdapter, ValidationError
8
6
 
9
7
  from .types import CLA
10
8
 
11
9
  if TYPE_CHECKING:
10
+ import numpy as np
11
+ from PIL import Image
12
12
  from typing_extensions import TypeIs
13
13
 
14
- ImageType: TypeAlias = np.ndarray[Any, Any] | Image.Image | str | None
14
+ ImageType: TypeAlias = np.ndarray[Any, Any] | Image.Image | str | None
15
+ else:
16
+ ImageType: TypeAlias = Any | str | None
15
17
 
16
18
 
17
19
  def _is_numpy_array(observation: Any) -> TypeIs[np.ndarray]:
@@ -69,6 +71,15 @@ class Adapter:
69
71
  if observation is None:
70
72
  return None
71
73
 
74
+ # Import PIL only when needed
75
+ try:
76
+ from PIL import Image
77
+ except ImportError:
78
+ raise ImportError(
79
+ "PIL (Pillow) is required for image processing. "
80
+ "Please install it with 'pip install Pillow'"
81
+ ) from None
82
+
72
83
  # Handle different input types.
73
84
  if _is_numpy_array(observation):
74
85
  # Convert numpy array to PIL Image
hud/adapters/common/tests/test_adapter.py CHANGED
@@ -4,10 +4,17 @@ import base64
4
4
  import io
5
5
  from unittest.mock import MagicMock, patch
6
6
 
7
- import numpy as np
8
7
  import pytest
9
8
  from PIL import Image
10
9
 
10
+ try:
11
+ import numpy as np
12
+
13
+ HAS_NUMPY = True
14
+ except ImportError:
15
+ HAS_NUMPY = False
16
+ np = None
17
+
11
18
  from hud.adapters.common import Adapter
12
19
  from hud.adapters.common.types import ClickAction, Point, TypeAction
13
20
 
@@ -25,15 +32,19 @@ def test_image():
25
32
  img_bytes = io.BytesIO()
26
33
  img.save(img_bytes, format="PNG")
27
34
  img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
28
- img_array = np.array(img)
29
35
 
30
- return {
36
+ result = {
31
37
  "pil": img,
32
38
  "bytes": img_bytes.getvalue(),
33
39
  "base64": img_base64,
34
- "array": img_array,
35
40
  }
36
41
 
42
+ if HAS_NUMPY:
43
+ img_array = np.array(img) # type: ignore
44
+ result["array"] = img_array
45
+
46
+ return result
47
+
37
48
 
38
49
  def test_init(adapter):
39
50
  """Test adapter initialization."""
@@ -99,6 +110,7 @@ def test_rescale_pil_image(adapter, test_image):
99
110
  assert img.size == (adapter.agent_width, adapter.agent_height)
100
111
 
101
112
 
113
+ @pytest.mark.skipif(not HAS_NUMPY, reason="numpy not available")
102
114
  def test_rescale_numpy_array(adapter, test_image):
103
115
  """Test rescaling numpy array."""
104
116
  result = adapter.rescale(test_image["array"])
hud/datasets.py ADDED
@@ -0,0 +1,188 @@
1
+ """Dataset utilities for working with HuggingFace datasets and TaskConfigs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from string import Template
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ from mcp.types import CallToolRequestParams as MCPToolParams
11
+ from pydantic import BaseModel, Field, field_validator
12
+
13
+ from hud.telemetry.job import job
14
+
15
+ if TYPE_CHECKING:
16
+ from datasets import Dataset
17
+
18
+ from hud.mcp.base import AgentResult, BaseMCPAgent
19
+
20
+ logger = logging.getLogger("hud.datasets")
21
+
22
+
23
+ class TaskConfig(BaseModel):
24
+ """
25
+ A task configuration that can be used to create a task.
26
+
27
+ The mcp_config field supports environment variable substitution using
28
+ template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
29
+
30
+ Example:
31
+ mcp_config: {
32
+ "hud": {
33
+ "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
34
+ "headers": {
35
+ "Authorization": "Bearer ${HUD_API_KEY}",
36
+ "Run-Id": "${RUN_ID}"
37
+ }
38
+ }
39
+ }
40
+ """
41
+
42
+ id: str | None = None
43
+ prompt: str
44
+ mcp_config: dict[str, Any]
45
+ setup_tool: MCPToolParams | None = None
46
+ evaluate_tool: MCPToolParams | None = None
47
+ metadata: dict[str, Any] = Field(default_factory=dict)
48
+
49
+ @field_validator("mcp_config", mode="before")
50
+ @classmethod
51
+ def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
52
+ """
53
+ Automatically resolve environment variables in mcp_config using Template.
54
+
55
+ Supports ${VAR_NAME} syntax with variable substitution from:
56
+ 1. System environment variables (including HUD_API_KEY, etc.)
57
+ 2. Runtime context variables (e.g., RUN_ID from telemetry context)
58
+
59
+ Missing variables resolve to empty strings.
60
+ """
61
+ import os
62
+
63
+ from hud.telemetry.context import get_current_task_run_id
64
+
65
+ # Start with current environment variables
66
+ mapping = dict(os.environ)
67
+
68
+ # Add runtime context variables if available
69
+ run_id = get_current_task_run_id()
70
+ if run_id:
71
+ mapping["RUN_ID"] = run_id
72
+
73
+ def substitute_in_value(obj: Any) -> Any:
74
+ """Recursively substitute variables in nested structures."""
75
+ if isinstance(obj, str):
76
+ # Use Template's safe_substitute - missing vars become empty strings
77
+ return Template(obj).safe_substitute(mapping)
78
+ elif isinstance(obj, dict):
79
+ return {k: substitute_in_value(v) for k, v in obj.items()}
80
+ elif isinstance(obj, list):
81
+ return [substitute_in_value(item) for item in obj]
82
+ else:
83
+ return obj
84
+
85
+ return substitute_in_value(v)
86
+
87
+
88
+ def to_taskconfigs(dataset: Dataset) -> Dataset:
89
+ """
90
+ Convert a HuggingFace dataset to contain TaskConfig objects.
91
+
92
+ Args:
93
+ dataset: HuggingFace dataset with task data
94
+
95
+ Returns:
96
+ Dataset with 'task' column containing TaskConfig objects
97
+
98
+ Example:
99
+ >>> dataset = load_dataset("hud/sheetbench-v1", split="test")
100
+ >>> tasks = to_taskconfigs(dataset)
101
+ >>> tasks[0]["task"] # This is a TaskConfig object
102
+ """
103
+
104
+ def _convert(example: dict[str, Any]) -> dict[str, TaskConfig]:
105
+ return {"task": TaskConfig(**example)}
106
+
107
+ # Map and keep only the task column
108
+ return dataset.map(_convert, remove_columns=dataset.column_names)
109
+
110
+
111
+ async def run_dataset(
112
+ name: str,
113
+ dataset: Dataset,
114
+ agent_class: type[BaseMCPAgent],
115
+ agent_config: dict[str, Any] | None = None,
116
+ max_concurrent: int = 5,
117
+ metadata: dict[str, Any] | None = None,
118
+ ) -> list[Any]:
119
+ """
120
+ Run all tasks in a dataset with automatic job tracking.
121
+
122
+ Args:
123
+ name: Name for the job
124
+ dataset: HuggingFace Dataset (raw, not converted)
125
+ agent_class: Agent class to instantiate (e.g., ClaudeMCPAgent)
126
+ agent_config: Configuration for agent (model, etc.)
127
+ max_concurrent: Maximum parallel task execution
128
+ metadata: Optional metadata for the job
129
+
130
+ Returns:
131
+ List of results from agent.run() in dataset order
132
+
133
+ Example:
134
+ >>> from datasets import load_dataset
135
+ >>> from hud.mcp import ClaudeMCPAgent
136
+ >>> dataset = load_dataset("hud/sheetbench-v1", split="test")
137
+ >>> results = await run_dataset(
138
+ ... "sheetbench_eval",
139
+ ... dataset,
140
+ ... ClaudeMCPAgent,
141
+ ... {"model": "claude-3-5-sonnet-20241022"},
142
+ ... max_concurrent=3,
143
+ ... )
144
+ """
145
+ # Import here to avoid circular imports
146
+ import hud
147
+ from hud.mcp.client import MCPClient
148
+
149
+ # Convert dataset to TaskConfigs internally
150
+ tasks = to_taskconfigs(dataset)
151
+
152
+ # Create job context
153
+ job_metadata = metadata or {}
154
+ job_metadata["agent_class"] = agent_class.__name__
155
+ if agent_config:
156
+ job_metadata["agent_config"] = agent_config
157
+
158
+ with job(name, metadata=job_metadata):
159
+ # Run tasks with semaphore for concurrency control
160
+ sem = asyncio.Semaphore(max_concurrent)
161
+ results: list[AgentResult | None] = [None] * len(tasks)
162
+
163
+ async def _worker(index: int, row: Any) -> None:
164
+ async with sem:
165
+ task = row["task"]
166
+
167
+ # Create trace for this task
168
+ with hud.trace(f"task_{index}"):
169
+ # Create fresh MCP client per task
170
+ if task.mcp_config:
171
+ client = MCPClient(mcp_config=task.mcp_config)
172
+ agent = agent_class(mcp_client=client, **(agent_config or {}))
173
+
174
+ try:
175
+ results[index] = await agent.run(task)
176
+ finally:
177
+ await client.close()
178
+ else:
179
+ logger.warning("Task %d has no mcp_config defined", index)
180
+ results[index] = None
181
+
182
+ # Execute all tasks
183
+ await asyncio.gather(
184
+ *[_worker(i, row) for i, row in enumerate(tasks)],
185
+ return_exceptions=True, # Don't fail entire batch on one error
186
+ )
187
+
188
+ return results
hud/env/docker_client.py CHANGED
@@ -8,8 +8,6 @@ import uuid
8
8
  from pathlib import Path
9
9
  from typing import TYPE_CHECKING, Any
10
10
 
11
- import toml
12
-
13
11
  from hud.env.client import Client
14
12
  from hud.types import EnvironmentStatus
15
13
  from hud.utils.common import _compile_pathspec, directory_to_tar_bytes
@@ -97,6 +95,13 @@ class DockerClient(Client):
97
95
  raise FileNotFoundError(f"pyproject.toml not found in {source_path}")
98
96
 
99
97
  # validate package name
98
+ try:
99
+ import toml
100
+ except ImportError:
101
+ raise ImportError(
102
+ "toml is required for parsing pyproject.toml files. "
103
+ "Please install it with 'pip install toml'"
104
+ ) from None
100
105
  pyproject_data = toml.load(pyproject_path)
101
106
  package_name = pyproject_data.get("project", {}).get("name")
102
107
  if not package_name:
@@ -241,6 +246,13 @@ class DockerClient(Client):
241
246
  or self._last_pyproject_toml_str != current_pyproject_content
242
247
  ):
243
248
  # Update package name if pyproject.toml changed
249
+ try:
250
+ import toml
251
+ except ImportError:
252
+ raise ImportError(
253
+ "toml is required for parsing pyproject.toml files. "
254
+ "Please install it with 'pip install toml'"
255
+ ) from None
244
256
  pyproject_data = toml.loads(current_pyproject_content)
245
257
  self._package_name = pyproject_data.get("project", {}).get("name")
246
258
  if not self._package_name:
hud/env/local_docker_client.py CHANGED
@@ -9,8 +9,15 @@ import time
9
9
  import uuid
10
10
  from typing import TYPE_CHECKING, Any
11
11
 
12
- import aiodocker
13
- from aiohttp import ClientTimeout
12
+ try:
13
+ import aiodocker
14
+ from aiohttp import ClientTimeout
15
+
16
+ AIODOCKER_AVAILABLE = True
17
+ except ImportError:
18
+ AIODOCKER_AVAILABLE = False
19
+ aiodocker = None # type: ignore
20
+ ClientTimeout = None # type: ignore
14
21
 
15
22
  from hud.env.docker_client import DockerClient, EnvironmentStatus
16
23
  from hud.utils import ExecuteResult
@@ -40,7 +47,12 @@ class LocalDockerClient(DockerClient):
40
47
  image_tag = f"hud-env-{uuid.uuid4().hex[:8]}"
41
48
 
42
49
  # Initialize Docker client
43
- docker_client = aiodocker.Docker()
50
+ if not AIODOCKER_AVAILABLE:
51
+ raise ImportError(
52
+ "aiodocker is required for LocalDockerClient. "
53
+ "Please install it with 'pip install aiodocker'"
54
+ )
55
+ docker_client = aiodocker.Docker() # type: ignore
44
56
 
45
57
  # Create a tar file from the path
46
58
  tar_bytes = directory_to_tar_bytes(build_context)
@@ -82,7 +94,12 @@ class LocalDockerClient(DockerClient):
82
94
  """
83
95
 
84
96
  # Initialize Docker client
85
- docker_client = aiodocker.Docker()
97
+ if not AIODOCKER_AVAILABLE:
98
+ raise ImportError(
99
+ "aiodocker is required for LocalDockerClient. "
100
+ "Please install it with 'pip install aiodocker'"
101
+ )
102
+ docker_client = aiodocker.Docker() # type: ignore
86
103
 
87
104
  # Default host config
88
105
  if host_config is None:
@@ -156,7 +173,7 @@ class LocalDockerClient(DockerClient):
156
173
  client._log_task = log_task # type: ignore[attr-defined]
157
174
  return client
158
175
 
159
- def __init__(self, docker_conn: aiodocker.Docker, container_id: str) -> None:
176
+ def __init__(self, docker_conn: aiodocker.Docker, container_id: str) -> None: # type: ignore
160
177
  """
161
178
  Initialize the DockerClient.
162
179
 
@@ -164,6 +181,11 @@ class LocalDockerClient(DockerClient):
164
181
  docker_conn: Docker client connection
165
182
  container_id: ID of the Docker container to control
166
183
  """
184
+ if not AIODOCKER_AVAILABLE:
185
+ raise ImportError(
186
+ "aiodocker is required for LocalDockerClient. "
187
+ "Please install it with 'pip install aiodocker'"
188
+ )
167
189
  super().__init__()
168
190
 
169
191
  # Store container ID instead of container object
@@ -239,7 +261,7 @@ class LocalDockerClient(DockerClient):
239
261
  exec_result = await container.exec(
240
262
  cmd=command,
241
263
  )
242
- output: Stream = exec_result.start(timeout=ClientTimeout(timeout), detach=False)
264
+ output: Stream = exec_result.start(timeout=ClientTimeout(timeout), detach=False) # type: ignore
243
265
 
244
266
  stdout_data = bytearray()
245
267
  stderr_data = bytearray()
hud/gym.py CHANGED
@@ -50,15 +50,6 @@ async def make(
50
50
  effective_job_id = job.id
51
51
  elif job_id is not None:
52
52
  effective_job_id = job_id
53
- else:
54
- try:
55
- import hud.job
56
-
57
- active_job = hud.job.get_active_job()
58
- if active_job:
59
- effective_job_id = active_job.id
60
- except ImportError:
61
- pass
62
53
 
63
54
  build_data = {}
64
55
  try:
hud/{mcp_agent → mcp}/__init__.py CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  from .base import BaseMCPAgent
6
6
  from .claude import ClaudeMCPAgent
7
+ from .client import MCPClient
7
8
  from .langchain import LangChainMCPAgent
8
9
  from .openai import OpenAIMCPAgent
9
10
 
@@ -11,5 +12,6 @@ __all__ = [
11
12
  "BaseMCPAgent",
12
13
  "ClaudeMCPAgent",
13
14
  "LangChainMCPAgent",
15
+ "MCPClient",
14
16
  "OpenAIMCPAgent",
15
17
  ]