PyPI - hud-python - Versions diffs - 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl - Mend

hud-python 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (54)

hud/__init__.py +7 -4
hud/adapters/common/adapter.py +14 -3
hud/adapters/common/tests/test_adapter.py +16 -4
hud/datasets.py +188 -0
hud/env/docker_client.py +14 -2
hud/env/local_docker_client.py +28 -6
hud/gym.py +0 -9
hud/{mcp_agent → mcp}/__init__.py +2 -0
hud/mcp/base.py +631 -0
hud/{mcp_agent → mcp}/claude.py +52 -47
hud/mcp/client.py +312 -0
hud/{mcp_agent → mcp}/langchain.py +52 -33
hud/{mcp_agent → mcp}/openai.py +56 -40
hud/{mcp_agent → mcp}/tests/test_base.py +129 -54
hud/mcp/tests/test_claude.py +294 -0
hud/mcp/tests/test_client.py +324 -0
hud/mcp/tests/test_openai.py +238 -0
hud/settings.py +6 -0
hud/task.py +2 -88
hud/taskset.py +2 -23
hud/telemetry/__init__.py +5 -0
hud/telemetry/_trace.py +180 -17
hud/telemetry/context.py +79 -0
hud/telemetry/exporter.py +165 -6
hud/telemetry/job.py +141 -0
hud/telemetry/tests/test_trace.py +36 -25
hud/tools/__init__.py +14 -1
hud/tools/computer/hud.py +13 -0
hud/tools/executors/__init__.py +19 -2
hud/tools/executors/pyautogui.py +84 -50
hud/tools/executors/tests/test_pyautogui_executor.py +4 -1
hud/tools/playwright_tool.py +73 -67
hud/tools/tests/test_edit.py +8 -1
hud/tools/tests/test_tools.py +3 -0
hud/trajectory.py +5 -1
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/METADATA +20 -14
{hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/RECORD +42 -47
hud/evaluators/__init__.py +0 -9
hud/evaluators/base.py +0 -32
hud/evaluators/inspect.py +0 -24
hud/evaluators/judge.py +0 -189
hud/evaluators/match.py +0 -156
hud/evaluators/remote.py +0 -65
hud/evaluators/tests/__init__.py +0 -0
hud/evaluators/tests/test_inspect.py +0 -12
hud/evaluators/tests/test_judge.py +0 -231
hud/evaluators/tests/test_match.py +0 -115
hud/evaluators/tests/test_remote.py +0 -98
hud/mcp_agent/base.py +0 -723
hud/{mcp_agent → mcp}/tests/__init__.py +0 -0
{hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/WHEEL +0 -0
{hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/licenses/LICENSE +0 -0

hud/__init__.py CHANGED Viewed

@@ -4,13 +4,13 @@ HUD SDK for interacting with the HUD evaluation platform.
 from __future__ import annotations
-from . import agent, env, gym, settings, task, taskset, types, utils
+from . import agent, datasets, env, gym, settings, task, taskset, types, utils
 from .adapters import ResponseAction as Response
+from .datasets import run_dataset, to_taskconfigs
 from .job import create_job, load_job, run_job
-from .job import job as register_job
 from .task import Task
 from .taskset import load_taskset
-from .telemetry import flush, trace, trace_open
+from .telemetry import flush, job, trace, trace_open  # New context-based job
 from .version import __version__
@@ -42,17 +42,20 @@ __all__ = [
     "__version__",
     "agent",
     "create_job",
+    "datasets",
     "env",
     "flush",
     "gym",
     "init_telemetry",
+    "job",
     "load_job",
     "load_taskset",
-    "register_job",
+    "run_dataset",
     "run_job",
     "settings",
     "task",
     "taskset",
+    "to_taskconfigs",
     "trace",
     "trace_open",
     "types",

hud/adapters/common/adapter.py CHANGED Viewed

@@ -2,16 +2,18 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Any, TypeAlias
-import numpy as np
-from PIL import Image
 from pydantic import TypeAdapter, ValidationError
 from .types import CLA
 if TYPE_CHECKING:
+    import numpy as np
+    from PIL import Image
     from typing_extensions import TypeIs
-ImageType: TypeAlias = np.ndarray[Any, Any] | Image.Image | str | None
+    ImageType: TypeAlias = np.ndarray[Any, Any] | Image.Image | str | None
+else:
+    ImageType: TypeAlias = Any | str | None
 def _is_numpy_array(observation: Any) -> TypeIs[np.ndarray]:
@@ -69,6 +71,15 @@ class Adapter:
         if observation is None:
             return None
+        # Import PIL only when needed
+        try:
+            from PIL import Image
+        except ImportError:
+            raise ImportError(
+                "PIL (Pillow) is required for image processing. "
+                "Please install it with 'pip install Pillow'"
+            ) from None
         # Handle different input types.
         if _is_numpy_array(observation):
             # Convert numpy array to PIL Image

hud/adapters/common/tests/test_adapter.py CHANGED Viewed

@@ -4,10 +4,17 @@ import base64
 import io
 from unittest.mock import MagicMock, patch
-import numpy as np
 import pytest
 from PIL import Image
+try:
+    import numpy as np
+    HAS_NUMPY = True
+except ImportError:
+    HAS_NUMPY = False
+    np = None
 from hud.adapters.common import Adapter
 from hud.adapters.common.types import ClickAction, Point, TypeAction
@@ -25,15 +32,19 @@ def test_image():
     img_bytes = io.BytesIO()
     img.save(img_bytes, format="PNG")
     img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
-    img_array = np.array(img)
-    return {
+    result = {
         "pil": img,
         "bytes": img_bytes.getvalue(),
         "base64": img_base64,
-        "array": img_array,
     }
+    if HAS_NUMPY:
+        img_array = np.array(img)  # type: ignore
+        result["array"] = img_array
+    return result
 def test_init(adapter):
     """Test adapter initialization."""
@@ -99,6 +110,7 @@ def test_rescale_pil_image(adapter, test_image):
     assert img.size == (adapter.agent_width, adapter.agent_height)
+@pytest.mark.skipif(not HAS_NUMPY, reason="numpy not available")
 def test_rescale_numpy_array(adapter, test_image):
     """Test rescaling numpy array."""
     result = adapter.rescale(test_image["array"])

hud/datasets.py ADDED Viewed

@@ -0,0 +1,188 @@
+"""Dataset utilities for working with HuggingFace datasets and TaskConfigs."""
+from __future__ import annotations
+import asyncio
+import logging
+from string import Template
+from typing import TYPE_CHECKING, Any
+from mcp.types import CallToolRequestParams as MCPToolParams
+from pydantic import BaseModel, Field, field_validator
+from hud.telemetry.job import job
+if TYPE_CHECKING:
+    from datasets import Dataset
+    from hud.mcp.base import AgentResult, BaseMCPAgent
+logger = logging.getLogger("hud.datasets")
+class TaskConfig(BaseModel):
+    """
+    A task configuration that can be used to create a task.
+    The mcp_config field supports environment variable substitution using
+    template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
+    Example:
+        mcp_config: {
+            "hud": {
+                "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
+                "headers": {
+                    "Authorization": "Bearer ${HUD_API_KEY}",
+                    "Run-Id": "${RUN_ID}"
+                }
+            }
+        }
+    """
+    id: str | None = None
+    prompt: str
+    mcp_config: dict[str, Any]
+    setup_tool: MCPToolParams | None = None
+    evaluate_tool: MCPToolParams | None = None
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    @field_validator("mcp_config", mode="before")
+    @classmethod
+    def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
+        """
+        Automatically resolve environment variables in mcp_config using Template.
+        Supports ${VAR_NAME} syntax with variable substitution from:
+        1. System environment variables (including HUD_API_KEY, etc.)
+        2. Runtime context variables (e.g., RUN_ID from telemetry context)
+        Missing variables resolve to empty strings.
+        """
+        import os
+        from hud.telemetry.context import get_current_task_run_id
+        # Start with current environment variables
+        mapping = dict(os.environ)
+        # Add runtime context variables if available
+        run_id = get_current_task_run_id()
+        if run_id:
+            mapping["RUN_ID"] = run_id
+        def substitute_in_value(obj: Any) -> Any:
+            """Recursively substitute variables in nested structures."""
+            if isinstance(obj, str):
+                # Use Template's safe_substitute - missing vars become empty strings
+                return Template(obj).safe_substitute(mapping)
+            elif isinstance(obj, dict):
+                return {k: substitute_in_value(v) for k, v in obj.items()}
+            elif isinstance(obj, list):
+                return [substitute_in_value(item) for item in obj]
+            else:
+                return obj
+        return substitute_in_value(v)
+def to_taskconfigs(dataset: Dataset) -> Dataset:
+    """
+    Convert a HuggingFace dataset to contain TaskConfig objects.
+    Args:
+        dataset: HuggingFace dataset with task data
+    Returns:
+        Dataset with 'task' column containing TaskConfig objects
+    Example:
+        >>> dataset = load_dataset("hud/sheetbench-v1", split="test")
+        >>> tasks = to_taskconfigs(dataset)
+        >>> tasks[0]["task"]  # This is a TaskConfig object
+    """
+    def _convert(example: dict[str, Any]) -> dict[str, TaskConfig]:
+        return {"task": TaskConfig(**example)}
+    # Map and keep only the task column
+    return dataset.map(_convert, remove_columns=dataset.column_names)
+async def run_dataset(
+    name: str,
+    dataset: Dataset,
+    agent_class: type[BaseMCPAgent],
+    agent_config: dict[str, Any] | None = None,
+    max_concurrent: int = 5,
+    metadata: dict[str, Any] | None = None,
+) -> list[Any]:
+    """
+    Run all tasks in a dataset with automatic job tracking.
+    Args:
+        name: Name for the job
+        dataset: HuggingFace Dataset (raw, not converted)
+        agent_class: Agent class to instantiate (e.g., ClaudeMCPAgent)
+        agent_config: Configuration for agent (model, etc.)
+        max_concurrent: Maximum parallel task execution
+        metadata: Optional metadata for the job
+    Returns:
+        List of results from agent.run() in dataset order
+    Example:
+        >>> from datasets import load_dataset
+        >>> from hud.mcp import ClaudeMCPAgent
+        >>> dataset = load_dataset("hud/sheetbench-v1", split="test")
+        >>> results = await run_dataset(
+        ...     "sheetbench_eval",
+        ...     dataset,
+        ...     ClaudeMCPAgent,
+        ...     {"model": "claude-3-5-sonnet-20241022"},
+        ...     max_concurrent=3,
+        ... )
+    """
+    # Import here to avoid circular imports
+    import hud
+    from hud.mcp.client import MCPClient
+    # Convert dataset to TaskConfigs internally
+    tasks = to_taskconfigs(dataset)
+    # Create job context
+    job_metadata = metadata or {}
+    job_metadata["agent_class"] = agent_class.__name__
+    if agent_config:
+        job_metadata["agent_config"] = agent_config
+    with job(name, metadata=job_metadata):
+        # Run tasks with semaphore for concurrency control
+        sem = asyncio.Semaphore(max_concurrent)
+        results: list[AgentResult | None] = [None] * len(tasks)
+        async def _worker(index: int, row: Any) -> None:
+            async with sem:
+                task = row["task"]
+                # Create trace for this task
+                with hud.trace(f"task_{index}"):
+                    # Create fresh MCP client per task
+                    if task.mcp_config:
+                        client = MCPClient(mcp_config=task.mcp_config)
+                        agent = agent_class(mcp_client=client, **(agent_config or {}))
+                        try:
+                            results[index] = await agent.run(task)
+                        finally:
+                            await client.close()
+                    else:
+                        logger.warning("Task %d has no mcp_config defined", index)
+                        results[index] = None
+        # Execute all tasks
+        await asyncio.gather(
+            *[_worker(i, row) for i, row in enumerate(tasks)],
+            return_exceptions=True,  # Don't fail entire batch on one error
+        )
+    return results

hud/env/docker_client.py CHANGED Viewed

@@ -8,8 +8,6 @@ import uuid
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
-import toml
 from hud.env.client import Client
 from hud.types import EnvironmentStatus
 from hud.utils.common import _compile_pathspec, directory_to_tar_bytes
@@ -97,6 +95,13 @@ class DockerClient(Client):
             raise FileNotFoundError(f"pyproject.toml not found in {source_path}")
         # validate package name
+        try:
+            import toml
+        except ImportError:
+            raise ImportError(
+                "toml is required for parsing pyproject.toml files. "
+                "Please install it with 'pip install toml'"
+            ) from None
         pyproject_data = toml.load(pyproject_path)
         package_name = pyproject_data.get("project", {}).get("name")
         if not package_name:
@@ -241,6 +246,13 @@ class DockerClient(Client):
             or self._last_pyproject_toml_str != current_pyproject_content
         ):
             # Update package name if pyproject.toml changed
+            try:
+                import toml
+            except ImportError:
+                raise ImportError(
+                    "toml is required for parsing pyproject.toml files. "
+                    "Please install it with 'pip install toml'"
+                ) from None
             pyproject_data = toml.loads(current_pyproject_content)
             self._package_name = pyproject_data.get("project", {}).get("name")
             if not self._package_name:

hud/env/local_docker_client.py CHANGED Viewed

@@ -9,8 +9,15 @@ import time
 import uuid
 from typing import TYPE_CHECKING, Any
-import aiodocker
-from aiohttp import ClientTimeout
+try:
+    import aiodocker
+    from aiohttp import ClientTimeout
+    AIODOCKER_AVAILABLE = True
+except ImportError:
+    AIODOCKER_AVAILABLE = False
+    aiodocker = None  # type: ignore
+    ClientTimeout = None  # type: ignore
 from hud.env.docker_client import DockerClient, EnvironmentStatus
 from hud.utils import ExecuteResult
@@ -40,7 +47,12 @@ class LocalDockerClient(DockerClient):
         image_tag = f"hud-env-{uuid.uuid4().hex[:8]}"
         # Initialize Docker client
-        docker_client = aiodocker.Docker()
+        if not AIODOCKER_AVAILABLE:
+            raise ImportError(
+                "aiodocker is required for LocalDockerClient. "
+                "Please install it with 'pip install aiodocker'"
+            )
+        docker_client = aiodocker.Docker()  # type: ignore
         # Create a tar file from the path
         tar_bytes = directory_to_tar_bytes(build_context)
@@ -82,7 +94,12 @@ class LocalDockerClient(DockerClient):
         """
         # Initialize Docker client
-        docker_client = aiodocker.Docker()
+        if not AIODOCKER_AVAILABLE:
+            raise ImportError(
+                "aiodocker is required for LocalDockerClient. "
+                "Please install it with 'pip install aiodocker'"
+            )
+        docker_client = aiodocker.Docker()  # type: ignore
         # Default host config
         if host_config is None:
@@ -156,7 +173,7 @@ class LocalDockerClient(DockerClient):
         client._log_task = log_task  # type: ignore[attr-defined]
         return client
-    def __init__(self, docker_conn: aiodocker.Docker, container_id: str) -> None:
+    def __init__(self, docker_conn: aiodocker.Docker, container_id: str) -> None:  # type: ignore
         """
         Initialize the DockerClient.
@@ -164,6 +181,11 @@ class LocalDockerClient(DockerClient):
             docker_conn: Docker client connection
             container_id: ID of the Docker container to control
         """
+        if not AIODOCKER_AVAILABLE:
+            raise ImportError(
+                "aiodocker is required for LocalDockerClient. "
+                "Please install it with 'pip install aiodocker'"
+            )
         super().__init__()
         # Store container ID instead of container object
@@ -239,7 +261,7 @@ class LocalDockerClient(DockerClient):
         exec_result = await container.exec(
             cmd=command,
         )
-        output: Stream = exec_result.start(timeout=ClientTimeout(timeout), detach=False)
+        output: Stream = exec_result.start(timeout=ClientTimeout(timeout), detach=False)  # type: ignore
         stdout_data = bytearray()
         stderr_data = bytearray()

hud/gym.py CHANGED Viewed

@@ -50,15 +50,6 @@ async def make(
         effective_job_id = job.id
     elif job_id is not None:
         effective_job_id = job_id
-    else:
-        try:
-            import hud.job
-            active_job = hud.job.get_active_job()
-            if active_job:
-                effective_job_id = active_job.id
-        except ImportError:
-            pass
     build_data = {}
     try:

hud/{mcp_agent → mcp}/__init__.py RENAMED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 from .base import BaseMCPAgent
 from .claude import ClaudeMCPAgent
+from .client import MCPClient
 from .langchain import LangChainMCPAgent
 from .openai import OpenAIMCPAgent
@@ -11,5 +12,6 @@ __all__ = [
     "BaseMCPAgent",
     "ClaudeMCPAgent",
     "LangChainMCPAgent",
+    "MCPClient",
     "OpenAIMCPAgent",
 ]

hud-python 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

Potentially problematic release.

hud-python 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl