hud-python 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

hud/agents/base.py CHANGED
@@ -85,6 +85,7 @@ class MCPAgent(ABC):
         self._tool_map: dict[str, types.Tool] = {}  # Simplified: just name to tool
         self.screenshot_history: list[str] = []
         self._auto_trace = auto_trace
+        self._auto_trace_cm: Any | None = None  # Store auto-created trace context manager
         self.initialization_complete = False

         # Response agent to automatically interact with the model
@@ -303,6 +304,9 @@ class MCPAgent(ABC):
                     except Exception as e:
                         logger.warning("ResponseAgent failed: %s", e)
                     if decision == "STOP":
+                        # Try to submit response through lifecycle tool
+                        await self._maybe_submit_response(response, messages)
+
                         logger.info("Stopping execution")
                         final_response = response
                         break
@@ -483,6 +487,40 @@ class MCPAgent(ABC):
             self._available_tools.append(tool)
             # Simplified mapping - just tool name to tool
             self._tool_map[tool.name] = tool
+
+            # Auto-detect response tool as a lifecycle tool
+            if tool.name == "response" and "response" not in self.lifecycle_tools:
+                logger.debug("Auto-detected 'response' tool as a lifecycle tool")
+                self.lifecycle_tools.append("response")
+
+    async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
+        """Submit response through lifecycle tool if available.
+
+        Args:
+            response: The agent's response
+            messages: The current message history (will be modified in-place)
+        """
+        # Check if we have a response lifecycle tool
+        if "response" in self.lifecycle_tools and "response" in self._tool_map:
+            logger.debug("Calling response lifecycle tool")
+            try:
+                # Call the response tool with the agent's response
+                response_tool_call = MCPToolCall(
+                    name="response",
+                    arguments={"response": response.content, "messages": messages}
+                )
+                response_results = await self.call_tools(response_tool_call)
+
+                # Format and add the response tool results to messages
+                response_messages = await self.format_tool_results(
+                    [response_tool_call], response_results
+                )
+                messages.extend(response_messages)
+
+                # Mark the task as done
+                logger.info("Response lifecycle tool executed, marking task as done")
+            except Exception as e:
+                logger.error("Response lifecycle tool failed: %s", e)

     async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
         """Inject metadata into the metadata of the initialize request."""
@@ -491,7 +529,7 @@ class MCPAgent(ABC):
             mcp_config,
             MCPConfigPatch(meta=self.metadata),
         )
-        setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
+        self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)

     def get_available_tools(self) -> list[types.Tool]:
         """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
@@ -532,6 +570,17 @@ class MCPAgent(ABC):

     async def _cleanup(self) -> None:
         """Cleanup resources."""
+        # Clean up auto-created trace if any
+        if self._auto_trace_cm:
+            try:
+                self._auto_trace_cm.__exit__(None, None, None)
+                logger.info("Closed auto-created trace")
+            except Exception as e:
+                logger.warning("Failed to close auto-created trace: %s", e)
+            finally:
+                self._auto_trace_cm = None
+
+        # Clean up auto-created client
         if self._auto_created_client and self.mcp_client:
            try:
                 await self.mcp_client.shutdown()
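
The agent-side change above auto-detects any tool literally named "response" as a lifecycle tool and, on a STOP decision, calls it with the agent's final answer and message history. For context, the following is a minimal sketch of what a matching environment-side tool could look like, assuming a FastMCP-style server from the MCP Python SDK; the server name and return value are illustrative and not part of this release.

    # Hypothetical environment-side counterpart to the "response" lifecycle tool.
    from typing import Any

    from mcp.server.fastmcp import FastMCP

    mcp = FastMCP("example-env")

    @mcp.tool()
    def response(response: str, messages: list[Any] | None = None) -> str:
        """Accept the agent's final answer (and optionally its message history)."""
        # A real environment would store this for later evaluation/reward.
        return f"Received final response ({len(response)} chars)"
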
hud/cli/__init__.py CHANGED
@@ -23,10 +23,13 @@ from .clone import clone_repository, get_clone_message, print_error, print_tutor
 from .cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
 from .debug import debug_mcp_stdio
 from .init import create_environment
+from . import list_func as list_module
 from .mcp_server import run_mcp_dev_server
 from .pull import pull_command
 from .push import push_command
+from .remove import remove_command
 from .utils import CaptureLogger
+from .eval import eval_command

 # Create the main Typer app
 app = typer.Typer(
@@ -442,7 +445,8 @@ def run(

     # Get URL from options or environment
     if not url:
-        url = os.getenv("HUD_MCP_URL", "https://mcp.hud.so/v3/mcp")
+        from hud.settings import settings
+        url = settings.hud_mcp_url

     run_remote_server(image, docker_args, transport, port, url, api_key, run_id, verbose)

@@ -561,6 +565,63 @@ def pull(
     pull_command(target, lock_file, yes, verify_only, verbose)


+@app.command(name="list")
+def list_environments(
+    filter_name: str | None = typer.Option(
+        None, "--filter", "-f", help="Filter environments by name (case-insensitive)"
+    ),
+    json_output: bool = typer.Option(
+        False, "--json", help="Output as JSON"
+    ),
+    show_all: bool = typer.Option(
+        False, "--all", "-a", help="Show all columns including digest"
+    ),
+    verbose: bool = typer.Option(
+        False, "--verbose", "-v", help="Show detailed output"
+    ),
+) -> None:
+    """📋 List all HUD environments in local registry.
+
+    Shows environments pulled with 'hud pull' stored in ~/.hud/envs/
+
+    Examples:
+        hud list                  # List all environments
+        hud list --filter text    # Filter by name
+        hud list --json           # Output as JSON
+        hud list --all            # Show digest column
+        hud list --verbose        # Show full descriptions
+    """
+    list_module.list_command(filter_name, json_output, show_all, verbose)
+
+
+@app.command()
+def remove(
+    target: str | None = typer.Argument(
+        None,
+        help="Environment to remove (digest, name, or 'all' for all environments)"
+    ),
+    yes: bool = typer.Option(
+        False, "--yes", "-y", help="Skip confirmation prompt"
+    ),
+    verbose: bool = typer.Option(
+        False, "--verbose", "-v", help="Show detailed output"
+    ),
+) -> None:
+    """🗑️ Remove HUD environments from local registry.
+
+    Removes environment metadata from ~/.hud/envs/
+    Note: This does not remove the Docker images.
+
+    Examples:
+        hud remove abc123                 # Remove by digest
+        hud remove text_2048              # Remove by name
+        hud remove hudpython/test_init    # Remove by full name
+        hud remove all                    # Remove all environments
+        hud remove all --yes              # Remove all without confirmation
+    """
+    remove_command(target, yes, verbose)
+
+
 @app.command()
 def init(
     name: str = typer.Argument(None, help="Environment name (default: current directory name)"),
@@ -592,6 +653,64 @@ def quickstart() -> None:
     clone("https://github.com/hud-evals/quickstart.git")


+@app.command()
+def eval(
+    source: str = typer.Argument(
+        ...,
+        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
+    ),
+    full: bool = typer.Option(
+        False,
+        "--full",
+        help="Run the entire dataset (omit for single-task debug mode)",
+    ),
+    agent: str = typer.Option(
+        "claude",
+        "--agent",
+        help="Agent backend to use (claude or openai)",
+    ),
+    model: str | None = typer.Option(
+        None,
+        "--model",
+        help="Model name for the chosen agent",
+    ),
+    allowed_tools: str | None = typer.Option(
+        None,
+        "--allowed-tools",
+        help="Comma-separated list of allowed tools",
+    ),
+    max_concurrent: int = typer.Option(
+        30,
+        "--max-concurrent",
+        help="Concurrency level for full-dataset mode",
+    ),
+    max_steps: int = typer.Option(
+        30,
+        "--max-steps",
+        help="Maximum steps per task (default: 10 for single, 50 for full)",
+    ),
+) -> None:
+    """🚀 Run evaluation on datasets or individual tasks with agents."""
+    # Validate agent choice
+    valid_agents = ["claude", "openai"]
+    if agent not in valid_agents:
+        from hud.utils.design import HUDDesign
+        design = HUDDesign()
+        design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
+        raise typer.Exit(1)
+
+    # Import and run the command
+    eval_command(
+        source=source,
+        full=full,
+        agent=agent,  # type: ignore
+        model=model,
+        allowed_tools=allowed_tools,
+        max_concurrent=max_concurrent,
+        max_steps=max_steps,
+    )
+
+
 def main() -> None:
     """Main entry point for the CLI."""
     # Show header for main help
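
Taken directly from the command docstrings above, the new CLI surface added in this release can be exercised, for example, as:

    hud list --filter text                                    # list cached environments by name
    hud remove text_2048 --yes                                # drop metadata from ~/.hud/envs/
    hud eval hud-evals/SheetBench-50 --full --agent claude    # run a full dataset evaluation
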
@@ -12,6 +12,8 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from hud.settings import settings
 from hud.utils.design import HUDDesign

+from .registry import get_registry_dir, list_registry_entries, extract_digest_from_image, load_from_registry
+
 console = Console()
 design = HUDDesign()

@@ -50,38 +52,31 @@ def fetch_lock_from_registry(reference: str) -> dict | None:

 def check_local_cache(reference: str) -> dict | None:
     """Check local cache for lock file."""
-    # Extract digest if present
-    if "@sha256:" in reference:
-        digest = reference.split("@sha256:")[-1][:12]
-    elif "/" in reference:
-        # Try to find by name pattern
-        cache_dir = Path.home() / ".hud" / "envs"
-        if cache_dir.exists():
-            # Look for any cached version of this image
-            for env_dir in cache_dir.iterdir():
-                if env_dir.is_dir():
-                    lock_file = env_dir / "hud.lock.yaml"
-                    if lock_file.exists():
-                        with open(lock_file) as f:
-                            lock_data = yaml.safe_load(f)
-                        # Check if this matches our reference
-                        if lock_data and "image" in lock_data:
-                            image = lock_data["image"]
-                            # Match by name (ignoring tag/digest)
-                            ref_base = reference.split("@")[0].split(":")[0]
-                            img_base = image.split("@")[0].split(":")[0]
-                            if ref_base in img_base or img_base in ref_base:
-                                return lock_data
-        return None
-    else:
-        digest = "latest"
-
-    # Check specific digest directory
-    lock_file = Path.home() / ".hud" / "envs" / digest / "hud.lock.yaml"
-    if lock_file.exists():
-        with open(lock_file) as f:
-            return yaml.safe_load(f)
-
+    # First try exact digest match
+    digest = extract_digest_from_image(reference)
+    lock_data = load_from_registry(digest)
+    if lock_data:
+        return lock_data
+
+    # If not found and reference has a name, search by name pattern
+    if "/" in reference:
+        # Look for any cached version of this image
+        ref_base = reference.split("@")[0].split(":")[0]
+
+        for digest, lock_file in list_registry_entries():
+            try:
+                with open(lock_file) as f:
+                    lock_data = yaml.safe_load(f)
+                # Check if this matches our reference
+                if lock_data and "image" in lock_data:
+                    image = lock_data["image"]
+                    # Match by name (ignoring tag/digest)
+                    img_base = image.split("@")[0].split(":")[0]
+                    if ref_base in img_base or img_base in ref_base:
+                        return lock_data
+            except Exception:
+                continue
+
     return None


@@ -147,15 +142,8 @@ async def analyze_from_metadata(reference: str, output_format: str, verbose: boo
             source = "registry"

             # Save to local cache for next time
-            if "@sha256:" in lock_data.get("image", ""):
-                digest = lock_data["image"].split("@sha256:")[-1][:12]
-            else:
-                digest = "latest"
-
-            cache_dir = Path.home() / ".hud" / "envs" / digest
-            cache_dir.mkdir(parents=True, exist_ok=True)
-            with open(cache_dir / "hud.lock.yaml", "w") as f:  # noqa: ASYNC230
-                yaml.dump(lock_data, f, default_flow_style=False, sort_keys=False)
+            from .registry import save_to_registry
+            save_to_registry(lock_data, lock_data.get("image", ""), verbose=False)
         else:
             progress.update(task, description="[red]✗ Not found[/red]")

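The refactor above (and the build.py change below) routes all lock-file caching through a new .registry module that is not itself shown in this diff. Based on the code it replaces, the helpers likely behave roughly as sketched here; every name and detail below is an inference from the removed logic, not the released implementation.

    # Inferred shape of the registry helpers used above; illustrative only.
    from pathlib import Path

    import yaml


    def get_registry_dir() -> Path:
        """Local registry root, matching the ~/.hud/envs/ layout used by the old code."""
        return Path.home() / ".hud" / "envs"


    def extract_digest_from_image(reference: str) -> str:
        """Short digest for an image reference, else 'latest' (mirrors the removed logic)."""
        if "@sha256:" in reference:
            return reference.split("@sha256:")[-1][:12]
        return "latest"


    def load_from_registry(digest: str) -> dict | None:
        """Load ~/.hud/envs/<digest>/hud.lock.yaml if it exists."""
        lock_file = get_registry_dir() / digest / "hud.lock.yaml"
        if lock_file.exists():
            with open(lock_file) as f:
                return yaml.safe_load(f)
        return None


    def list_registry_entries() -> list[tuple[str, Path]]:
        """Return (digest, lock_file_path) pairs for every cached environment."""
        entries = []
        root = get_registry_dir()
        if root.exists():
            for env_dir in root.iterdir():
                lock_file = env_dir / "hud.lock.yaml"
                if env_dir.is_dir() and lock_file.exists():
                    entries.append((env_dir.name, lock_file))
        return entries


    def save_to_registry(lock_data: dict, image: str, verbose: bool = False) -> None:
        """Write lock data to ~/.hud/envs/<digest>/hud.lock.yaml (mirrors the removed caching code)."""
        cache_dir = get_registry_dir() / extract_digest_from_image(image)
        cache_dir.mkdir(parents=True, exist_ok=True)
        with open(cache_dir / "hud.lock.yaml", "w") as f:
            yaml.dump(lock_data, f, default_flow_style=False, sort_keys=False)
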
hud/cli/build.py CHANGED
@@ -17,6 +17,8 @@ from hud.clients import MCPClient
17
17
  from hud.utils.design import HUDDesign
18
18
  from hud.version import __version__ as hud_version
19
19
 
20
+ from .registry import save_to_registry
21
+
20
22
 
21
23
  def parse_version(version_str: str) -> tuple[int, int, int]:
22
24
  """Parse version string like '1.0.0' or '1.0' into tuple of integers."""
@@ -459,6 +461,11 @@ def build_environment(
459
461
  # Remove temp image after we're done
460
462
  subprocess.run(["docker", "rmi", temp_tag], capture_output=True) # noqa: S603, S607
461
463
 
464
+ # Add to local registry
465
+ if image_id:
466
+ # Save to local registry using the helper
467
+ save_to_registry(lock_content, lock_content.get("image", tag), verbose)
468
+
462
469
  # Print summary
463
470
  design.section_title("Build Complete")
464
471
 
hud/cli/debug.py CHANGED
@@ -167,7 +167,14 @@ async def debug_mcp_stdio(command: list[str], logger: CaptureLogger, max_phase:
                     break
             except Exception as e:
                 logger.error(f"Failed to parse MCP response: {e}")
-                continue
+                logger.error(f"Raw output that caused the error: {repr(line)}")
+                logger.hint("This usually means non-JSON output is being sent to STDOUT")
+                logger.hint("Common causes:")
+                logger.hint("  - Print statements in your server code")
+                logger.hint("  - Library warnings (use warnings.filterwarnings)")
+                logger.hint("  - Import-time output from dependencies")
+                phases_completed = 1  # Mark as failed
+                break  # Stop trying to parse

         if response and "result" in response:
             logger.success("MCP server initialized successfully")
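
The new hints above point at the usual culprit for this failure: anything written to STDOUT corrupts the stdio JSON-RPC stream. A minimal sketch of the server-side hygiene the hints suggest, using only the Python standard library (illustrative, not part of this package):

    # Keep STDOUT reserved for JSON-RPC when serving MCP over stdio.
    import logging
    import sys
    import warnings

    # Silence library warnings that would otherwise print during import or startup.
    warnings.filterwarnings("ignore")

    # Route logging (and any debug output) to STDERR instead of STDOUT.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    print("starting up", file=sys.stderr)  # never a bare print() in a stdio MCP server
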
hud/cli/eval.py ADDED
@@ -0,0 +1,226 @@
+"""HUD evaluation command for running tasks and datasets."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+from pathlib import Path
+from typing import Any, Literal
+
+import typer
+from datasets import load_dataset
+
+import hud
+from hud.agents import ClaudeAgent, OperatorAgent
+from hud.agents.misc.response_agent import ResponseAgent
+from hud.datasets import Task, run_dataset
+from hud.utils.design import HUDDesign
+
+logger = logging.getLogger(__name__)
+design = HUDDesign()
+
+
+def build_agent(
+    agent_type: Literal["claude", "openai"],
+    *,
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+) -> ClaudeAgent | OperatorAgent:
+    """Create and return the requested agent type."""
+
+    if agent_type == "openai":
+        allowed_tools = allowed_tools or ["openai_computer"]
+
+        return OperatorAgent(
+            allowed_tools=allowed_tools,
+            response_agent=ResponseAgent(),
+        )
+
+    # Fallback Claude agent (Anthropic)
+    model = model or "claude-sonnet-4-20250514"
+    allowed_tools = allowed_tools or ["anthropic_computer"]
+
+    return ClaudeAgent(
+        model=model,
+        allowed_tools=allowed_tools,
+        response_agent=ResponseAgent(),
+    )
+
+
+async def run_single_task(
+    source: str,
+    *,
+    agent_type: Literal["claude", "openai"] = "claude",
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    max_steps: int = 10,
+) -> None:
+    """Load one task and execute it."""
+
+    design.info("📊 Loading dataset…")
+
+    # Check if it's a single task JSON file
+    path = Path(source)
+    if path.exists() and path.suffix == ".json":
+        with open(path, "r") as f:
+            task_data = json.load(f)
+        task = Task(**task_data)
+    else:
+        # Load from HuggingFace dataset
+        dataset = load_dataset(source, split="train")
+
+        # Get first task from dataset
+        sample_task = dataset[0]  # type: ignore[index]
+        task = Task(**sample_task)  # type: ignore[arg-type]
+
+    task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
+
+    with hud.trace(name=task_prompt):
+        agent = build_agent(
+            agent_type,
+            model=model,
+            allowed_tools=allowed_tools,
+        )
+        design.info(task.prompt)
+        result = await agent.run(task, max_steps=max_steps)
+        design.success(f"Reward: {result.reward}")
+
+
+async def run_full_dataset(
+    source: str,
+    *,
+    agent_type: Literal["claude", "openai"] = "claude",
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    max_concurrent: int = 30,
+    max_steps: int = 50,
+) -> list[Any]:
+    """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
+
+    # Build agent class + config for run_dataset
+    if agent_type == "openai":
+        agent_class = OperatorAgent
+        agent_config: dict[str, Any] = {
+            "allowed_tools": allowed_tools or ["openai_computer"],
+        }
+    else:
+        agent_class = ClaudeAgent
+        agent_config = {
+            "model": model or "claude-sonnet-4-20250514",
+            "allowed_tools": allowed_tools or ["anthropic_computer"],
+        }
+
+    design.info("🚀 Running evaluation…")
+    return await run_dataset(
+        name=f"Evaluation {source.split('/')[-1]}",
+        dataset=source,
+        agent_class=agent_class,
+        agent_config=agent_config,
+        max_concurrent=max_concurrent,
+        metadata={"dataset": source},
+        max_steps=max_steps,
+        auto_respond=True,
+    )
+
+
+def eval_command(
+    source: str = typer.Argument(
+        ...,
+        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
+    ),
+    full: bool = typer.Option(
+        False,
+        "--full",
+        help="Run the entire dataset (omit for single-task debug mode)",
+    ),
+    agent: Literal["claude", "openai"] = typer.Option(
+        "claude",
+        "--agent",
+        help="Agent backend to use",
+    ),
+    model: str | None = typer.Option(
+        None,
+        "--model",
+        help="Model name for the chosen agent",
+    ),
+    allowed_tools: str | None = typer.Option(
+        None,
+        "--allowed-tools",
+        help="Comma-separated list of allowed tools",
+    ),
+    max_concurrent: int = typer.Option(
+        50,
+        "--max-concurrent",
+        help="Concurrency level for full-dataset mode",
+    ),
+    max_steps: int = typer.Option(
+        None,
+        "--max-steps",
+        help="Maximum steps per task (default: 10 for single, 50 for full)",
+    ),
+) -> None:
+    """🚀 Run evaluation on datasets or individual tasks with agents.
+
+    Examples:
+        # Evaluate a single task from SheetBench
+        hud eval hud-evals/SheetBench-50
+
+        # Evaluate the FULL SheetBench dataset with Claude
+        hud eval hud-evals/SheetBench-50 --full --agent claude
+
+        # Run a single task from a JSON file
+        hud eval task.json
+
+        # Run with OpenAI Operator agent
+        hud eval hud-evals/OSWorld-Gold-Beta --agent openai
+    """
+    from hud.settings import settings
+    import os
+
+    # Check for required API keys
+    if agent == "claude":
+        if not settings.anthropic_api_key or not os.environ.get("ANTHROPIC_API_KEY"):
+            design.error("ANTHROPIC_API_KEY is required for Claude agent")
+            design.info("Set it in your environment or .env file: ANTHROPIC_API_KEY=your-key-here")
+            raise typer.Exit(1)
+    elif agent == "openai":
+        if not settings.openai_api_key or not os.environ.get("OPENAI_API_KEY"):
+            design.error("OPENAI_API_KEY is required for OpenAI agent")
+            design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
+            raise typer.Exit(1)
+
+    # Check for HUD_API_KEY if using HUD services
+    if not settings.api_key or not os.environ.get("HUD_API_KEY"):
+        design.warning("HUD_API_KEY not set. Some features may be limited.")
+        design.info("Get your API key at: https://app.hud.so")
+
+    # Parse allowed tools
+    allowed_tools_list = (
+        [t.strip() for t in allowed_tools.split(",") if t.strip()]
+        if allowed_tools
+        else None
+    )
+
+    # Set default max_steps if not provided
+    if max_steps is None:
+        max_steps = 50 if full else 10
+
+    # Run evaluation
+    if full:
+        asyncio.run(run_full_dataset(
+            source,
+            agent_type=agent,
+            model=model,
+            allowed_tools=allowed_tools_list,
+            max_concurrent=max_concurrent,
+            max_steps=max_steps,
+        ))
+    else:
+        asyncio.run(run_single_task(
+            source,
+            agent_type=agent,
+            model=model,
+            allowed_tools=allowed_tools_list,
+            max_steps=max_steps,
+        ))
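
For reference, the single-task helper added above can also be driven directly from Python. A minimal sketch, assuming the module path shown in this diff and a local task.json in the format the command expects:

    # Minimal programmatic use of the new helper (assumes hud.cli.eval as added above).
    import asyncio

    from hud.cli.eval import run_single_task

    # Run the task defined in task.json with the Claude agent, capped at 10 steps.
    asyncio.run(run_single_task("task.json", agent_type="claude", max_steps=10))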