PyPI - hud-python - Versions diffs - 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl - Mend

hud-python 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (32) hide show

hud/agents/base.py +50 -1
hud/cli/__init__.py +187 -11
hud/cli/analyze_metadata.py +33 -42
hud/cli/build.py +7 -0
hud/cli/debug.py +8 -1
hud/cli/env_utils.py +133 -0
hud/cli/eval.py +302 -0
hud/cli/list_func.py +213 -0
hud/cli/mcp_server.py +3 -79
hud/cli/pull.py +20 -15
hud/cli/push.py +84 -41
hud/cli/registry.py +155 -0
hud/cli/remove.py +200 -0
hud/cli/runner.py +1 -1
hud/cli/tests/test_analyze_metadata.py +277 -0
hud/cli/tests/test_build.py +450 -0
hud/cli/tests/test_list_func.py +288 -0
hud/cli/tests/test_pull.py +400 -0
hud/cli/tests/test_push.py +379 -0
hud/cli/tests/test_registry.py +264 -0
hud/clients/base.py +13 -1
hud/tools/__init__.py +2 -0
hud/tools/response.py +54 -0
hud/utils/design.py +10 -0
hud/utils/mcp.py +14 -2
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/METADATA +12 -1
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/RECORD +32 -20
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/WHEEL +0 -0
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/licenses/LICENSE +0 -0

hud/cli/eval.py ADDED Viewed

@@ -0,0 +1,302 @@
+"""HUD evaluation command for running tasks and datasets."""
+from __future__ import annotations
+import asyncio
+import json
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Literal
+import typer
+import hud
+from hud.utils.design import HUDDesign
+if TYPE_CHECKING:
+    from datasets import Dataset
+    from hud.agents import ClaudeAgent, OperatorAgent
+    from hud.agents.misc.response_agent import ResponseAgent
+# Module-level logger named after this module (standard ``logging`` convention).
+logger = logging.getLogger(__name__)
+# Shared HUDDesign instance; the functions below use it for their
+# info/success/warning/error console messages.
+design = HUDDesign()
+def build_agent(
+    agent_type: Literal["claude", "openai"],
+    *,
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+) -> Any:
+    """Instantiate the agent backend selected by ``agent_type``.
+    Args:
+        agent_type: ``"openai"`` builds an ``OperatorAgent``; any other value
+            builds a ``ClaudeAgent``.
+        model: Model name for the Claude agent (defaults to
+            ``"claude-sonnet-4-20250514"``). Not passed to the OpenAI agent.
+        allowed_tools: Tool allow-list; when empty or ``None`` a per-backend
+            default (``openai_computer`` / ``anthropic_computer``) is used.
+    Returns:
+        The constructed agent, wired with a fresh ``ResponseAgent``.
+    Raises:
+        typer.Exit: With code 1 when the agent packages cannot be imported.
+    """
+    install_hint = "Please install with: pip install 'hud-python[agent]'"
+    # Agent modules are imported on demand so the CLI can still load when the
+    # optional agent dependencies are missing.
+    try:
+        from hud.agents.misc.response_agent import ResponseAgent
+    except ImportError as exc:
+        design.error("Agent dependencies are not installed. " + install_hint)
+        raise typer.Exit(1) from exc
+    if agent_type != "openai":
+        # Everything that is not "openai" resolves to the Claude (Anthropic) agent.
+        try:
+            from hud.agents import ClaudeAgent
+        except ImportError as exc:
+            design.error("Claude agent dependencies are not installed. " + install_hint)
+            raise typer.Exit(1) from exc
+        claude_model = model if model else "claude-sonnet-4-20250514"
+        claude_tools = allowed_tools if allowed_tools else ["anthropic_computer"]
+        return ClaudeAgent(
+            model=claude_model,
+            allowed_tools=claude_tools,
+            response_agent=ResponseAgent(),
+        )
+    try:
+        from hud.agents import OperatorAgent
+    except ImportError as exc:
+        design.error("OpenAI agent dependencies are not installed. " + install_hint)
+        raise typer.Exit(1) from exc
+    operator_tools = allowed_tools if allowed_tools else ["openai_computer"]
+    return OperatorAgent(
+        allowed_tools=operator_tools,
+        response_agent=ResponseAgent(),
+    )
+async def run_single_task(
+    source: str,
+    *,
+    agent_type: Literal["claude", "openai"] = "claude",
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    max_steps: int = 10,
+) -> None:
+    """Load one task and execute it.
+    ``source`` is either the path of an existing ``.json`` file holding one
+    task definition, or any other string, which is handed to
+    ``datasets.load_dataset`` (``train`` split); the first row is then used.
+    Args:
+        source: Task JSON path or dataset identifier.
+        agent_type: Agent backend, forwarded to ``build_agent``.
+        model: Optional model name, forwarded to ``build_agent``.
+        allowed_tools: Optional tool allow-list, forwarded to ``build_agent``.
+        max_steps: Step budget passed to ``agent.run``.
+    Raises:
+        typer.Exit: With code 1 when optional dependencies are missing or the
+            dataset contains no rows.
+    """
+    design.info("📊 Loading dataset…")
+    # Import Task lazily
+    try:
+        from hud.datasets import Task
+    except ImportError as e:
+        design.error(
+            "Dataset dependencies are not installed. "
+            "Please install with: pip install 'hud-python[agent]'"
+        )
+        raise typer.Exit(1) from e
+    # Check if it's a single task JSON file
+    path = Path(source)
+    if path.exists() and path.suffix == ".json":
+        # JSON is UTF-8 by specification; relying on the platform default
+        # encoding breaks non-ASCII prompts on e.g. Windows locales.
+        with open(path, encoding="utf-8") as f:
+            task_data = json.load(f)
+        task = Task(**task_data)
+    else:
+        # Load from HuggingFace dataset
+        try:
+            from datasets import load_dataset
+        except ImportError as e:
+            design.error(
+                "Datasets library is not installed. "
+                "Please install with: pip install 'hud-python[agent]'"
+            )
+            raise typer.Exit(1) from e
+        dataset = load_dataset(source, split="train")
+        # Fail with a readable message instead of a bare IndexError on dataset[0]
+        if len(dataset) == 0:  # type: ignore[arg-type]
+            design.error(f"Dataset '{source}' has no tasks in its 'train' split")
+            raise typer.Exit(1)
+        # Get first task from dataset
+        sample_task = dataset[0]  # type: ignore[index]
+        task = Task(**sample_task)  # type: ignore[arg-type]
+    # Trace names are capped at 50 characters of the prompt plus an ellipsis
+    task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
+    with hud.trace(name=task_prompt):
+        agent = build_agent(
+            agent_type,
+            model=model,
+            allowed_tools=allowed_tools,
+        )
+        design.info(task.prompt)
+        result = await agent.run(task, max_steps=max_steps)
+        design.success(f"Reward: {result.reward}")
+async def run_full_dataset(
+    source: str,
+    *,
+    agent_type: Literal["claude", "openai"] = "claude",
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    max_concurrent: int = 30,
+    max_steps: int = 50,
+) -> list[Any]:
+    """Evaluate every task of ``source`` through ``hud.datasets.run_dataset``.
+    Args:
+        source: Dataset identifier; also recorded in the run metadata, and its
+            last ``/``-separated segment is used in the run name.
+        agent_type: ``"openai"`` selects ``OperatorAgent``; anything else
+            selects ``ClaudeAgent``.
+        model: Model name for the Claude agent (unused for OpenAI).
+        allowed_tools: Tool allow-list; a per-backend default is used if empty.
+        max_concurrent: Forwarded to ``run_dataset``.
+        max_steps: Forwarded to ``run_dataset``.
+    Returns:
+        Whatever ``run_dataset`` returns.
+    Raises:
+        typer.Exit: With code 1 when optional dependencies are missing.
+    """
+    install_hint = "Please install with: pip install 'hud-python[agent]'"
+    # run_dataset is imported on demand, like the agent classes below.
+    try:
+        from hud.datasets import run_dataset
+    except ImportError as exc:
+        design.error("Dataset dependencies are not installed. " + install_hint)
+        raise typer.Exit(1) from exc
+    # run_dataset receives the agent class plus its constructor kwargs.
+    agent_config: dict[str, Any]
+    if agent_type != "openai":
+        try:
+            from hud.agents import ClaudeAgent
+        except ImportError as exc:
+            design.error("Claude agent dependencies are not installed. " + install_hint)
+            raise typer.Exit(1) from exc
+        agent_class = ClaudeAgent
+        agent_config = {
+            "model": model if model else "claude-sonnet-4-20250514",
+            "allowed_tools": allowed_tools if allowed_tools else ["anthropic_computer"],
+        }
+    else:
+        try:
+            from hud.agents import OperatorAgent
+        except ImportError as exc:
+            design.error("OpenAI agent dependencies are not installed. " + install_hint)
+            raise typer.Exit(1) from exc
+        agent_class = OperatorAgent
+        agent_config = {
+            "allowed_tools": allowed_tools if allowed_tools else ["openai_computer"],
+        }
+    design.info("🚀 Running evaluation…")
+    dataset_label = source.split("/")[-1]
+    results = await run_dataset(
+        name=f"Evaluation {dataset_label}",
+        dataset=source,
+        agent_class=agent_class,
+        agent_config=agent_config,
+        max_concurrent=max_concurrent,
+        metadata={"dataset": source},
+        max_steps=max_steps,
+        auto_respond=True,
+    )
+    return results
+def eval_command(
+    source: str = typer.Argument(
+        ...,
+        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
+    ),
+    full: bool = typer.Option(
+        False,
+        "--full",
+        help="Run the entire dataset (omit for single-task debug mode)",
+    ),
+    agent: Literal["claude", "openai"] = typer.Option(
+        "claude",
+        "--agent",
+        help="Agent backend to use",
+    ),
+    model: str | None = typer.Option(
+        None,
+        "--model",
+        help="Model name for the chosen agent",
+    ),
+    allowed_tools: str | None = typer.Option(
+        None,
+        "--allowed-tools",
+        help="Comma-separated list of allowed tools",
+    ),
+    max_concurrent: int = typer.Option(
+        50,
+        "--max-concurrent",
+        help="Concurrency level for full-dataset mode",
+    ),
+    max_steps: int | None = typer.Option(
+        None,
+        "--max-steps",
+        help="Maximum steps per task (default: 10 for single, 50 for full)",
+    ),
+) -> None:
+    """🚀 Run evaluation on datasets or individual tasks with agents.
+    Examples:
+        # Evaluate a single task from SheetBench
+        hud eval hud-evals/SheetBench-50
+        # Evaluate the FULL SheetBench dataset with Claude
+        hud eval hud-evals/SheetBench-50 --full --agent claude
+        # Run a single task from a JSON file
+        hud eval task.json
+        # Run with OpenAI Operator agent
+        hud eval hud-evals/OSWorld-Gold-Beta --agent openai
+    """
+    # NOTE: the docstring above is rendered by Typer as the command help text.
+    from hud.settings import settings
+    import os
+    # Check for required API keys.
+    # A key counts as configured when EITHER the settings object OR the process
+    # environment provides it. The previous `not a or not b` test demanded both,
+    # which rejected a key supplied only through settings (presumably how a
+    # `.env` file is picked up - the error hint below recommends exactly that).
+    if agent == "claude":
+        if not (settings.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY")):
+            design.error("ANTHROPIC_API_KEY is required for Claude agent")
+            design.info("Set it in your environment or .env file: ANTHROPIC_API_KEY=your-key-here")
+            raise typer.Exit(1)
+    elif agent == "openai":
+        if not (settings.openai_api_key or os.environ.get("OPENAI_API_KEY")):
+            design.error("OPENAI_API_KEY is required for OpenAI agent")
+            design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
+            raise typer.Exit(1)
+    # Check for HUD_API_KEY if using HUD services (warning only, never fatal)
+    if not (settings.api_key or os.environ.get("HUD_API_KEY")):
+        design.warning("HUD_API_KEY not set. Some features may be limited.")
+        design.info("Get your API key at: https://app.hud.so")
+    # Parse allowed tools: split on commas, trim whitespace, drop empty items
+    allowed_tools_list = (
+        [t.strip() for t in allowed_tools.split(",") if t.strip()]
+        if allowed_tools
+        else None
+    )
+    # Set default max_steps if not provided
+    if max_steps is None:
+        max_steps = 50 if full else 10
+    # Run evaluation
+    if full:
+        asyncio.run(run_full_dataset(
+            source,
+            agent_type=agent,
+            model=model,
+            allowed_tools=allowed_tools_list,
+            max_concurrent=max_concurrent,
+            max_steps=max_steps,
+        ))
+    else:
+        asyncio.run(run_single_task(
+            source,
+            agent_type=agent,
+            model=model,
+            allowed_tools=allowed_tools_list,
+            max_steps=max_steps,
+        ))

hud/cli/list_func.py ADDED Viewed

@@ -0,0 +1,213 @@
+"""List HUD environments from local registry."""
+from __future__ import annotations
+import re
+from datetime import datetime
+from pathlib import Path
+import typer
+import yaml
+from rich.table import Table
+from hud.utils.design import HUDDesign
+from .registry import get_registry_dir, list_registry_entries, extract_name_and_tag
+def format_timestamp(timestamp: float | None) -> str:
+    """Format a POSIX timestamp as a short relative age such as ``"3h ago"``.
+    Args:
+        timestamp: Seconds since the epoch, or ``None``.
+    Returns:
+        ``"unknown"`` for a falsy timestamp (``None`` or ``0``); ``"just now"``
+        for anything under one minute old, including timestamps in the future;
+        otherwise the age in whole minutes, hours, days, 30-day months or
+        365-day years.
+    """
+    if not timestamp:
+        return "unknown"
+    # Compare epoch seconds directly. Subtracting naive local datetimes made
+    # future timestamps come out as e.g. "23h ago" (timedelta normalises to
+    # days=-1 with a large positive .seconds) and skewed across DST changes.
+    elapsed = int(datetime.now().timestamp() - timestamp)
+    if elapsed < 60:
+        return "just now"
+    if elapsed < 3600:
+        return f"{elapsed // 60}m ago"
+    days = elapsed // 86400
+    if days == 0:
+        return f"{elapsed // 3600}h ago"
+    if days < 30:
+        return f"{days}d ago"
+    if days < 365:
+        return f"{days // 30}mo ago"
+    return f"{days // 365}y ago"
+def list_environments(
+    filter_name: str | None = None,
+    json_output: bool = False,
+    show_all: bool = False,
+    verbose: bool = False,
+) -> None:
+    """List all HUD environments in the local registry.
+    Reads every lock file yielded by ``list_registry_entries()``, optionally
+    filters by name, sorts newest first (by lock-file mtime) and prints either
+    a JSON array or a rich table followed by usage hints.
+    Args:
+        filter_name: Case-insensitive substring that the environment name must
+            contain; ``None`` or empty keeps everything.
+        json_output: Print a JSON array to stdout instead of the table.
+        show_all: Add the digest column to the table.
+        verbose: Add the digest column, keep full descriptions, report lock
+            files that failed to load, and print the registry location.
+    """
+    design = HUDDesign()
+    if not json_output:
+        design.header("HUD Environment Registry")
+    # Check for environment directory
+    env_dir = get_registry_dir()
+    if not env_dir.exists():
+        if json_output:
+            print("[]")
+        else:
+            design.info("No environments found in local registry.")
+            design.info("")
+            design.info("Pull environments with: [cyan]hud pull <org/name:tag>[/cyan]")
+            design.info("Build environments with: [cyan]hud build[/cyan]")
+        return
+    # Collect all environments using the registry helper
+    environments = []
+    for digest, lock_file in list_registry_entries():
+        try:
+            # Read lock file
+            with open(lock_file) as f:
+                lock_data = yaml.safe_load(f)
+            # Extract metadata
+            image = lock_data.get("image", "unknown")
+            name, tag = extract_name_and_tag(image)
+            # Apply filter if specified
+            if filter_name and filter_name.lower() not in name.lower():
+                continue
+            # `metadata:` / `tools:` may be present but null in the YAML; treat
+            # that like "missing" instead of letting the resulting
+            # AttributeError/TypeError silently drop the whole environment.
+            metadata = lock_data.get("metadata") or {}
+            description = metadata.get("description", "")
+            tools_count = len(metadata.get("tools") or [])
+            # Get file modification time as pulled time
+            pulled_time = lock_file.stat().st_mtime
+            environments.append({
+                "name": name,
+                "tag": tag,
+                "digest": digest,
+                "description": description,
+                "tools_count": tools_count,
+                "pulled_time": pulled_time,
+                "image": image,
+                "path": str(lock_file),
+            })
+        except Exception as e:
+            # Best effort: one unreadable lock file must not break the listing.
+            if verbose:
+                design.warning(f"Failed to read {lock_file}: {e}")
+    # Sort by pulled time (newest first)
+    environments.sort(key=lambda x: x["pulled_time"], reverse=True)
+    if json_output:
+        # The collected dicts already hold exactly the JSON fields, in order.
+        import json
+        print(json.dumps(environments, indent=2))
+        return
+    if not environments:
+        design.info("No environments found matching criteria.")
+        design.info("")
+        design.info("Pull environments with: [cyan]hud pull <org/name:tag>[/cyan]")
+        design.info("Build environments with: [cyan]hud build[/cyan]")
+        return
+    # Create table
+    with_digest = show_all or verbose
+    table = Table(title=f"Found {len(environments)} environment{'s' if len(environments) != 1 else ''}")
+    table.add_column("Environment", style="cyan", no_wrap=True)
+    table.add_column("Description", style="white")
+    table.add_column("Tools", justify="right", style="yellow")
+    table.add_column("Pulled", style="dim")
+    if with_digest:
+        table.add_column("Digest", style="dim")
+    # Add rows
+    for env in environments:
+        # Truncate description if too long
+        desc = env["description"]
+        if desc and len(desc) > 50 and not verbose:
+            desc = desc[:47] + "..."
+        # Combine name and tag for easy copying
+        full_ref = f"{env['name']}:{env['tag']}"
+        row = [
+            full_ref,
+            desc or "[dim]No description[/dim]",
+            str(env["tools_count"]),
+            format_timestamp(env["pulled_time"]),
+        ]
+        if with_digest:
+            row.append(env["digest"][:12])
+        table.add_row(*row)
+    design.print(table)
+    design.info("")
+    # Show usage hints
+    design.section_title("Usage")
+    # `environments` is non-empty here (the empty case returned above), so the
+    # most recently pulled environment can always serve as the example.
+    example_env = environments[0]
+    example_ref = f"{example_env['name']}:{example_env['tag']}"
+    design.info(f"Run an environment: [cyan]hud run {example_ref}[/cyan]")
+    design.info(f"Analyze tools: [cyan]hud analyze {example_ref}[/cyan]")
+    design.info(f"Debug server: [cyan]hud debug {example_ref}[/cyan]")
+    design.info("Pull more environments: [cyan]hud pull <org/name:tag>[/cyan]")
+    design.info("Build new environments: [cyan]hud build[/cyan]")
+    if verbose:
+        design.info("")
+        design.info(f"[dim]Registry location: {env_dir}[/dim]")
+def list_command(
+    filter_name: str | None = typer.Option(
+        None,
+        "--filter",
+        "-f",
+        help="Filter environments by name (case-insensitive)",
+    ),
+    json_output: bool = typer.Option(
+        False,
+        "--json",
+        help="Output as JSON",
+    ),
+    show_all: bool = typer.Option(
+        False,
+        "--all",
+        "-a",
+        help="Show all columns including digest",
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Show detailed output",
+    ),
+) -> None:
+    """📋 List all HUD environments in local registry.
+    Shows environments pulled with 'hud pull' or built with 'hud build',
+    stored in ~/.hud/envs/
+    Examples:
+        hud list                    # List all environments
+        hud list --filter text      # Filter by name
+        hud list --json            # Output as JSON
+        hud list --all             # Show digest column
+        hud list --verbose         # Show full descriptions
+    """
+    # Thin CLI shim: the parsed options are handed straight to list_environments.
+    list_environments(
+        filter_name=filter_name,
+        json_output=json_output,
+        show_all=show_all,
+        verbose=verbose,
+    )

hud/cli/mcp_server.py CHANGED Viewed

@@ -13,92 +13,16 @@ import toml
 from fastmcp import FastMCP
 from hud.utils.design import HUDDesign
-from .docker_utils import get_docker_cmd, image_exists, inject_supervisor
+from .docker_utils import get_docker_cmd, inject_supervisor
+from .env_utils import get_image_name, update_pyproject_toml, build_environment, image_exists
 # Global design instance
 design = HUDDesign()
-def get_image_name(directory: str | Path, image_override: str | None = None) -> tuple[str, str]:
-    """
-    Resolve image name with source tracking.
-        Returns:
-        Tuple of (image_name, source) where source is "override", "cache", or "auto"
-    """
-    if image_override:
-        return image_override, "override"
-    # Check pyproject.toml
-    pyproject_path = Path(directory) / "pyproject.toml"
-    if pyproject_path.exists():
-        try:
-            with open(pyproject_path) as f:
-                config = toml.load(f)
-            if config.get("tool", {}).get("hud", {}).get("image"):
-                return config["tool"]["hud"]["image"], "cache"
-        except Exception:
-            pass  # Silent failure, will use auto-generated name
-    # Auto-generate with :dev tag
-    dir_path = Path(directory).resolve()  # Get absolute path first
-    dir_name = dir_path.name
-    if not dir_name or dir_name == ".":
-        # If we're in root or have empty name, use parent directory
-        dir_name = dir_path.parent.name
-    clean_name = dir_name.replace("_", "-")
-    return f"hud-{clean_name}:dev", "auto"
-def update_pyproject_toml(directory: str | Path, image_name: str, silent: bool = False) -> None:
-    """Update pyproject.toml with image name."""
-    pyproject_path = Path(directory) / "pyproject.toml"
-    if pyproject_path.exists():
-        try:
-            with open(pyproject_path) as f:
-                config = toml.load(f)
-            # Ensure [tool.hud] exists
-            if "tool" not in config:
-                config["tool"] = {}
-            if "hud" not in config["tool"]:
-                config["tool"]["hud"] = {}
-            # Update image name
-            config["tool"]["hud"]["image"] = image_name
-            # Write back
-            with open(pyproject_path, "w") as f:
-                toml.dump(config, f)
-            if not silent:
-                design.success(f"Updated pyproject.toml with image: {image_name}")
-        except Exception as e:
-            if not silent:
-                design.warning(f"Could not update pyproject.toml: {e}")
 def build_and_update(directory: str | Path, image_name: str, no_cache: bool = False) -> None:
     """Build Docker image and update pyproject.toml."""
-    build_cmd = ["docker", "build", "-t", image_name]
-    if no_cache:
-        build_cmd.append("--no-cache")
-    build_cmd.append(str(directory))
-    design.info(f"🔨 Building image: {image_name}{' (no cache)' if no_cache else ''}")
-    design.info("")  # Empty line before Docker output
-    # Just run Docker build directly - it has its own nice live display
-    result = subprocess.run(build_cmd)  # noqa: S603
-    if result.returncode == 0:
-        design.info("")  # Empty line after Docker output
-        design.success(f"Build successful! Image: {image_name}")
-        # Update pyproject.toml (silently since we already showed success)
-        update_pyproject_toml(directory, image_name, silent=True)
-    else:
-        design.error("Build failed!")
+    if not build_environment(directory, image_name, no_cache):
         raise click.Abort

hud/cli/pull.py CHANGED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 import subprocess
 from pathlib import Path
+from urllib.parse import quote
 import click
 import requests
@@ -14,6 +15,8 @@ from rich.table import Table
 from hud.settings import settings
 from hud.utils.design import HUDDesign
+from .registry import save_to_registry
 def get_docker_manifest(image: str) -> dict | None:
     """Get manifest from Docker registry without pulling the image."""
@@ -59,7 +62,9 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
         if "/" in reference and ":" not in reference:
             reference = f"{reference}:latest"
-        registry_url = f"{settings.hud_telemetry_url.rstrip('/')}/registry/envs/{reference}"
+        # URL-encode the path segments to handle special characters in tags
+        url_safe_path = "/".join(quote(part, safe="") for part in reference.split("/"))
+        registry_url = f"{settings.hud_telemetry_url.rstrip('/')}/registry/envs/{url_safe_path}"
         headers = {}
         if settings.api_key:
@@ -77,7 +82,18 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
             else:
                 # Try to treat the whole response as lock data
                 return data
+        elif response.status_code == 404:
+            # Not found - expected error, return None silently
+            return None
+        elif response.status_code == 401:
+            # Authentication issue - might be a private environment
+            return None
+        else:
+            # Other errors - also return None but could log if verbose
+            return None
+    except requests.exceptions.Timeout:
+        return None
+    except requests.exceptions.ConnectionError:
         return None
     except Exception:
         return None
@@ -282,19 +298,8 @@ def pull_environment(
     # Store lock file locally if we have full lock data (not minimal manifest data)
     if lock_data and lock_data.get("source") != "docker-manifest":
-        # Extract digest from image ref
-        digest = image_ref.split("@sha256:")[-1][:12] if "@sha256:" in image_ref else "latest"
-        # Store under ~/.hud/envs/<digest>/
-        local_env_dir = Path.home() / ".hud" / "envs" / digest
-        local_env_dir.mkdir(parents=True, exist_ok=True)
-        local_lock_path = local_env_dir / "hud.lock.yaml"
-        with open(local_lock_path, "w") as f:
-            yaml.dump(lock_data, f, default_flow_style=False, sort_keys=False)
-        if verbose:
-            design.info(f"Stored lock file: {local_lock_path}")
+        # Save to local registry using the helper
+        save_to_registry(lock_data, image_ref, verbose)
     # Success!
     design.success("Pull complete!")

hud-python 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl