PyPI - hud-python - Versions diffs - 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl - Mend

hud-python 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (32) hide show

hud/agents/base.py +50 -1
hud/cli/__init__.py +187 -11
hud/cli/analyze_metadata.py +33 -42
hud/cli/build.py +7 -0
hud/cli/debug.py +8 -1
hud/cli/env_utils.py +133 -0
hud/cli/eval.py +302 -0
hud/cli/list_func.py +213 -0
hud/cli/mcp_server.py +3 -79
hud/cli/pull.py +20 -15
hud/cli/push.py +84 -41
hud/cli/registry.py +155 -0
hud/cli/remove.py +200 -0
hud/cli/runner.py +1 -1
hud/cli/tests/test_analyze_metadata.py +277 -0
hud/cli/tests/test_build.py +450 -0
hud/cli/tests/test_list_func.py +288 -0
hud/cli/tests/test_pull.py +400 -0
hud/cli/tests/test_push.py +379 -0
hud/cli/tests/test_registry.py +264 -0
hud/clients/base.py +13 -1
hud/tools/__init__.py +2 -0
hud/tools/response.py +54 -0
hud/utils/design.py +10 -0
hud/utils/mcp.py +14 -2
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/METADATA +12 -1
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/RECORD +32 -20
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/WHEEL +0 -0
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.8.dist-info → hud_python-0.4.10.dist-info}/licenses/LICENSE +0 -0

hud/agents/base.py CHANGED Viewed

@@ -85,6 +85,7 @@ class MCPAgent(ABC):
         self._tool_map: dict[str, types.Tool] = {}  # Simplified: just name to tool
         self.screenshot_history: list[str] = []
         self._auto_trace = auto_trace
+        self._auto_trace_cm: Any | None = None  # Store auto-created trace context manager
         self.initialization_complete = False
         # Response agent to automatically interact with the model
@@ -303,6 +304,9 @@ class MCPAgent(ABC):
                             except Exception as e:
                                 logger.warning("ResponseAgent failed: %s", e)
                         if decision == "STOP":
+                            # Try to submit response through lifecycle tool
+                            await self._maybe_submit_response(response, messages)
                             logger.info("Stopping execution")
                             final_response = response
                             break
@@ -483,6 +487,40 @@ class MCPAgent(ABC):
             self._available_tools.append(tool)
             # Simplified mapping - just tool name to tool
             self._tool_map[tool.name] = tool
+            # Auto-detect response tool as a lifecycle tool
+            if tool.name == "response" and "response" not in self.lifecycle_tools:
+                logger.debug("Auto-detected 'response' tool as a lifecycle tool")
+                self.lifecycle_tools.append("response")
+    async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
+        """Forward the agent's final answer to the ``response`` lifecycle tool, if any.
+
+        Nothing happens unless a tool named ``response`` is both registered as a
+        lifecycle tool and present in the tool map. Any exception raised while
+        calling the tool or formatting its results is logged and swallowed.
+
+        Args:
+            response: The agent's response; its ``content`` is sent to the tool.
+            messages: The current message history. The formatted tool results
+                are appended to this list in place.
+        """
+        tool_name = "response"
+        # Guard: nothing to do when no response lifecycle tool is available
+        if tool_name not in self.lifecycle_tools or tool_name not in self._tool_map:
+            return
+        logger.debug("Calling response lifecycle tool")
+        try:
+            call = MCPToolCall(
+                name=tool_name,
+                arguments={"response": response.content, "messages": messages},
+            )
+            results = await self.call_tools(call)
+            # Turn the tool output into messages and record them in the history
+            formatted = await self.format_tool_results([call], results)
+            messages.extend(formatted)
+            logger.info("Response lifecycle tool executed, marking task as done")
+        except Exception as e:
+            logger.error("Response lifecycle tool failed: %s", e)
     async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
         """Inject metadata into the metadata of the initialize request."""
@@ -491,7 +529,7 @@ class MCPAgent(ABC):
                 mcp_config,
                 MCPConfigPatch(meta=self.metadata),
             )
-        setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
+        self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
     def get_available_tools(self) -> list[types.Tool]:
         """Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
@@ -532,6 +570,17 @@ class MCPAgent(ABC):
     async def _cleanup(self) -> None:
         """Cleanup resources."""
+        # Clean up auto-created trace if any
+        if self._auto_trace_cm:
+            try:
+                self._auto_trace_cm.__exit__(None, None, None)
+                logger.info("Closed auto-created trace")
+            except Exception as e:
+                logger.warning("Failed to close auto-created trace: %s", e)
+            finally:
+                self._auto_trace_cm = None
+        # Clean up auto-created client
         if self._auto_created_client and self.mcp_client:
             try:
                 await self.mcp_client.shutdown()

hud/cli/__init__.py CHANGED Viewed

@@ -23,9 +23,11 @@ from .clone import clone_repository, get_clone_message, print_error, print_tutor
 from .cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
 from .debug import debug_mcp_stdio
 from .init import create_environment
+from . import list_func as list_module
 from .mcp_server import run_mcp_dev_server
 from .pull import pull_command
 from .push import push_command
+from .remove import remove_command
 from .utils import CaptureLogger
 # Create the main Typer app
@@ -129,7 +131,7 @@ def analyze(
 def debug(
     params: list[str] = typer.Argument(  # type: ignore[arg-type]  # noqa: B008
         None,
-        help="Docker image followed by optional Docker run arguments (e.g., 'hud-image:latest -e KEY=value')",  # noqa: E501
+        help="Docker image, environment directory, or config file followed by optional Docker arguments",  # noqa: E501
     ),
     config: Path = typer.Option(  # noqa: B008
         None,
@@ -145,6 +147,12 @@ def debug(
         "--cursor",
         help="Debug a server from Cursor config",
     ),
+    build: bool = typer.Option(
+        False,
+        "--build",
+        "-b",
+        help="Build image before debugging (for directory mode)",
+    ),
     max_phase: int = typer.Option(
         5,
         "--max-phase",
@@ -157,15 +165,24 @@ def debug(
     """🐛 Debug MCP environment - test initialization, tools, and readiness.
     Examples:
-        hud debug hud-text-2048:latest
-        hud debug my-mcp-server:v1 -e API_KEY=xxx -p 8080:8080
+        hud debug .                              # Debug current directory
+        hud debug environments/browser           # Debug specific directory
+        hud debug . --build                      # Build then debug
+        hud debug hud-text-2048:latest          # Debug Docker image
+        hud debug my-mcp-server:v1 -e API_KEY=xxx
         hud debug --config mcp-config.json
         hud debug --cursor text-2048-dev
-        hud debug hud-browser:dev --max-phase 3
+        hud debug . --max-phase 3               # Stop after phase 3
     """
+    # Import here to avoid circular imports
+    from .env_utils import get_image_name, is_environment_directory, build_environment, image_exists
+    from hud.utils.design import HUDDesign
+    design = HUDDesign()
     # Determine the command to run
     command = None
+    docker_args = []
     if config:
         # Load config from JSON file
@@ -183,13 +200,44 @@ def debug(
             console.print(f"[red]❌ {error or 'Failed to parse cursor config'}[/red]")
             raise typer.Exit(1)
     elif params:
-        image, *docker_args = params
-        # Build Docker command
-        command = ["docker", "run", "--rm", "-i", *docker_args, image]
+        first_param = params[0]
+        docker_args = params[1:] if len(params) > 1 else []
+        # Check if it's a directory
+        if Path(first_param).exists() and is_environment_directory(first_param):
+            # Directory mode - like hud dev
+            directory = first_param
+            # Get or generate image name
+            image_name, source = get_image_name(directory)
+            if source == "auto":
+                design.info(f"Auto-generated image name: {image_name}")
+            # Build if requested or if image doesn't exist
+            if build or not image_exists(image_name):
+                if not build and not image_exists(image_name):
+                    if typer.confirm(f"Image {image_name} not found. Build it now?"):
+                        build = True
+                    else:
+                        raise typer.Exit(1)
+                if build:
+                    if not build_environment(directory, image_name):
+                        raise typer.Exit(1)
+            # Build Docker command
+            command = ["docker", "run", "--rm", "-i", *docker_args, image_name]
+        else:
+            # Assume it's an image name
+            image = first_param
+            command = ["docker", "run", "--rm", "-i", *docker_args, image]
     else:
-        console.print("[red]Error: Must specify either a Docker image, --config, or --cursor[/red]")
+        console.print("[red]Error: Must specify a directory, Docker image, --config, or --cursor[/red]")
         console.print("\nExamples:")
-        console.print("  hud debug hud-text-2048:latest")
+        console.print("  hud debug .                      # Debug current directory")
+        console.print("  hud debug environments/browser   # Debug specific directory")
+        console.print("  hud debug hud-text-2048:latest  # Debug Docker image")
         console.print("  hud debug --config mcp-config.json")
         console.print("  hud debug --cursor my-server")
         raise typer.Exit(1)
@@ -442,7 +490,8 @@ def run(
         # Get URL from options or environment
         if not url:
-            url = os.getenv("HUD_MCP_URL", "https://mcp.hud.so/v3/mcp")
+            from hud.settings import settings
+            url = settings.hud_mcp_url
         run_remote_server(image, docker_args, transport, port, url, api_key, run_id, verbose)
@@ -561,6 +610,63 @@ def pull(
     pull_command(target, lock_file, yes, verify_only, verbose)
+# Exposed on the CLI as `hud list`; the Python function name differs from the
+# command name (presumably to avoid shadowing the builtin `list` - confirm).
+@app.command(name="list")
+def list_environments(
+    filter_name: str | None = typer.Option(
+        None, "--filter", "-f", help="Filter environments by name (case-insensitive)"
+    ),
+    json_output: bool = typer.Option(
+        False, "--json", help="Output as JSON"
+    ),
+    show_all: bool = typer.Option(
+        False, "--all", "-a", help="Show all columns including digest"
+    ),
+    verbose: bool = typer.Option(
+        False, "--verbose", "-v", help="Show detailed output"
+    ),
+) -> None:
+    """📋 List all HUD environments in local registry.
+    Shows environments pulled with 'hud pull' stored in ~/.hud/envs/
+    Examples:
+        hud list                    # List all environments
+        hud list --filter text      # Filter by name
+        hud list --json            # Output as JSON
+        hud list --all             # Show digest column
+        hud list --verbose         # Show full descriptions
+    """
+    # Thin wrapper: all work is delegated to list_func.list_command. The options
+    # are forwarded positionally, in the order they are declared above.
+    list_module.list_command(filter_name, json_output, show_all, verbose)
+# Registered without an explicit name, so the CLI command is derived from the
+# function name (`hud remove`).
+@app.command()
+def remove(
+    target: str | None = typer.Argument(
+        None,
+        help="Environment to remove (digest, name, or 'all' for all environments)"
+    ),
+    yes: bool = typer.Option(
+        False, "--yes", "-y", help="Skip confirmation prompt"
+    ),
+    verbose: bool = typer.Option(
+        False, "--verbose", "-v", help="Show detailed output"
+    ),
+) -> None:
+    """🗑️ Remove HUD environments from local registry.
+    Removes environment metadata from ~/.hud/envs/
+    Note: This does not remove the Docker images.
+    Examples:
+        hud remove abc123              # Remove by digest
+        hud remove text_2048           # Remove by name
+        hud remove hudpython/test_init # Remove by full name
+        hud remove all                 # Remove all environments
+        hud remove all --yes           # Remove all without confirmation
+    """
+    # Thin wrapper: `target` may be None (the argument is optional); handling of
+    # that case is left to remove_command, which receives the values positionally.
+    remove_command(target, yes, verbose)
 @app.command()
 def init(
     name: str = typer.Argument(None, help="Environment name (default: current directory name)"),
@@ -592,6 +698,76 @@ def quickstart() -> None:
     clone("https://github.com/hud-evals/quickstart.git")
+# NOTE: the function name doubles as the CLI command name (`hud eval`), which is
+# why it shadows the builtin `eval` inside this module.
+@app.command()
+def eval(
+    source: str = typer.Argument(
+        ...,
+        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
+    ),
+    full: bool = typer.Option(
+        False,
+        "--full",
+        help="Run the entire dataset (omit for single-task debug mode)",
+    ),
+    agent: str = typer.Option(
+        "claude",
+        "--agent",
+        help="Agent backend to use (claude or openai)",
+    ),
+    model: str | None = typer.Option(
+        None,
+        "--model",
+        help="Model name for the chosen agent",
+    ),
+    allowed_tools: str | None = typer.Option(
+        None,
+        "--allowed-tools",
+        help="Comma-separated list of allowed tools",
+    ),
+    max_concurrent: int = typer.Option(
+        30,
+        "--max-concurrent",
+        help="Concurrency level for full-dataset mode",
+    ),
+    max_steps: int = typer.Option(
+        30,
+        "--max-steps",
+        # The previous help text advertised "default: 10 for single, 50 for full",
+        # which contradicted the actual default declared here (30).
+        help="Maximum steps per task",
+    ),
+) -> None:
+    """🚀 Run evaluation on datasets or individual tasks with agents."""
+    # Imported inside the function, as the other commands in this module do
+    from hud.utils.design import HUDDesign
+    design = HUDDesign()
+    # Validate agent choice before importing the (heavy) evaluation module
+    valid_agents = ("claude", "openai")
+    if agent not in valid_agents:
+        design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
+        raise typer.Exit(1)
+    # Import eval_command lazily to avoid importing agent dependencies
+    try:
+        from .eval import eval_command
+    except ImportError as e:
+        design.error(
+            "Evaluation dependencies are not installed. "
+            "Please install with: pip install 'hud-python[agent]'"
+        )
+        # Chain the original error so the real cause stays visible
+        raise typer.Exit(1) from e
+    # Run the command
+    eval_command(
+        source=source,
+        full=full,
+        agent=agent,  # type: ignore
+        model=model,
+        allowed_tools=allowed_tools,
+        max_concurrent=max_concurrent,
+        max_steps=max_steps,
+    )
 def main() -> None:
     """Main entry point for the CLI."""
     # Show header for main help

hud/cli/analyze_metadata.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 from pathlib import Path
+from urllib.parse import quote
 import requests
 import yaml
@@ -12,6 +13,8 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
 from hud.settings import settings
 from hud.utils.design import HUDDesign
+from .registry import get_registry_dir, list_registry_entries, extract_digest_from_image, load_from_registry
 console = Console()
 design = HUDDesign()
@@ -24,7 +27,9 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
         if "/" in reference and ":" not in reference:
             reference = f"{reference}:latest"
-        registry_url = f"{settings.hud_telemetry_url.rstrip('/')}/registry/envs/{reference}"
+        # URL-encode the path segments to handle special characters in tags
+        url_safe_path = "/".join(quote(part, safe="") for part in reference.split("/"))
+        registry_url = f"{settings.hud_telemetry_url.rstrip('/')}/registry/envs/{url_safe_path}"
         headers = {}
         if settings.api_key:
@@ -50,38 +55,31 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
 def check_local_cache(reference: str) -> dict | None:
     """Check local cache for lock file."""
-    # Extract digest if present
-    if "@sha256:" in reference:
-        digest = reference.split("@sha256:")[-1][:12]
-    elif "/" in reference:
-        # Try to find by name pattern
-        cache_dir = Path.home() / ".hud" / "envs"
-        if cache_dir.exists():
-            # Look for any cached version of this image
-            for env_dir in cache_dir.iterdir():
-                if env_dir.is_dir():
-                    lock_file = env_dir / "hud.lock.yaml"
-                    if lock_file.exists():
-                        with open(lock_file) as f:
-                            lock_data = yaml.safe_load(f)
-                        # Check if this matches our reference
-                        if lock_data and "image" in lock_data:
-                            image = lock_data["image"]
-                            # Match by name (ignoring tag/digest)
-                            ref_base = reference.split("@")[0].split(":")[0]
-                            img_base = image.split("@")[0].split(":")[0]
-                            if ref_base in img_base or img_base in ref_base:
-                                return lock_data
-        return None
-    else:
-        digest = "latest"
-    # Check specific digest directory
-    lock_file = Path.home() / ".hud" / "envs" / digest / "hud.lock.yaml"
-    if lock_file.exists():
-        with open(lock_file) as f:
-            return yaml.safe_load(f)
+    # First try exact digest match
+    digest = extract_digest_from_image(reference)
+    lock_data = load_from_registry(digest)
+    if lock_data:
+        return lock_data
+    # If not found and reference has a name, search by name pattern
+    if "/" in reference:
+        # Look for any cached version of this image
+        ref_base = reference.split("@")[0].split(":")[0]
+        for digest, lock_file in list_registry_entries():
+            try:
+                with open(lock_file) as f:
+                    lock_data = yaml.safe_load(f)
+                # Check if this matches our reference
+                if lock_data and "image" in lock_data:
+                    image = lock_data["image"]
+                    # Match by name (ignoring tag/digest)
+                    img_base = image.split("@")[0].split(":")[0]
+                    if ref_base in img_base or img_base in ref_base:
+                        return lock_data
+            except Exception:
+                continue
     return None
@@ -147,15 +145,8 @@ async def analyze_from_metadata(reference: str, output_format: str, verbose: boo
                 source = "registry"
                 # Save to local cache for next time
-                if "@sha256:" in lock_data.get("image", ""):
-                    digest = lock_data["image"].split("@sha256:")[-1][:12]
-                else:
-                    digest = "latest"
-                cache_dir = Path.home() / ".hud" / "envs" / digest
-                cache_dir.mkdir(parents=True, exist_ok=True)
-                with open(cache_dir / "hud.lock.yaml", "w") as f:  # noqa: ASYNC230
-                    yaml.dump(lock_data, f, default_flow_style=False, sort_keys=False)
+                from .registry import save_to_registry
+                save_to_registry(lock_data, lock_data.get("image", ""), verbose=False)
             else:
                 progress.update(task, description="[red]✗ Not found[/red]")

hud/cli/build.py CHANGED Viewed

@@ -17,6 +17,8 @@ from hud.clients import MCPClient
 from hud.utils.design import HUDDesign
 from hud.version import __version__ as hud_version
+from .registry import save_to_registry
 def parse_version(version_str: str) -> tuple[int, int, int]:
     """Parse version string like '1.0.0' or '1.0' into tuple of integers."""
@@ -459,6 +461,11 @@ def build_environment(
     # Remove temp image after we're done
     subprocess.run(["docker", "rmi", temp_tag], capture_output=True)  # noqa: S603, S607
+    # Add to local registry
+    if image_id:
+        # Save to local registry using the helper
+        save_to_registry(lock_content, lock_content.get("image", tag), verbose)
     # Print summary
     design.section_title("Build Complete")

hud/cli/debug.py CHANGED Viewed

@@ -167,7 +167,14 @@ async def debug_mcp_stdio(command: list[str], logger: CaptureLogger, max_phase:
                         break
                 except Exception as e:
                     logger.error(f"Failed to parse MCP response: {e}")
-                    continue
+                    logger.error(f"Raw output that caused the error: {repr(line)}")
+                    logger.hint("This usually means non-JSON output is being sent to STDOUT")
+                    logger.hint("Common causes:")
+                    logger.hint("  - Print statements in your server code")
+                    logger.hint("  - Library warnings (use warnings.filterwarnings)")
+                    logger.hint("  - Import-time output from dependencies")
+                    phases_completed = 1  # Mark as failed
+                    break  # Stop trying to parse
         if response and "result" in response:
             logger.success("MCP server initialized successfully")

hud/cli/env_utils.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""Shared utilities for environment directory handling."""
+from __future__ import annotations
+import subprocess
+from pathlib import Path
+from typing import Any
+import toml
+from hud.utils.design import HUDDesign
+design = HUDDesign()
+def get_image_name(directory: str | Path, image_override: str | None = None) -> tuple[str, str]:
+    """Resolve the Docker image name for an environment directory, with source tracking.
+
+    Resolution order:
+      1. ``image_override`` when given                -> source "override"
+      2. ``[tool.hud].image`` from pyproject.toml     -> source "cache"
+      3. ``hud-<directory-name>:dev`` (auto-generated) -> source "auto"
+
+    Args:
+        directory: Environment directory to inspect.
+        image_override: Explicit image name; returned unchanged when truthy.
+
+    Returns:
+        Tuple of (image_name, source) where source is "override", "cache", or "auto"
+    """
+    import re
+
+    if image_override:
+        return image_override, "override"
+    # Check pyproject.toml
+    pyproject_path = Path(directory) / "pyproject.toml"
+    if pyproject_path.exists():
+        try:
+            # TOML documents are UTF-8 by specification
+            with open(pyproject_path, encoding="utf-8") as f:
+                config = toml.load(f)
+            cached_image = config.get("tool", {}).get("hud", {}).get("image")
+            if cached_image:
+                return cached_image, "cache"
+        except Exception:
+            pass  # Best effort: an unreadable/malformed file falls back to the auto name
+    # Auto-generate with :dev tag
+    dir_path = Path(directory).resolve()  # Absolute path, so "." yields a real name
+    # At the filesystem root the name is empty; try the parent before giving up
+    dir_name = dir_path.name or dir_path.parent.name
+    # Docker repository names must be lowercase and may only contain
+    # alphanumerics separated by ".", "_" or "-". Names that were already valid
+    # are left exactly as before (underscores still become hyphens).
+    clean_name = re.sub(r"[^a-z0-9.-]", "-", dir_name.lower().replace("_", "-")).strip("-.")
+    if not clean_name:
+        clean_name = "env"  # Nothing usable in the directory name
+    return f"hud-{clean_name}:dev", "auto"
+def update_pyproject_toml(directory: str | Path, image_name: str, silent: bool = False) -> None:
+    """Record ``image_name`` under ``[tool.hud].image`` in the directory's pyproject.toml.
+
+    Does nothing when the file does not exist. Any failure while reading or
+    writing is reported as a warning (unless ``silent``) and never raised.
+    """
+    toml_file = Path(directory) / "pyproject.toml"
+    if not toml_file.exists():
+        return
+    try:
+        with open(toml_file) as src:
+            data = toml.load(src)
+        # Create the [tool] / [tool.hud] tables on demand, then store the image name
+        data.setdefault("tool", {}).setdefault("hud", {})["image"] = image_name
+        with open(toml_file, "w") as dst:
+            toml.dump(data, dst)
+        if not silent:
+            design.success(f"Updated pyproject.toml with image: {image_name}")
+    except Exception as err:
+        if not silent:
+            design.warning(f"Could not update pyproject.toml: {err}")
+def build_environment(directory: str | Path, image_name: str, no_cache: bool = False) -> bool:
+    """Build Docker image for an environment.
+
+    Runs ``docker build -t <image_name> [--no-cache] <directory>`` with Docker's
+    own output going straight to the terminal. On success the image name is
+    also written to the directory's pyproject.toml.
+
+    Args:
+        directory: Build context passed to ``docker build``.
+        image_name: Tag for the resulting image.
+        no_cache: Add ``--no-cache`` to the build command.
+
+    Returns:
+        True if build succeeded, False otherwise (including when the docker
+        executable cannot be started).
+    """
+    build_cmd = ["docker", "build", "-t", image_name]
+    if no_cache:
+        build_cmd.append("--no-cache")
+    build_cmd.append(str(directory))
+    design.info(f"🔨 Building image: {image_name}{' (no cache)' if no_cache else ''}")
+    design.info("")  # Empty line before Docker output
+    # Just run Docker build directly - it has its own nice live display
+    try:
+        result = subprocess.run(build_cmd)  # noqa: S603
+    except OSError as e:
+        # docker missing or not executable: honour the bool contract instead of crashing
+        design.error(f"Build failed! Could not run docker: {e}")
+        return False
+    if result.returncode != 0:
+        design.error("Build failed!")
+        return False
+    design.info("")  # Empty line after Docker output
+    design.success(f"Build successful! Image: {image_name}")
+    # Update pyproject.toml (silently since we already showed success)
+    update_pyproject_toml(directory, image_name, silent=True)
+    return True
+def image_exists(image_name: str) -> bool:
+    """Check if a Docker image exists locally.
+
+    Runs ``docker image inspect <image_name>`` with all output discarded.
+
+    Returns:
+        True when the command exits with status 0; False otherwise, including
+        when the docker executable cannot be started.
+    """
+    try:
+        result = subprocess.run(  # noqa: S603
+            ["docker", "image", "inspect", image_name],  # noqa: S607
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+    except OSError:
+        # docker is not installed / not executable, so the image cannot exist locally
+        return False
+    return result.returncode == 0
+def is_environment_directory(path: str | Path) -> bool:
+    """Tell whether ``path`` is a directory laid out like an environment.
+
+    The path qualifies when it is a directory containing both a ``Dockerfile``
+    and a ``pyproject.toml``. A ``src`` directory is optional and not checked.
+    """
+    candidate = Path(path)
+    required_files = ("Dockerfile", "pyproject.toml")
+    return candidate.is_dir() and all((candidate / name).exists() for name in required_files)

hud-python 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.8__py3-none-any.whl → 0.4.10__py3-none-any.whl