hud-python 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/agents/base.py +118 -33
- hud/agents/claude.py +1 -1
- hud/agents/openai.py +5 -16
- hud/agents/tests/test_openai.py +24 -79
- hud/cli/__init__.py +137 -15
- hud/cli/analyze.py +2 -4
- hud/cli/build.py +6 -2
- hud/cli/dev.py +67 -0
- hud/cli/eval.py +90 -35
- hud/cli/hf.py +406 -0
- hud/cli/init.py +49 -30
- hud/cli/tests/test_mcp_server.py +1 -4
- hud/clients/base.py +2 -0
- hud/clients/fastmcp.py +7 -2
- hud/clients/mcp_use.py +3 -1
- hud/clients/utils/retry_transport.py +34 -8
- hud/datasets/__init__.py +32 -0
- hud/datasets/execution/__init__.py +13 -0
- hud/datasets/execution/parallel.py +592 -0
- hud/datasets/execution/runner.py +123 -0
- hud/datasets/task.py +107 -0
- hud/datasets/utils.py +118 -0
- hud/otel/instrumentation.py +6 -1
- hud/server/server.py +58 -21
- hud/settings.py +12 -0
- hud/types.py +31 -10
- hud/utils/design.py +168 -2
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/METADATA +4 -3
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/RECORD +34 -28
- hud/datasets.py +0 -327
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/WHEEL +0 -0
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/licenses/LICENSE +0 -0
hud/cli/__init__.py
CHANGED
@@ -22,10 +22,14 @@ from .build import build_command
 from .clone import clone_repository, get_clone_message, print_error, print_tutorial
 from .debug import debug_mcp_stdio
 from .dev import run_mcp_dev_server
+
+# Import new commands
+from .hf import hf_command
 from .init import create_environment
 from .pull import pull_command
 from .push import push_command
 from .remove import remove_command
+from .rl import rl_app
 from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
 from .utils.logging import CaptureLogger
 
@@ -760,19 +764,24 @@ def quickstart() -> None:
 
 @app.command()
 def eval(
-    source: str = typer.Argument(
-
-        help=
+    source: str | None = typer.Argument(
+        None,
+        help=(
+            "HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file. "
+            "If not provided, looks for task.json in current directory."
+        ),
     ),
     full: bool = typer.Option(
         False,
         "--full",
         help="Run the entire dataset (omit for single-task debug mode)",
     ),
-    agent: str = typer.Option(
-
+    agent: str | None = typer.Option(
+        None,
         "--agent",
-        help=
+        help=(
+            "Agent backend to use (claude or openai). If not provided, will prompt interactively."
+        ),
     ),
     model: str | None = typer.Option(
         None,
@@ -785,23 +794,99 @@ def eval(
         help="Comma-separated list of allowed tools",
     ),
     max_concurrent: int = typer.Option(
-
+        50,
         "--max-concurrent",
-        help="
+        help="Max concurrent tasks (prevents rate limits in both asyncio and parallel modes)",
     ),
     max_steps: int = typer.Option(
         30,
         "--max-steps",
         help="Maximum steps per task (default: 10 for single, 50 for full)",
     ),
+    parallel: bool = typer.Option(
+        False,
+        "--parallel",
+        help="Use process-based parallel execution for large datasets (100+ tasks)",
+    ),
+    max_workers: int | None = typer.Option(
+        None,
+        "--max-workers",
+        help="Number of worker processes for parallel mode (auto-optimized if not set)",
+    ),
+    max_concurrent_per_worker: int = typer.Option(
+        20,
+        "--max-concurrent-per-worker",
+        help="Maximum concurrent tasks per worker in parallel mode",
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents."""
+    from hud.utils.design import HUDDesign
+
+    design = HUDDesign()
+
+    # If no source provided, look for task/eval JSON files in current directory
+    if source is None:
+        # Search for JSON files with "task" or "eval" in the name (case-insensitive)
+        json_files = []
+        patterns = [
+            "*task*.json",
+            "*eval*.json",
+            "*Task*.json",
+            "*Eval*.json",
+            "*TASK*.json",
+            "*EVAL*.json",
+        ]
+
+        # First check current directory
+        for pattern in patterns:
+            json_files.extend(Path(".").glob(pattern))
+
+        # If no files found, search recursively (but limit depth to avoid deep searches)
+        if not json_files:
+            for pattern in patterns:
+                # Search up to 2 levels deep
+                json_files.extend(Path(".").glob(f"*/{pattern}"))
+                json_files.extend(Path(".").glob(f"*/*/{pattern}"))
+
+        # Remove duplicates and sort
+        json_files = sorted(set(json_files))
+
+        if not json_files:
+            design.error(
+                "No source provided and no task/eval JSON files found in current directory"
+            )
+            design.info(
+                "Usage: hud eval <source> or create a task JSON file "
+                "(e.g., task.json, eval_config.json)"
+            )
+            raise typer.Exit(1)
+        elif len(json_files) == 1:
+            source = str(json_files[0])
+            design.info(f"Found task file: {source}")
+        else:
+            # Multiple files found, let user choose
+            design.info("Multiple task files found:")
+            file_choice = design.select(
+                "Select a task file to run:",
+                choices=[str(f) for f in json_files],
+            )
+            source = file_choice
+            design.success(f"Selected: {source}")
+
+    # If no agent specified, prompt for selection
+    if agent is None:
+        agent = design.select(
+            "Select an agent to use:",
+            choices=[
+                {"name": "Claude 4 Sonnet", "value": "claude"},
+                {"name": "OpenAI Computer Use", "value": "openai"},
+            ],
+            default="Claude 4 Sonnet",
+        )
+
     # Validate agent choice
     valid_agents = ["claude", "openai"]
     if agent not in valid_agents:
-        from hud.utils.design import HUDDesign
-
-        design = HUDDesign()
         design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
         raise typer.Exit(1)
 
@@ -809,9 +894,6 @@ def eval(
     try:
         from .eval import eval_command
     except ImportError as e:
-        from hud.utils.design import HUDDesign
-
-        design = HUDDesign()
         design.error(
             "Evaluation dependencies are not installed. "
             "Please install with: pip install 'hud-python[agent]'"
@@ -827,9 +909,45 @@ def eval(
         allowed_tools=allowed_tools,
         max_concurrent=max_concurrent,
         max_steps=max_steps,
+        parallel=parallel,
+        max_workers=max_workers,
+        max_concurrent_per_worker=max_concurrent_per_worker,
     )
 
 
+# Add the RL subcommand group
+app.add_typer(rl_app, name="rl")
+
+
+@app.command()
+def hf(
+    tasks_file: Path | None = typer.Argument(  # noqa: B008
+        None, help="JSON file containing tasks (auto-detected if not provided)"
+    ),
+    name: str | None = typer.Option(
+        None, "--name", "-n", help="Dataset name (e.g., 'my-org/my-dataset')"
+    ),
+    push: bool = typer.Option(True, "--push/--no-push", help="Push to HuggingFace Hub"),
+    private: bool = typer.Option(False, "--private", help="Make dataset private on Hub"),
+    update_lock: bool = typer.Option(
+        True, "--update-lock/--no-update-lock", help="Update hud.lock.yaml"
+    ),
+    token: str | None = typer.Option(None, "--token", help="HuggingFace API token"),
+) -> None:
+    """📊 Convert tasks to HuggingFace dataset format.
+
+    Automatically detects task files if not specified.
+    Suggests dataset name based on environment if not provided.
+
+    Examples:
+        hud hf                         # Auto-detect tasks and suggest name
+        hud hf tasks.json              # Use specific file, suggest name
+        hud hf --name my-org/my-tasks  # Auto-detect tasks, use name
+        hud hf tasks.json --name hud-evals/web-tasks --private
+    """
+    hf_command(tasks_file, name, push, private, update_lock, token)
+
+
 def main() -> None:
     """Main entry point for the CLI."""
     # Show header for main help
@@ -846,7 +964,11 @@ def main() -> None:
     console.print(" 3. Build for production: [cyan]hud build[/cyan]")
     console.print(" 4. Share your environment: [cyan]hud push[/cyan]")
     console.print(" 5. Get shared environments: [cyan]hud pull <org/name:tag>[/cyan]")
-    console.print(" 6. Run and test: [cyan]hud run <image>[/cyan]
+    console.print(" 6. Run and test: [cyan]hud run <image>[/cyan]")
+    console.print("\n[yellow]RL Training:[/yellow]")
+    console.print(" 1. Generate config: [cyan]hud rl init my-env:latest[/cyan]")
+    console.print(" 2. Create dataset: [cyan]hud hf tasks.json --name my-org/my-tasks[/cyan]")
+    console.print(" 3. Start training: [cyan]hud rl --model Qwen/Qwen2.5-3B[/cyan]\n")
 
     app()
 
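The zero-argument flow added above approximates a case-insensitive match by globbing six case-variant patterns. A minimal standalone sketch of the same discovery logic, using only the standard library (the real command additionally prompts through HUDDesign when several files match):

from pathlib import Path

def find_task_files(root: str = ".") -> list[Path]:
    """Find JSON files whose names mention 'task' or 'eval', up to two levels deep."""
    matches: set[Path] = set()
    for depth_glob in ("*.json", "*/*.json", "*/*/*.json"):
        for path in Path(root).glob(depth_glob):
            name = path.name.lower()
            if "task" in name or "eval" in name:
                matches.add(path)
    return sorted(matches)

print(find_task_files())

Unlike the command, this sketch scans all depths in one pass rather than preferring top-level matches.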
hud/cli/analyze.py
CHANGED
@@ -3,7 +3,8 @@
 from __future__ import annotations
 
 import json
-from
+from pathlib import Path  # noqa: TC003
+from typing import Any
 
 from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn
@@ -14,9 +15,6 @@ from rich.tree import Tree
 from hud.clients import MCPClient
 from hud.utils.design import HUDDesign
 
-if TYPE_CHECKING:
-    from pathlib import Path
-
 console = Console()
 design = HUDDesign()
 
hud/cli/build.py
CHANGED
@@ -431,10 +431,14 @@ def build_environment(
     if optional_env:
         lock_content["environment"]["variables"]["optional"] = optional_env
 
-    # Add
+    # Add tools with full schemas for RL config generation
     if analysis["tools"]:
         lock_content["tools"] = [
-            {
+            {
+                "name": tool["name"],
+                "description": tool.get("description", ""),
+                "inputSchema": tool.get("inputSchema", {}),
+            }
            for tool in analysis["tools"]
        ]
 
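With this change, each lock-file tool entry carries its full input schema rather than a truncated stub, which the diff's comment ties to RL config generation. For illustration only, a serialized entry might look like the following; the tool name and schema here are invented, not taken from a real environment:

# Hypothetical shape of a tools entry in hud.lock.yaml after this change
lock_content = {
    "tools": [
        {
            "name": "click",
            "description": "Click at the given screen coordinates",
            "inputSchema": {
                "type": "object",
                "properties": {"x": {"type": "integer"}, "y": {"type": "integer"}},
                "required": ["x", "y"],
            },
        },
    ],
}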
hud/cli/dev.py
CHANGED
@@ -7,6 +7,7 @@ import base64
 import json
 import subprocess
 from pathlib import Path
+from typing import Any
 
 import click
 from fastmcp import FastMCP
@@ -155,6 +156,7 @@ async def start_mcp_proxy(
     import asyncio
     import logging
     import os
+    import signal
     import sys
 
     from .utils.logging import find_free_port
@@ -440,12 +442,30 @@ async def start_mcp_proxy(
             log_design.warning(f"Traceback: {traceback.format_exc()}")  # noqa: G004
             await asyncio.sleep(1)
 
+    # Import contextlib here so it's available in the finally block
+    import contextlib
+
     # CRITICAL: Create proxy AFTER all logging setup to prevent it from resetting logging config
     # This is important because FastMCP might initialize loggers during creation
     proxy = create_proxy_server(
         directory, image_name, no_reload, full_reload, verbose, docker_args or [], interactive
     )
 
+    # Set up signal handlers for graceful shutdown
+    shutdown_event = asyncio.Event()
+
+    def signal_handler(signum: int, frame: Any) -> None:
+        """Handle signals by setting shutdown event."""
+        design.info(f"\n📡 Received signal {signum}, shutting down gracefully...")
+        shutdown_event.set()
+
+    # Register signal handlers - SIGINT is available on all platforms
+    signal.signal(signal.SIGINT, signal_handler)
+
+    # SIGTERM is not available on Windows
+    if hasattr(signal, "SIGTERM"):
+        signal.signal(signal.SIGTERM, signal_handler)
+
     # One more attempt to suppress the FastMCP server log
     if not verbose:
         # Re-apply the filter in case new handlers were created
@@ -479,6 +499,47 @@ async def start_mcp_proxy(
         for handler in logger.handlers:
             handler.addFilter(block_filter)
 
+    # Track if container has been stopped to avoid duplicate stops
+    container_stopped = False
+
+    # Function to stop the container gracefully
+    async def stop_container() -> None:
+        """Stop the Docker container gracefully with SIGTERM, wait 30s, then SIGKILL if needed."""
+        nonlocal container_stopped
+        if container_stopped:
+            return  # Already stopped, don't do it again
+
+        try:
+            # Check if container exists
+            check_result = await asyncio.create_subprocess_exec(
+                "docker",
+                "ps",
+                "--format",
+                "{{.Names}}",
+                "--filter",
+                f"name={container_name}",
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.DEVNULL,
+            )
+            stdout, _ = await check_result.communicate()
+
+            if container_name in stdout.decode():
+                design.info("🛑 Stopping container gracefully...")
+                # Stop with 30 second timeout before SIGKILL
+                stop_result = await asyncio.create_subprocess_exec(
+                    "docker",
+                    "stop",
+                    "--time=30",
+                    container_name,
+                    stdout=asyncio.subprocess.DEVNULL,
+                    stderr=asyncio.subprocess.DEVNULL,
+                )
+                await stop_result.communicate()
+                design.success("✅ Container stopped successfully")
+                container_stopped = True
+        except Exception as e:
+            design.warning(f"Failed to stop container: {e}")
+
     try:
         # Start Docker logs streaming if enabled
         log_task = None
@@ -530,6 +591,9 @@ async def start_mcp_proxy(
     except KeyboardInterrupt:
         design.info("\n👋 Shutting down...")
 
+        # Stop the container before showing next steps
+        await stop_container()
+
         # Show next steps tutorial
         if not interactive:  # Only show if not in interactive mode
             design.section_title("Next Steps")
@@ -565,6 +629,9 @@ async def start_mcp_proxy(
     except asyncio.CancelledError:
         contextlib.suppress(asyncio.CancelledError)
 
+    # Always try to stop container on exit
+    await stop_container()
+
 
 def run_mcp_dev_server(
     directory: str = ".",
hud/cli/eval.py
CHANGED
@@ -76,8 +76,6 @@ async def run_single_task(
 ) -> None:
     """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
 
-    design.info("📊 Loading dataset…")
-
     # Import Task and run_dataset lazily
     try:
         from hud.datasets import Task, run_dataset
@@ -91,6 +89,7 @@ async def run_single_task(
     # Check if it's a JSON file
     path = Path(source)
     if path.exists() and path.suffix == ".json":
+        design.info("📊 Loading task file…")
         with open(path) as f:  # noqa: ASYNC230
             json_data = json.load(f)
 
@@ -111,8 +110,7 @@ async def run_single_task(
         )
         raise typer.Exit(1) from e
 
-    agent_config: dict[str, Any] = {
-    }
+    agent_config: dict[str, Any] = {}
     if allowed_tools:
         agent_config["allowed_tools"] = allowed_tools
 
@@ -161,6 +159,7 @@ async def run_single_task(
         raise typer.Exit(1)
     else:
         # Load from HuggingFace dataset
+        design.info(f"📊 Loading dataset from HuggingFace: {source}…")
         try:
             from datasets import load_dataset
         except ImportError as e:
@@ -195,14 +194,20 @@ async def run_full_dataset(
     agent_type: Literal["claude", "openai"] = "claude",
     model: str | None = None,
     allowed_tools: list[str] | None = None,
-    max_concurrent: int =
-    max_steps: int =
+    max_concurrent: int = 50,
+    max_steps: int = 10,
+    parallel: bool = False,
+    max_workers: int | None = None,
+    max_concurrent_per_worker: int = 25,
 ) -> list[Any]:
-    """Run evaluation across the entire dataset
+    """Run evaluation across the entire dataset.
+
+    Uses either asyncio-based run_dataset or process-based parallel execution
+    depending on the parallel flag."""
 
     # Import run_dataset lazily
     try:
-        from hud.datasets import run_dataset
+        from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
     except ImportError as e:
         design.error(
             "Dataset dependencies are not installed. "
@@ -240,8 +245,7 @@ async def run_full_dataset(
         )
         raise typer.Exit(1) from e
 
-    agent_config: dict[str, Any] = {
-    }
+    agent_config: dict[str, Any] = {}
     if allowed_tools:
         agent_config["allowed_tools"] = allowed_tools
 
@@ -263,16 +267,47 @@ async def run_full_dataset(
     if allowed_tools:
         agent_config["allowed_tools"] = allowed_tools
 
-
-
-
-
-
-
-
-
-
-
+    if parallel:
+        design.info(
+            f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…"  # noqa: E501
+        )
+        if max_workers is None:
+            # Use auto-optimization (now the default run_dataset_parallel)
+            return await run_dataset_parallel(
+                name=f"Evaluation {dataset_name}",
+                dataset=dataset_or_tasks,
+                agent_class=agent_class,
+                agent_config=agent_config,
+                max_concurrent=max_concurrent,
+                metadata={"dataset": source, "parallel": True},
+                max_steps=max_steps,
+                auto_respond=True,
+            )
+        else:
+            # Use manual configuration
+            return await run_dataset_parallel_manual(
+                name=f"Evaluation {dataset_name}",
+                dataset=dataset_or_tasks,
+                agent_class=agent_class,
+                agent_config=agent_config,
+                max_workers=max_workers,
+                max_concurrent_per_worker=max_concurrent_per_worker,
+                max_concurrent=max_concurrent,
+                metadata={"dataset": source, "parallel": True},
+                max_steps=max_steps,
+                auto_respond=True,
+            )
+    else:
+        design.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
+        return await run_dataset(
+            name=f"Evaluation {dataset_name}",
+            dataset=dataset_or_tasks,
+            agent_class=agent_class,
+            agent_config=agent_config,
+            max_concurrent=max_concurrent,
+            metadata={"dataset": source},
+            max_steps=max_steps,
+        )
 
 
 def eval_command(
@@ -303,13 +338,28 @@ def eval_command(
     max_concurrent: int = typer.Option(
         50,
         "--max-concurrent",
-        help="Concurrency level for
+        help="Concurrency level for asyncio mode (ignored in parallel mode)",
     ),
-    max_steps: int = typer.Option(
+    max_steps: int | None = typer.Option(
        None,
        "--max-steps",
        help="Maximum steps per task (default: 10 for single, 50 for full)",
    ),
+    parallel: bool = typer.Option(
+        False,
+        "--parallel",
+        help="Use process-based parallel execution for large datasets (100+ tasks)",
+    ),
+    max_workers: int | None = typer.Option(
+        None,
+        "--max-workers",
+        help="Number of worker processes for parallel mode (auto-optimized if not set)",
+    ),
+    max_concurrent_per_worker: int = typer.Option(
+        20,
+        "--max-concurrent-per-worker",
+        help="Maximum concurrent tasks per worker in parallel mode",
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
 
@@ -317,40 +367,42 @@ def eval_command(
     # Evaluate a single task from SheetBench
     hud eval hud-evals/SheetBench-50
 
-    # Evaluate the FULL SheetBench dataset with Claude
+    # Evaluate the FULL SheetBench dataset with Claude (asyncio mode)
     hud eval hud-evals/SheetBench-50 --full --agent claude
 
+    # Run large dataset with PARALLEL execution (auto-optimized)
+    hud eval hud-evals/OSWorld-Verified-XLang --full --parallel
+
+    # Parallel mode with manual configuration (16 workers, 25 tasks each)
+    hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16
+
+    # Limit total concurrent tasks to prevent rate limits
+    hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20
+
     # Run a single task from a JSON file
     hud eval task.json
 
-    # Run multiple tasks from a JSON file
-    hud eval tasks.json
-
-    # Run JSON list with full dataset mode and concurrency
-    hud eval tasks.json --full --max-concurrent 10
+    # Run multiple tasks from a JSON file with parallel execution
+    hud eval tasks.json --full --parallel
 
     # Run with OpenAI Operator agent
     hud eval hud-evals/OSWorld-Gold-Beta --agent openai
     """
-    import os
-
     from hud.settings import settings
 
     # Check for required API keys
     if agent == "claude":
-        if not settings.anthropic_api_key
+        if not settings.anthropic_api_key:
            design.error("ANTHROPIC_API_KEY is required for Claude agent")
            design.info("Set it in your environment or .env file: ANTHROPIC_API_KEY=your-key-here")
            raise typer.Exit(1)
-    elif agent == "openai" and
-        not settings.openai_api_key or not os.environ.get("OPENAI_API_KEY")
-    ):
+    elif agent == "openai" and not settings.openai_api_key:
        design.error("OPENAI_API_KEY is required for OpenAI agent")
        design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
        raise typer.Exit(1)
 
     # Check for HUD_API_KEY if using HUD services
-    if not settings.api_key
+    if not settings.api_key:
        design.warning("HUD_API_KEY not set. Some features may be limited.")
        design.info("Get your API key at: https://app.hud.so")
 
@@ -373,6 +425,9 @@ def eval_command(
             allowed_tools=allowed_tools_list,
             max_concurrent=max_concurrent,
             max_steps=max_steps,
+            parallel=parallel,
+            max_workers=max_workers,
+            max_concurrent_per_worker=max_concurrent_per_worker,
         )
     )
 else: