PyPI - hud-python - Versions diffs - 0.4.47__py3-none-any.whl → 0.4.49__py3-none-any.whl - Mend

hud-python 0.4.47__py3-none-any.whl → 0.4.49__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (45)

hud/agents/base.py +55 -142
hud/agents/claude.py +5 -6
hud/agents/grounded_openai.py +1 -1
hud/agents/misc/integration_test_agent.py +2 -0
hud/agents/tests/test_base.py +2 -5
hud/cli/__init__.py +80 -215
hud/cli/build.py +105 -45
hud/cli/dev.py +614 -743
hud/cli/eval.py +14 -9
hud/cli/flows/tasks.py +100 -21
hud/cli/init.py +18 -14
hud/cli/push.py +27 -9
hud/cli/rl/local_runner.py +28 -16
hud/cli/rl/vllm.py +2 -0
hud/cli/tests/test_analyze_metadata.py +3 -2
hud/cli/tests/test_eval.py +574 -0
hud/cli/tests/test_mcp_server.py +6 -95
hud/cli/tests/test_utils.py +1 -1
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/source_hash.py +1 -1
hud/datasets/parallel.py +0 -12
hud/datasets/runner.py +1 -4
hud/rl/actor.py +4 -2
hud/rl/distributed.py +1 -1
hud/rl/learner.py +2 -1
hud/rl/train.py +1 -1
hud/server/__init__.py +2 -1
hud/server/router.py +160 -0
hud/server/server.py +246 -79
hud/telemetry/trace.py +1 -1
hud/tools/base.py +20 -10
hud/tools/computer/__init__.py +2 -0
hud/tools/computer/qwen.py +431 -0
hud/tools/computer/settings.py +16 -0
hud/tools/executors/pyautogui.py +1 -1
hud/tools/playwright.py +1 -1
hud/types.py +2 -3
hud/utils/hud_console.py +43 -0
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/METADATA +1 -1
{hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/RECORD +45 -42
{hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/WHEEL +0 -0
{hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/licenses/LICENSE +0 -0

hud/cli/eval.py CHANGED Viewed

@@ -199,6 +199,8 @@ async def run_single_task(
 ) -> None:
     """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
+    # Provide early feedback to user
+    hud_console.info("🔧 Initializing evaluation...")
     # Import Task and run_dataset lazily
     try:
         from hud.utils.tasks import load_tasks
@@ -318,7 +320,10 @@ async def run_single_task(
         )
         display_group_statistics(stats, show_details=True)
     else:
-        # Original single-run logic
+        # Enable agent step logging for single task mode
+        logging.getLogger("hud.agents").setLevel(logging.INFO)
+        logging.getLogger("hud.agents.base").setLevel(logging.INFO)
         with hud.trace(name=task_prompt):
             agent = build_agent(
                 agent_type,
@@ -352,6 +357,9 @@ async def run_full_dataset(
     Uses either asyncio-based run_dataset or process-based parallel execution
     depending on the parallel flag."""
+    # Provide early feedback to user
+    hud_console.info("🔧 Initializing evaluation...")
     # Import run_dataset lazily
     try:
         from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
@@ -367,7 +375,7 @@ async def run_full_dataset(
     hud_console.info(f"📊 Loading tasks from: {source}…")
     tasks: list[Task] = load_tasks(source)  # type: ignore[assignment]
-    if not tasks:
+    if len(tasks) == 0:
         hud_console.error(f"No tasks found in: {source}")
         raise typer.Exit(1)
@@ -646,10 +654,10 @@ def eval_command(
         hud eval hud-evals/SheetBench-50 --full --agent claude
         # Run large dataset with PARALLEL execution (auto-optimized)
-        hud eval hud-evals/OSWorld-Verified-XLang --full --parallel
+        hud eval hud-evals/OSWorld-Verified-Gold --full --parallel
         # Parallel mode with manual configuration (16 workers, 25 tasks each)
-        hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16
+        hud eval hud-evals/OSWorld-Verified-Gold --full --parallel --max-workers 16
         # Limit total concurrent tasks to prevent rate limits
         hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20
@@ -674,6 +682,8 @@ def eval_command(
     """
     from hud.settings import settings
+    # Always configure basic logging so agent steps can be logged
+    # Set to INFO by default for consistency with run_evaluation.py
     if very_verbose:
         logging.basicConfig(
             level=logging.DEBUG,
@@ -683,11 +693,6 @@ def eval_command(
         logging.getLogger("hud.agents").setLevel(logging.DEBUG)
         logging.getLogger("hud.agents.base").setLevel(logging.DEBUG)
     elif verbose:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s - %(name)s - %(message)s",
-            datefmt="%H:%M:%S",
-        )
         logging.getLogger("hud.agents").setLevel(logging.INFO)
         logging.getLogger("hud.agents.base").setLevel(logging.INFO)

hud/cli/flows/tasks.py CHANGED Viewed

@@ -78,26 +78,38 @@ def _ensure_pushed(env_dir: Path, lock_data: dict[str, Any]) -> dict[str, Any]:
 def _derive_remote_image(lock_data: dict[str, Any]) -> str:
-    """Derive org/name:tag from lock file for MCP header.
+    """Derive org/name:tag from lock file for remote MCP header.
-    Preference order:
-    1) lock_data["push"]["image_with_tag"] if present
-    2) Derive from lock_data["image"] (may be a digest; falls back to latest)
+    Preference order (new lock first, then legacy):
+    1) lock_data["push"]["image_with_tag"] (exact org/name:tag that was pushed)
+    2) lock_data["images"]["local"] (base name with internal version)
+    3) lock_data["image"] (legacy field; may contain tag or digest)
     """
-    push_info = lock_data.get("push", {}) if isinstance(lock_data, dict) else {}
+    if not isinstance(lock_data, dict):  # Defensive
+        raise typer.Exit(1)
-    # 1) Exact image_with_tag if present
-    pushed_with_tag = str(push_info.get("image_with_tag", "")).strip()
+    # 1) Prefer the exact image that was pushed (org/name:tag)
+    push_info = lock_data.get("push") or {}
+    pushed_with_tag = str(push_info.get("image_with_tag") or "").strip()
     if pushed_with_tag:
         name, tag = extract_name_and_tag(pushed_with_tag)
         return f"{name}:{tag}"
-    # Base name always comes from lock_data.image to preserve org/repo
-    image_ref = str(lock_data.get("image", "")).strip()
-    if not image_ref:
-        raise typer.Exit(1)
-    name, tag = extract_name_and_tag(image_ref)
-    return f"{name}:{tag}"
+    # 2) Fall back to the local tag recorded in the new lock schema
+    images = lock_data.get("images") or {}
+    local_image = str(images.get("local") or "").strip()
+    if local_image:
+        name, tag = extract_name_and_tag(local_image)
+        return f"{name}:{tag}"
+    # 3) Legacy top-level image field
+    legacy_image = str(lock_data.get("image") or "").strip()
+    if legacy_image:
+        name, tag = extract_name_and_tag(legacy_image)
+        return f"{name}:{tag}"
+    # If none of the above exist, we cannot derive an image
+    raise typer.Exit(1)
 def _extract_existing_images(tasks: list[Task]) -> set[str]:
@@ -183,6 +195,63 @@ def _extract_dotenv_api_key_vars(env_dir: Path) -> set[str]:
     return detected
+def _extract_env_vars_from_docker_args(args: list[str]) -> set[str]:
+    """Extract environment variable names from docker run arguments.
+    Parses args like: ["run", "--rm", "-i", "-e", "API_KEY=value", "-e", "TOKEN", "image:tag"]
+    Returns set of env var names (not values).
+    """
+    env_vars: set[str] = set()
+    i = 0
+    while i < len(args):
+        arg = args[i]
+        # Check for -e or --env flags
+        if arg in ("-e", "--env"):
+            if i + 1 < len(args):
+                env_spec = args[i + 1]
+                # Could be "KEY=value" or just "KEY"
+                var_name = env_spec.split("=", 1)[0].strip()
+                if var_name:
+                    env_vars.add(var_name)
+                i += 2
+                continue
+        # Check for --env=KEY=value format
+        elif arg.startswith("--env="):
+            env_spec = arg[6:]  # Remove "--env=" prefix
+            var_name = env_spec.split("=", 1)[0].strip()
+            if var_name:
+                env_vars.add(var_name)
+        i += 1
+    env_vars.discard("HUD_API_KEY")
+    return env_vars
+def _extract_vars_from_task_configs(raw_tasks: list[dict[str, Any]]) -> set[str]:
+    """Extract environment variable names from docker run commands in task mcp_configs."""
+    all_env_vars: set[str] = set()
+    for task in raw_tasks:
+        mcp_config = task.get("mcp_config", {})
+        # Iterate through all server configs
+        for server_config in mcp_config.values():
+            if not isinstance(server_config, dict):
+                continue
+            command = server_config.get("command", "")
+            args = server_config.get("args", [])
+            # Only process docker run commands
+            if command == "docker" and "run" in args:
+                env_vars = _extract_env_vars_from_docker_args(args)
+                all_env_vars.update(env_vars)
+    return all_env_vars
 def convert_tasks_to_remote(tasks_file: str) -> str:
     """Convert a local tasks file to remote MCP tasks and return new filename.
@@ -297,12 +366,21 @@ def convert_tasks_to_remote(tasks_file: str) -> str:
         hud_console.success(f"Updated {tasks_path.name} with latest image: {remote_image}")
         return str(tasks_path)
-    # Extract additional API key headers from lock and suggest from .env
+    # Extract environment variables from multiple sources:
+    # 1. Lock file (authoritative for required env vars)
     provided_keys = _extract_api_key_vars(lock_data)
+    # 2. Task configs (docker run -e flags)
+    task_env_vars = _extract_vars_from_task_configs(raw_tasks)
+    # 3. .env file (detect API-like vars)
     dotenv_keys = _extract_dotenv_api_key_vars(env_dir)
-    # If .env contains API-like vars not in lock, offer to include them
-    missing = sorted(dotenv_keys - provided_keys)
+    # Combine: lock file vars + task config vars, then check for missing from .env
+    all_detected = provided_keys | task_env_vars
+    # If .env contains API-like vars not yet included, offer to add them
+    missing = sorted(dotenv_keys - all_detected)
     if missing:
         names_preview = ", ".join(missing)
         prompt = (
@@ -310,7 +388,10 @@ def convert_tasks_to_remote(tasks_file: str) -> str:
             "Include them as remote headers (values will be ${VAR} placeholders)?"
         )
         if hud_console.confirm(prompt, default=True):
-            provided_keys.update(missing)
+            all_detected.update(missing)
+    # Final set of env vars to convert to headers
+    provided_keys = all_detected
     extra_api_key_headers: dict[str, str] = {}
     for var_name in provided_keys:
@@ -364,10 +445,8 @@ def convert_tasks_to_remote(tasks_file: str) -> str:
             item["setup_tool"] = _simplify_tool_call(t.setup_tool)
         if t.evaluate_tool is not None:
             item["evaluate_tool"] = _simplify_tool_call(t.evaluate_tool)
-        if t.agent_tools is not None:
-            item["agent_tools"] = t.agent_tools
-        if t.system_prompt is not None:
-            item["system_prompt"] = t.system_prompt
+        if t.agent_config is not None:
+            item["agent_config"] = t.agent_config
         if t.metadata:
             item["metadata"] = t.metadata
         if t.id is not None:

hud/cli/init.py CHANGED Viewed

@@ -29,9 +29,12 @@ SKIP_DIR_NAMES = {"node_modules", "__pycache__", "dist", "build", ".next", ".git
 # Files that need placeholder replacement
 PLACEHOLDER_FILES = {
-    "pyproject.toml",
+    "server/pyproject.toml",
+    "environment/pyproject.toml",
+    "server/main.py",
+    "server/README.md",
+    "environment/README.md",
     "tasks.json",
-    "src/controller/server.py",
     "test_env.ipynb",
     "README.md",
 }
@@ -48,7 +51,7 @@ def _replace_placeholders(target_dir: Path, env_name: str) -> list[str]:
         List of files that were modified
     """
     modified_files = []
-    placeholder = "test_test"
+    placeholder = "blank"  # Placeholder used in blank environment template
     # Normalize environment name for use in code/configs
     # Replace spaces and special chars with underscores for Python identifiers
@@ -240,17 +243,18 @@ def create_environment(
         f"Downloaded {len(files_created_dl)} files in {duration_ms} ms into {target_dir}"
     )
-    # Replace placeholders in template files
-    hud_console.section_title("Customizing template files")
-    modified_files = _replace_placeholders(target_dir, name)
-    if modified_files:
-        hud_console.success(f"Replaced placeholders in {len(modified_files)} files:")
-        for file in modified_files[:5]:  # Show first 5 files
-            hud_console.status_item(file, "updated")
-        if len(modified_files) > 5:
-            hud_console.info(f"... and {len(modified_files) - 5} more files")
-    else:
-        hud_console.info("No placeholder replacements needed")
+    # Replace placeholders in template files (only for blank preset)
+    if preset_normalized == "blank":
+        hud_console.section_title("Customizing template files")
+        modified_files = _replace_placeholders(target_dir, name)
+        if modified_files:
+            hud_console.success(f"Replaced placeholders in {len(modified_files)} files:")
+            for file in modified_files[:5]:  # Show first 5 files
+                hud_console.status_item(file, "updated")
+            if len(modified_files) > 5:
+                hud_console.info(f"... and {len(modified_files) - 5} more files")
+        else:
+            hud_console.info("No placeholder replacements needed")
     hud_console.section_title("Top-level files and folders")
     for entry in sorted(os.listdir(target_dir)):

hud/cli/push.py CHANGED Viewed

@@ -163,10 +163,7 @@ def push_environment(
         lock_data = yaml.safe_load(f)
     # Handle both old and new lock file formats
-    local_image = lock_data.get("image", "")
-    if not local_image and "build" in lock_data:
-        # New format might have image elsewhere
-        local_image = lock_data.get("image", "")
+    local_image = lock_data.get("images", {}).get("local") or lock_data.get("image", "")
     # Get internal version from lock file
     internal_version = lock_data.get("build", {}).get("version", None)
@@ -293,7 +290,7 @@ def push_environment(
     # Push the image
     hud_console.progress_message(f"Pushing {image} to registry...")
-    # Show push output
+    # Show push output (filtered for cleaner display)
     process = subprocess.Popen(  # noqa: S603
         ["docker", "push", image],  # noqa: S607
         stdout=subprocess.PIPE,
@@ -303,8 +300,27 @@ def push_environment(
         errors="replace",
     )
+    # Filter output to only show meaningful progress
+    layers_pushed = 0
     for line in process.stdout or []:
-        hud_console.info(line.rstrip())
+        line = line.rstrip()
+        # Only show: digest, pushed, mounted, or error lines
+        if any(
+            keyword in line.lower()
+            for keyword in ["digest:", "pushed", "mounted", "error", "denied"]
+        ):
+            if "pushed" in line.lower():
+                layers_pushed += 1
+            if (
+                verbose
+                or "error" in line.lower()
+                or "denied" in line.lower()
+                or "digest:" in line.lower()
+            ):
+                hud_console.info(line)
+    if layers_pushed > 0 and not verbose:
+        hud_console.info(f"Pushed {layers_pushed} layer(s)")
     process.wait()
@@ -331,8 +347,10 @@ def push_environment(
     hud_console.section_title("Pushed Image")
     hud_console.status_item("Registry", pushed_digest, primary=True)
-    # Update the lock file with registry information
-    lock_data["image"] = pushed_digest
+    # Update the lock file with pushed image reference
+    if "images" not in lock_data:
+        lock_data["images"] = {}
+    lock_data["images"]["pushed"] = image
     # Add push information
     from datetime import UTC, datetime
@@ -348,7 +366,7 @@ def push_environment(
     with open(lock_path, "w") as f:
         yaml.dump(lock_data, f, default_flow_style=False, sort_keys=False)
-    hud_console.success("Updated lock file with registry image")
+    hud_console.success("Updated lock file with pushed image reference")
     # Upload lock file to HUD registry
     try:

hud/cli/rl/local_runner.py CHANGED Viewed

@@ -190,9 +190,9 @@ def run_local_training(
     invalid_tasks: list[str] = []
     for i, task in enumerate(tasks):
-        if not hasattr(task, "prompt") or not task.prompt:
+        if not hasattr(task, "prompt") or not task.prompt:  # type: ignore
             invalid_tasks.append(f"Task {i}: missing 'prompt' field")
-        if not hasattr(task, "mcp_config") or not task.mcp_config:
+        if not hasattr(task, "mcp_config") or not task.mcp_config:  # type: ignore
             invalid_tasks.append(f"Task {i}: missing 'mcp_config' field")
     if invalid_tasks:
@@ -230,19 +230,33 @@ def run_local_training(
                 console.print("Enter the model name (HuggingFace ID):")
                 model = input().strip()
-    # Validate model is a VL model (whether provided via CLI or selected)
-    if model:
+    # try to get model from config file
+    if config_file:
+        console.print(f"\n[cyan]Loading configuration from: {config_file}[/cyan]")
+        config = load_config(config_file)
+        if hasattr(config, "model") and hasattr(config.model, "base_model"):
+            if model is None:
+                model = config.model.base_model
+            else:
+                console.print(
+                    f"[yellow]Model already set to {model}, using that instead "
+                    f"of {config.model.base_model}[/yellow] (override)"
+                )
+    if model is None:
+        console.print("[red]❌ No model specified either through CLI or config file[/red]")
         try:
-            validate_vl_model(model)
-        except ValueError as e:
-            console.print(f"\n[red]❌ {e}[/red]")
-            try:
-                import typer
+            import typer
-                raise typer.Exit(1)
-            except Exception:
-                return
-    else:
+            raise typer.Exit(1)
+        except Exception:
+            return
+    # Validate model is a VL model (whether provided via CLI or selected)
+    try:
+        validate_vl_model(model)
+    except ValueError as e:
+        console.print(f"\n[red]❌ {e}[/red]")
         try:
             import typer
@@ -488,7 +502,6 @@ def run_local_training(
         from .vllm import start_vllm_server, wait_for_vllm_server
         start_vllm_server(config.model.base_model, vllm_gpu_idx, restart=restart)
         server_ready = asyncio.run(wait_for_vllm_server())
         if not server_ready:
             console.print("[red]❌ Failed to start vLLM server[/red]")
@@ -507,7 +520,6 @@ def run_local_training(
             f"\n[bold green]🎯 Starting DDP training on {len(training_gpus)} GPUs...[/bold green]\n"
         )
         launch_ddp_training(training_gpus, tasks_file, temp_config_path, verbose)
-        console.print("\n[green]✅ Training completed successfully![/green]")
     else:
         console.print("\n[bold green]🎯 Starting single-GPU training...[/bold green]\n")
         try:
@@ -518,7 +530,7 @@ def run_local_training(
             # Import and run the async training function lazily
             from hud.rl.train import train  # heavy import
-            asyncio.run(train(config, tasks))
+            asyncio.run(train(config, tasks))  # type: ignore
             console.print("\n[green]✅ Training completed successfully![/green]")
             try:

hud/cli/rl/vllm.py CHANGED Viewed

@@ -165,6 +165,8 @@ async def wait_for_vllm_server(timeout: int = 360) -> bool:  # noqa: ASYNC109
                 if response.status_code == 200:
                     console.print("[green]✅ vLLM server is ready![/green]")
                     return True
+            except httpx.ConnectError:
+                pass
             except Exception as e:
                 hud_console.error(f"Failed to connect to vLLM server: {e}")

hud/cli/tests/test_analyze_metadata.py CHANGED Viewed

@@ -214,6 +214,7 @@ class TestAnalyzeFromMetadata:
     @mock.patch("hud.cli.utils.metadata.check_local_cache")
     @mock.patch("hud.cli.utils.metadata.fetch_lock_from_registry")
+    @mock.patch("hud.cli.utils.metadata.hud_console")
     @mock.patch("hud.cli.utils.metadata.console")
     async def test_analyze_not_found(self, mock_console, mock_hud_console, mock_fetch, mock_check):
         """Test when environment not found anywhere."""
@@ -222,9 +223,9 @@ class TestAnalyzeFromMetadata:
         await analyze_from_metadata("test/notfound:latest", "json", verbose=False)
-        # Should show error
+        # Should show error via hud_console
         mock_hud_console.error.assert_called_with("Environment metadata not found")
-        # Should print suggestions
+        # Should print suggestions via console
         mock_console.print.assert_called()
     @mock.patch("hud.cli.utils.metadata.check_local_cache")

hud-python 0.4.47__py3-none-any.whl → 0.4.49__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.47__py3-none-any.whl → 0.4.49__py3-none-any.whl