PyPI - hud-python - Versions diffs - 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl - Mend

hud-python 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (63)

hud/__main__.py +8 -0
hud/agents/base.py +7 -8
hud/agents/langchain.py +2 -2
hud/agents/tests/test_openai.py +3 -1
hud/cli/__init__.py +114 -52
hud/cli/build.py +121 -71
hud/cli/debug.py +2 -2
hud/cli/{mcp_server.py → dev.py} +101 -38
hud/cli/eval.py +175 -90
hud/cli/init.py +442 -64
hud/cli/list_func.py +72 -71
hud/cli/pull.py +1 -2
hud/cli/push.py +35 -23
hud/cli/remove.py +35 -41
hud/cli/tests/test_analyze.py +2 -1
hud/cli/tests/test_analyze_metadata.py +42 -49
hud/cli/tests/test_build.py +28 -52
hud/cli/tests/test_cursor.py +1 -1
hud/cli/tests/test_debug.py +1 -1
hud/cli/tests/test_list_func.py +75 -64
hud/cli/tests/test_main_module.py +30 -0
hud/cli/tests/test_mcp_server.py +3 -3
hud/cli/tests/test_pull.py +30 -61
hud/cli/tests/test_push.py +70 -89
hud/cli/tests/test_registry.py +36 -38
hud/cli/tests/test_utils.py +1 -1
hud/cli/utils/__init__.py +1 -0
hud/cli/{docker_utils.py → utils/docker.py} +36 -0
hud/cli/{env_utils.py → utils/environment.py} +7 -7
hud/cli/{interactive.py → utils/interactive.py} +91 -19
hud/cli/{analyze_metadata.py → utils/metadata.py} +12 -8
hud/cli/{registry.py → utils/registry.py} +28 -30
hud/cli/{remote_runner.py → utils/remote_runner.py} +1 -1
hud/cli/utils/runner.py +134 -0
hud/cli/utils/server.py +250 -0
hud/clients/base.py +1 -1
hud/clients/fastmcp.py +5 -13
hud/clients/mcp_use.py +6 -10
hud/server/server.py +35 -5
hud/shared/exceptions.py +11 -0
hud/shared/tests/test_exceptions.py +22 -0
hud/telemetry/tests/__init__.py +0 -0
hud/telemetry/tests/test_replay.py +40 -0
hud/telemetry/tests/test_trace.py +63 -0
hud/tools/base.py +20 -3
hud/tools/computer/hud.py +15 -6
hud/tools/executors/tests/test_base_executor.py +27 -0
hud/tools/response.py +12 -8
hud/tools/tests/test_response.py +60 -0
hud/tools/tests/test_tools_init.py +49 -0
hud/utils/design.py +19 -8
hud/utils/mcp.py +17 -5
hud/utils/tests/test_mcp.py +112 -0
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/METADATA +16 -13
{hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/RECORD +62 -52
hud/cli/runner.py +0 -160
/hud/cli/{cursor.py → utils/cursor.py} +0 -0
/hud/cli/{utils.py → utils/logging.py} +0 -0
{hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/WHEEL +0 -0
{hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.11.dist-info → hud_python-0.4.13.dist-info}/licenses/LICENSE +0 -0

hud/cli/eval.py CHANGED Viewed

@@ -6,18 +6,13 @@ import asyncio
 import json
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Any, Literal
 import typer
 import hud
 from hud.utils.design import HUDDesign
-if TYPE_CHECKING:
-    from datasets import Dataset
-    from hud.agents import ClaudeAgent, OperatorAgent
-    from hud.agents.misc.response_agent import ResponseAgent
 logger = logging.getLogger(__name__)
 design = HUDDesign()
@@ -29,17 +24,8 @@ def build_agent(
     allowed_tools: list[str] | None = None,
 ) -> Any:
     """Create and return the requested agent type."""
     # Import agents lazily to avoid dependency issues
-    try:
-        from hud.agents.misc.response_agent import ResponseAgent
-    except ImportError as e:
-        design.error(
-            "Agent dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
-        )
-        raise typer.Exit(1) from e
     if agent_type == "openai":
         try:
             from hud.agents import OperatorAgent
@@ -49,14 +35,14 @@ def build_agent(
                 "Please install with: pip install 'hud-python[agent]'"
             )
             raise typer.Exit(1) from e
-        allowed_tools = allowed_tools or ["openai_computer"]
-        return OperatorAgent(
-            allowed_tools=allowed_tools,
-            response_agent=ResponseAgent(),
-        )
+        if allowed_tools:
+            return OperatorAgent(
+                allowed_tools=allowed_tools,
+            )
+        else:
+            return OperatorAgent()
     # Fallback Claude agent (Anthropic)
     try:
         from hud.agents import ClaudeAgent
@@ -66,15 +52,18 @@ def build_agent(
             "Please install with: pip install 'hud-python[agent]'"
         )
         raise typer.Exit(1) from e
     model = model or "claude-sonnet-4-20250514"
-    allowed_tools = allowed_tools or ["anthropic_computer"]
-    return ClaudeAgent(
-        model=model,
-        allowed_tools=allowed_tools,
-        response_agent=ResponseAgent(),
-    )
+    if allowed_tools:
+        return ClaudeAgent(
+            model=model,
+            allowed_tools=allowed_tools,
+        )
+    else:
+        return ClaudeAgent(
+            model=model,
+        )
 async def run_single_task(
@@ -85,26 +74,91 @@ async def run_single_task(
     allowed_tools: list[str] | None = None,
     max_steps: int = 10,
 ) -> None:
-    """Load one task and execute it."""
+    """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
     design.info("📊 Loading dataset…")
-    # Import Task lazily
+    # Import Task and run_dataset lazily
     try:
-        from hud.datasets import Task
+        from hud.datasets import Task, run_dataset
     except ImportError as e:
         design.error(
             "Dataset dependencies are not installed. "
             "Please install with: pip install 'hud-python[agent]'"
         )
         raise typer.Exit(1) from e
-    # Check if it's a single task JSON file
+    # Check if it's a JSON file
     path = Path(source)
     if path.exists() and path.suffix == ".json":
-        with open(path, "r") as f:
-            task_data = json.load(f)
-        task = Task(**task_data)
+        with open(path) as f:  # noqa: ASYNC230
+            json_data = json.load(f)
+        # Check if JSON contains multiple tasks (list with more than 1 task)
+        if isinstance(json_data, list) and len(json_data) > 1:
+            design.info(f"Found {len(json_data)} tasks in JSON file, running as dataset…")
+            # Build agent class and config for run_dataset
+            if agent_type == "openai":
+                try:
+                    from hud.agents import OperatorAgent
+                    agent_class = OperatorAgent
+                except ImportError as e:
+                    design.error(
+                        "OpenAI agent dependencies are not installed. "
+                        "Please install with: pip install 'hud-python[agent]'"
+                    )
+                    raise typer.Exit(1) from e
+                agent_config: dict[str, Any] = {
+                }
+                if allowed_tools:
+                    agent_config["allowed_tools"] = allowed_tools
+            else:
+                try:
+                    from hud.agents import ClaudeAgent
+                    agent_class = ClaudeAgent
+                except ImportError as e:
+                    design.error(
+                        "Claude agent dependencies are not installed. "
+                        "Please install with: pip install 'hud-python[agent]'"
+                    )
+                    raise typer.Exit(1) from e
+                agent_config = {
+                    "model": model or "claude-sonnet-4-20250514",
+                }
+                if allowed_tools:
+                    agent_config["allowed_tools"] = allowed_tools
+            # Run as dataset with single-task concurrency to maintain debug behavior
+            results = await run_dataset(
+                name=f"JSON Dataset: {path.name}",
+                dataset=json_data,  # Pass the list directly
+                agent_class=agent_class,
+                agent_config=agent_config,
+                max_concurrent=1,  # Run sequentially for debug mode
+                metadata={"source": str(path)},
+                max_steps=max_steps,
+            )
+            # Display summary
+            successful = sum(1 for r in results if getattr(r, "reward", 0) > 0)
+            design.success(f"Completed {len(results)} tasks: {successful} successful")
+            return
+        # Single task JSON (either direct object or list with 1 task)
+        if isinstance(json_data, list) and len(json_data) == 1:
+            design.info("Found 1 task in JSON file, running as single task…")
+            task = Task(**json_data[0])
+        elif isinstance(json_data, dict):
+            task = Task(**json_data)
+        else:
+            design.error("JSON file must contain a list of tasks when using --full flag")
+            raise typer.Exit(1)
     else:
         # Load from HuggingFace dataset
         try:
@@ -115,15 +169,15 @@ async def run_single_task(
                 "Please install with: pip install 'hud-python[agent]'"
             )
             raise typer.Exit(1) from e
         dataset = load_dataset(source, split="train")
         # Get first task from dataset
         sample_task = dataset[0]  # type: ignore[index]
         task = Task(**sample_task)  # type: ignore[arg-type]
     task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
     with hud.trace(name=task_prompt):
         agent = build_agent(
             agent_type,
@@ -145,7 +199,7 @@ async def run_full_dataset(
     max_steps: int = 50,
 ) -> list[Any]:
     """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
     # Import run_dataset lazily
     try:
         from hud.datasets import run_dataset
@@ -155,11 +209,29 @@ async def run_full_dataset(
             "Please install with: pip install 'hud-python[agent]'"
         )
         raise typer.Exit(1) from e
+    # Check if source is a JSON file with list of tasks
+    path = Path(source)
+    dataset_or_tasks = source
+    dataset_name = source.split("/")[-1]
+    if path.exists() and path.suffix == ".json":
+        with open(path) as f:  # noqa: ASYNC230
+            json_data = json.load(f)
+        if isinstance(json_data, list):
+            dataset_or_tasks = json_data
+            dataset_name = f"JSON Dataset: {path.name}"
+            design.info(f"Found {len(json_data)} tasks in JSON file")
+        else:
+            design.error("JSON file must contain a list of tasks when using --full flag")
+            raise typer.Exit(1)
     # Build agent class + config for run_dataset
     if agent_type == "openai":
         try:
             from hud.agents import OperatorAgent
             agent_class = OperatorAgent
         except ImportError as e:
             design.error(
@@ -167,13 +239,16 @@ async def run_full_dataset(
                 "Please install with: pip install 'hud-python[agent]'"
             )
             raise typer.Exit(1) from e
         agent_config: dict[str, Any] = {
-            "allowed_tools": allowed_tools or ["openai_computer"],
         }
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
     else:
         try:
             from hud.agents import ClaudeAgent
             agent_class = ClaudeAgent
         except ImportError as e:
             design.error(
@@ -181,29 +256,29 @@ async def run_full_dataset(
                 "Please install with: pip install 'hud-python[agent]'"
             )
             raise typer.Exit(1) from e
         agent_config = {
             "model": model or "claude-sonnet-4-20250514",
-            "allowed_tools": allowed_tools or ["anthropic_computer"],
         }
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
     design.info("🚀 Running evaluation…")
     return await run_dataset(
-        name=f"Evaluation {source.split('/')[-1]}",
-        dataset=source,
+        name=f"Evaluation {dataset_name}",
+        dataset=dataset_or_tasks,
         agent_class=agent_class,
         agent_config=agent_config,
         max_concurrent=max_concurrent,
         metadata={"dataset": source},
         max_steps=max_steps,
-        auto_respond=True,
     )
 def eval_command(
     source: str = typer.Argument(
         ...,
-        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
+        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50'), single task JSON file, or JSON file with list of tasks",  # noqa: E501
     ),
     full: bool = typer.Option(
         False,
@@ -237,66 +312,76 @@ def eval_command(
     ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
     Examples:
         # Evaluate a single task from SheetBench
         hud eval hud-evals/SheetBench-50
         # Evaluate the FULL SheetBench dataset with Claude
         hud eval hud-evals/SheetBench-50 --full --agent claude
         # Run a single task from a JSON file
         hud eval task.json
+        # Run multiple tasks from a JSON file (auto-detects list)
+        hud eval tasks.json  # If tasks.json contains a list, runs all tasks
+        # Run JSON list with full dataset mode and concurrency
+        hud eval tasks.json --full --max-concurrent 10
         # Run with OpenAI Operator agent
         hud eval hud-evals/OSWorld-Gold-Beta --agent openai
     """
-    from hud.settings import settings
     import os
+    from hud.settings import settings
     # Check for required API keys
     if agent == "claude":
         if not settings.anthropic_api_key or not os.environ.get("ANTHROPIC_API_KEY"):
             design.error("ANTHROPIC_API_KEY is required for Claude agent")
             design.info("Set it in your environment or .env file: ANTHROPIC_API_KEY=your-key-here")
             raise typer.Exit(1)
-    elif agent == "openai":
-        if not settings.openai_api_key or not os.environ.get("OPENAI_API_KEY"):
-            design.error("OPENAI_API_KEY is required for OpenAI agent")
-            design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
-            raise typer.Exit(1)
+    elif agent == "openai" and (
+        not settings.openai_api_key or not os.environ.get("OPENAI_API_KEY")
+    ):
+        design.error("OPENAI_API_KEY is required for OpenAI agent")
+        design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
+        raise typer.Exit(1)
     # Check for HUD_API_KEY if using HUD services
     if not settings.api_key or not os.environ.get("HUD_API_KEY"):
         design.warning("HUD_API_KEY not set. Some features may be limited.")
         design.info("Get your API key at: https://app.hud.so")
     # Parse allowed tools
     allowed_tools_list = (
-        [t.strip() for t in allowed_tools.split(",") if t.strip()]
-        if allowed_tools
-        else None
+        [t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None
     )
     # Set default max_steps if not provided
     if max_steps is None:
         max_steps = 50 if full else 10
     # Run evaluation
     if full:
-        asyncio.run(run_full_dataset(
-            source,
-            agent_type=agent,
-            model=model,
-            allowed_tools=allowed_tools_list,
-            max_concurrent=max_concurrent,
-            max_steps=max_steps,
-        ))
+        asyncio.run(
+            run_full_dataset(
+                source,
+                agent_type=agent,
+                model=model,
+                allowed_tools=allowed_tools_list,
+                max_concurrent=max_concurrent,
+                max_steps=max_steps,
+            )
+        )
     else:
-        asyncio.run(run_single_task(
-            source,
-            agent_type=agent,
-            model=model,
-            allowed_tools=allowed_tools_list,
-            max_steps=max_steps,
-        ))
+        asyncio.run(
+            run_single_task(
+                source,
+                agent_type=agent,
+                model=model,
+                allowed_tools=allowed_tools_list,
+                max_steps=max_steps,
+            )
+        )

hud-python 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl