PyPI - hud-python - Versions diffs - 0.4.51__tar.gz → 0.4.52__tar.gz - Mend

hud-python 0.4.51.tar.gz → 0.4.52.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (261)

{hud_python-0.4.51 → hud_python-0.4.52}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.51
+Version: 0.4.52
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -48,6 +48,7 @@ Requires-Dist: opentelemetry-api>=1.34.1
 Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
 Requires-Dist: opentelemetry-instrumentation-mcp==0.47.0
 Requires-Dist: opentelemetry-sdk>=1.34.1
+Requires-Dist: packaging>=21.0
 Requires-Dist: pathspec>=0.12.1
 Requires-Dist: pillow>=11.1.0
 Requires-Dist: prompt-toolkit==3.0.51

{hud_python-0.4.51 → hud_python-0.4.52}/environments/blank/server/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "MCP server for blank environment"
 requires-python = ">=3.11"
 dependencies = [
-    "hud-python>=0.4.51",
+    "hud-python>=0.4.52",
     "httpx>=0.28.1",
 ]

{hud_python-0.4.51 → hud_python-0.4.52}/environments/deepresearch/server/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "MCP server for DeepResearch environment"
 requires-python = ">=3.11"
 dependencies = [
-    "hud-python>=0.4.51",
+    "hud-python>=0.4.52",
     "httpx>=0.24.0",
 ]

{hud_python-0.4.51 → hud_python-0.4.52}/hud/__init__.py RENAMED Viewed

@@ -5,10 +5,22 @@ tools for building, evaluating, and training AI agents.
 from __future__ import annotations
-from .telemetry import Trace, clear_trace, create_job, get_trace, instrument, job, trace
+from .telemetry import (
+    Trace,
+    async_job,
+    async_trace,
+    clear_trace,
+    create_job,
+    get_trace,
+    instrument,
+    job,
+    trace,
+)
 __all__ = [
     "Trace",
+    "async_job",
+    "async_trace",
     "clear_trace",
     "create_job",
     "get_trace",

{hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/base.py RENAMED Viewed

@@ -55,6 +55,7 @@ class MCPAgent(ABC):
         # Filtering
         allowed_tools: list[str] | None = None,
         disallowed_tools: list[str] | None = None,
+        response_tool_name: str | None = None,
         # Messages
         system_prompt: str = GLOBAL_SYSTEM_PROMPT,
         append_setup_output: bool = True,
@@ -74,6 +75,7 @@ class MCPAgent(ABC):
                 that provides `mcp_config`.
             allowed_tools: Names of tools to allow (None means allow all).
             disallowed_tools: Names of tools to always exclude.
+            response_tool_name: Name of the tool to use for response.
             system_prompt: System prompt to seed the conversation.
             append_setup_output: Whether to append setup tool output to the
                 first turn's messages.
@@ -108,7 +110,7 @@ class MCPAgent(ABC):
         # Initialize these here so methods can be called before initialize()
         self._tool_map: dict[str, types.Tool] = {}  # Simplified: just name to tool
-        self.response_tool_name = None
+        self.response_tool_name = response_tool_name
         # Trace
         self._auto_trace = auto_trace
@@ -168,6 +170,8 @@ class MCPAgent(ABC):
                     self.disallowed_tools.extend(task.agent_config["disallowed_tools"])
                 else:  # If disallowed_tools is None, we overwrite it
                     self.disallowed_tools = task.agent_config["disallowed_tools"]
+            if "response_tool_name" in task.agent_config:
+                self.response_tool_name = task.agent_config["response_tool_name"]
         all_tools = await self.mcp_client.list_tools()
         self._available_tools = []

{hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/lite_llm.py RENAMED Viewed

@@ -47,7 +47,7 @@ class LiteAgent(GenericOpenAIChatAgent):
             **agent_kwargs,
         )
-    def get_tool_schemas(self) -> list[dict]:
+    def get_tool_schemas(self) -> list[Any]:
         # Prefer LiteLLM's stricter transformer (handles Bedrock & friends)
         if transform_mcp_tool_to_openai_tool is not None:
             return [

{hud_python-0.4.51 → hud_python-0.4.52}/hud/agents/tests/test_base.py RENAMED Viewed

@@ -94,7 +94,7 @@ class TestBaseMCPAgent:
         assert agent.mcp_client is not None
         assert agent.allowed_tools is None
-        assert agent.disallowed_tools == []
+        assert agent.disallowed_tools is None
         assert agent.initial_screenshot is True
         assert agent.system_prompt is not None  # Default system prompt is set
@@ -241,6 +241,13 @@ class TestBaseMCPAgent:
         assert "tool2" not in tool_names  # Not in allowed list
         assert "tool3" not in tool_names  # In disallowed list
+        # Make sure tool schemas are correct
+        schemas = agent.get_tool_schemas()
+        assert len(schemas) == 1
+        assert schemas[0]["name"] == "tool1"
+        assert schemas[0]["description"] == "Tool 1"
+        assert schemas[0]["parameters"] == {"type": "object"}
     @pytest.mark.asyncio
     async def test_call_tool_success(self):
         """Test successful tool call."""
@@ -322,21 +329,6 @@ class TestBaseMCPAgent:
         # call_tools doesn't validate empty names, it will return error
         await agent.call_tools(tool_call)
-    def test_get_tool_schemas(self):
-        """Test getting tool schemas."""
-        agent = MockMCPAgent()
-        agent._available_tools = [
-            types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
-            types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
-        ]
-        schemas = agent.get_tool_schemas()
-        # Should include non-lifecycle tools
-        assert len(schemas) == 1
-        assert schemas[0]["name"] == "tool1"
     def test_get_tools_by_server(self):
         """Test getting tools grouped by server."""
         agent = MockMCPAgent()

{hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/__init__.py RENAMED Viewed

@@ -796,33 +796,19 @@ def eval(
         help="Comma-separated list of allowed tools",
     ),
     max_concurrent: int = typer.Option(
-        50,
+        30,
         "--max-concurrent",
-        help="Max concurrent tasks (prevents rate limits in both asyncio and parallel modes)",
+        help="Maximum concurrent tasks (1-200 recommended, prevents rate limits)",
     ),
     max_steps: int | None = typer.Option(
         None,
         "--max-steps",
         help="Maximum steps per task (default: 10 for single, 50 for full)",
     ),
-    parallel: bool = typer.Option(
-        False,
-        "--parallel",
-        help="Use process-based parallel execution for large datasets (100+ tasks)",
-    ),
-    max_workers: int | None = typer.Option(
-        None,
-        "--max-workers",
-        help="Number of worker processes for parallel mode (auto-optimized if not set)",
-    ),
-    max_concurrent_per_worker: int = typer.Option(
-        20,
-        "--max-concurrent-per-worker",
-        help="Maximum concurrent tasks per worker in parallel mode",
-    ),
     verbose: bool = typer.Option(
         False,
         "--verbose",
+        "-v",
         help="Enable verbose output from the agent",
     ),
     very_verbose: bool = typer.Option(
@@ -867,14 +853,14 @@ def eval(
             source = find_tasks_file(None, msg="Select a tasks file to run")
             hud_console.success(f"Selected: {source}")
-        except Exception as e:
+        except (FileNotFoundError, Exception):
             hud_console.error(
                 "No source provided and no task/eval JSON files found in current directory"
             )
             hud_console.info(
                 "Usage: hud eval <source> or create a task JSON file (e.g., task.json, tasks.jsonl)"
             )
-            raise typer.Exit(1) from e
+            raise typer.Exit(1) from None
     # Import eval_command lazily to avoid importing agent dependencies
     try:
@@ -950,9 +936,6 @@ def eval(
         allowed_tools=allowed_tools,
         max_concurrent=max_concurrent,
         max_steps=max_steps,
-        parallel=parallel,
-        max_workers=max_workers,
-        max_concurrent_per_worker=max_concurrent_per_worker,
         verbose=verbose,
         very_verbose=very_verbose,
         vllm_base_url=vllm_base_url,
@@ -1126,6 +1109,13 @@ def set(
 def main() -> None:
     """Main entry point for the CLI."""
+    # Check for updates (including on --version command)
+    # Skip only on help-only commands
+    if not (len(sys.argv) == 1 or (len(sys.argv) == 2 and sys.argv[1] in ["--help", "-h"])):
+        from .utils.version_check import display_update_prompt
+        display_update_prompt()
     # Handle --version flag before Typer parses args
     if "--version" in sys.argv:
         try:

{hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/eval.py RENAMED Viewed

@@ -300,6 +300,7 @@ async def run_single_task(
         agent_config = {
             "model": model or "claude-sonnet-4-20250514",
             "verbose": verbose,
+            "validate_api_key": False,
         }
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
@@ -345,24 +346,18 @@ async def run_full_dataset(
     allowed_tools: list[str] | None = None,
     max_concurrent: int = 30,
     max_steps: int = 10,
-    parallel: bool = False,
-    max_workers: int | None = None,
-    max_concurrent_per_worker: int = 25,
     verbose: bool = False,
     vllm_base_url: str | None = None,
     group_size: int = 1,
 ) -> list[Any]:
-    """Run evaluation across the entire dataset.
-    Uses either asyncio-based run_dataset or process-based parallel execution
-    depending on the parallel flag."""
+    """Run evaluation across the entire dataset using asyncio-based concurrency."""
     # Provide early feedback to user
     hud_console.info("🔧 Initializing evaluation...")
     # Import run_dataset lazily
     try:
-        from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
+        from hud.datasets import run_dataset
         from hud.utils.tasks import load_tasks
     except ImportError as e:
         hud_console.error(
@@ -434,7 +429,7 @@ async def run_full_dataset(
             )
             raise typer.Exit(1) from e
-        agent_config = {"verbose": verbose}
+        agent_config = {"verbose": verbose, "validate_api_key": False}
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
@@ -472,6 +467,7 @@ async def run_full_dataset(
         agent_config = {
             "model": model or "claude-sonnet-4-20250514",
             "verbose": verbose,
+            "validate_api_key": False,
         }
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
@@ -505,9 +501,7 @@ async def run_full_dataset(
                 agent_class=agent_class,
                 agent_config=agent_config,
                 group_size=group_size,
-                max_parallel_episodes=max_concurrent
-                if not parallel
-                else max_concurrent_per_worker * (max_workers or 4),
+                max_parallel_episodes=max_concurrent,
                 max_steps=max_steps,
                 verbose=verbose,
                 job_id=job.id,
@@ -519,48 +513,18 @@ async def run_full_dataset(
         # Return stats for consistency with other modes
         return stats
-    # Original logic for non-grouped evaluation
-    elif parallel:
-        hud_console.info(
-            f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…"  # noqa: E501
-        )
-        if max_workers is None:
-            # Use auto-optimization (now the default run_dataset_parallel)
-            return await run_dataset_parallel(
-                name=f"Evaluation {dataset_name}",
-                dataset=dataset_or_tasks,
-                agent_class=agent_class,
-                agent_config=agent_config,
-                max_concurrent=max_concurrent,
-                metadata={"dataset": source, "parallel": True},
-                max_steps=max_steps,
-                auto_respond=True,
-            )
-        else:
-            # Use manual configuration
-            return await run_dataset_parallel_manual(
-                name=f"Evaluation {dataset_name}",
-                dataset=dataset_or_tasks,
-                agent_class=agent_class,
-                agent_config=agent_config,
-                max_workers=max_workers,
-                max_concurrent_per_worker=max_concurrent_per_worker,
-                max_concurrent=max_concurrent,
-                metadata={"dataset": source, "parallel": True},
-                max_steps=max_steps,
-                auto_respond=True,
-            )
-    else:
-        hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
-        return await run_dataset(
-            name=f"Evaluation {dataset_name}",
-            dataset=dataset_or_tasks,
-            agent_class=agent_class,
-            agent_config=agent_config,
-            max_concurrent=max_concurrent,
-            metadata={"dataset": source},
-            max_steps=max_steps,
-        )
+    # Run evaluation with asyncio-based concurrency
+    hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
+    return await run_dataset(
+        name=f"Evaluation {dataset_name}",
+        dataset=dataset_or_tasks,
+        agent_class=agent_class,
+        agent_config=agent_config,
+        max_concurrent=max_concurrent,
+        metadata={"dataset": source},
+        max_steps=max_steps,
+        auto_respond=True,
+    )
 def eval_command(
@@ -591,31 +555,20 @@ def eval_command(
     max_concurrent: int = typer.Option(
         30,
         "--max-concurrent",
-        help="Concurrency level for asyncio mode (ignored in parallel mode)",
+        help=(
+            "Maximum concurrent tasks (1-200 recommended, prevents rate limits "
+            "and resource exhaustion)"
+        ),
     ),
     max_steps: int | None = typer.Option(
         None,
         "--max-steps",
         help="Maximum steps per task (default: 10 for single, 50 for full)",
     ),
-    parallel: bool = typer.Option(
-        False,
-        "--parallel",
-        help="Use process-based parallel execution for large datasets (100+ tasks)",
-    ),
-    max_workers: int | None = typer.Option(
-        None,
-        "--max-workers",
-        help="Number of worker processes for parallel mode (auto-optimized if not set)",
-    ),
-    max_concurrent_per_worker: int = typer.Option(
-        20,
-        "--max-concurrent-per-worker",
-        help="Maximum concurrent tasks per worker in parallel mode",
-    ),
     verbose: bool = typer.Option(
         False,
         "--verbose",
+        "-v",
         help="Enable verbose output from the agent",
     ),
     very_verbose: bool = typer.Option(
@@ -650,23 +603,20 @@ def eval_command(
         # Evaluate a single task from SheetBench
         hud eval hud-evals/SheetBench-50
-        # Evaluate the FULL SheetBench dataset with Claude (asyncio mode)
+        # Evaluate the FULL SheetBench dataset with Claude
         hud eval hud-evals/SheetBench-50 --full --agent claude
-        # Run large dataset with PARALLEL execution (auto-optimized)
-        hud eval hud-evals/OSWorld-Verified-Gold --full --parallel
-        # Parallel mode with manual configuration (16 workers, 25 tasks each)
-        hud eval hud-evals/OSWorld-Verified-Gold --full --parallel --max-workers 16
+        # Run with higher concurrency for faster evaluation
+        hud eval hud-evals/OSWorld-Verified-Gold --full --max-concurrent 100
-        # Limit total concurrent tasks to prevent rate limits
-        hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20
+        # Limit concurrent tasks to prevent rate limits
+        hud eval hud-evals/SheetBench-50 --full --max-concurrent 20
         # Run a single task from a JSON file
         hud eval task.json
-        # Run multiple tasks from a JSON file with parallel execution
-        hud eval tasks.json --full --parallel
+        # Run multiple tasks from a JSON file
+        hud eval tasks.json --full
         # Run with OpenAI Operator agent
         hud eval hud-evals/OSWorld-Gold-Beta --agent openai
@@ -736,7 +686,11 @@ def eval_command(
     # Run evaluation
     if full:
-        asyncio.run(
+        import time
+        start_time = time.time()
+        results = asyncio.run(
             run_full_dataset(
                 source,
                 agent_type=agent,
@@ -744,14 +698,29 @@ def eval_command(
                 allowed_tools=allowed_tools_list,
                 max_concurrent=max_concurrent,
                 max_steps=max_steps,
-                parallel=parallel,
-                max_workers=max_workers,
-                max_concurrent_per_worker=max_concurrent_per_worker,
                 verbose=very_verbose or verbose,
                 vllm_base_url=vllm_base_url,
                 group_size=group_size,
             )
         )
+        elapsed = time.time() - start_time
+        # Print statistics (only for non-grouped mode)
+        if group_size == 1 and results:
+            hud_console.info("\n" + "=" * 50)
+            hud_console.success("📊 Evaluation Complete!")
+            hud_console.info("=" * 50)
+            hud_console.info(f"Total tasks: {len(results)}")
+            hud_console.info(f"Time elapsed: {elapsed:.2f} seconds")
+            hud_console.info(f"Throughput: {len(results) / elapsed:.2f} tasks/second")
+            hud_console.info(f"Execution mode: ASYNCIO (max_concurrent: {max_concurrent})")
+            # Count successes
+            successful = sum(1 for r in results if getattr(r, "reward", 0) > 0.7)
+            success_rate = 100 * successful / len(results)
+            hud_console.info(f"Successful tasks: {successful}/{len(results)} ({success_rate:.1f}%)")
+            hud_console.info("=" * 50)
     else:
         asyncio.run(
             run_single_task(

{hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_build.py RENAMED Viewed

@@ -373,7 +373,8 @@ ENV API_KEY
         with open(lock_file) as f:
             lock_data = yaml.safe_load(f)
-        assert lock_data["image"] == "test/env:latest@sha256:abc123"
+        assert lock_data["images"]["full"] == "test-env:0.1.0@sha256:abc123"
+        assert lock_data["images"]["local"] == "test-env:0.1.0"
         assert lock_data["build"]["version"] == "0.1.0"
         assert lock_data["environment"]["toolCount"] == 2
         assert len(lock_data["tools"]) == 2

{hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_eval.py RENAMED Viewed

@@ -332,6 +332,7 @@ class TestRunDatasetToolFiltering:
             patch.object(ClaudeAgent, "_run_context", mock_run_context),
             patch.object(ClaudeAgent, "call_tools", mock_call_tools),
             patch("hud.clients.MCPClient", return_value=mock_client_instance),
+            patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
         ):
             # Run the dataset
             await run_dataset(
@@ -400,6 +401,7 @@ class TestRunDatasetToolFiltering:
             patch.object(ClaudeAgent, "_run_context", mock_run_context),
             patch.object(ClaudeAgent, "call_tools", mock_call_tools),
             patch("hud.clients.MCPClient", return_value=mock_client_instance),
+            patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
         ):
             # Run the dataset
             await run_dataset(
@@ -500,6 +502,7 @@ class TestSystemPromptHandling:
             patch.object(ClaudeAgent, "_run_context", mock_run_context),
             patch.object(ClaudeAgent, "call_tools", mock_call_tools),
             patch("hud.clients.MCPClient", return_value=mock_mcp_client),
+            patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
         ):
             # Run the dataset
             await run_dataset(
@@ -551,6 +554,7 @@ class TestSystemPromptHandling:
             patch.object(ClaudeAgent, "_run_context", mock_run_context),
             patch.object(ClaudeAgent, "call_tools", mock_call_tools),
             patch("hud.clients.MCPClient", return_value=mock_mcp_client),
+            patch("hud.settings.settings.anthropic_api_key", "sk-test-key"),
         ):
             # Run the dataset
             await run_dataset(

{hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/tests/test_mcp_server.py RENAMED Viewed

@@ -19,7 +19,7 @@ class TestRunMCPDevServer:
         import click
         with (
-            patch("hud.cli.dev.image_exists", return_value=False),
+            patch("hud.cli.utils.environment.image_exists", return_value=False),
             patch("click.confirm", return_value=False),
             pytest.raises(click.Abort),
         ):

{hud_python-0.4.51 → hud_python-0.4.52}/hud/cli/utils/tasks.py RENAMED Viewed

@@ -18,9 +18,12 @@ def find_tasks_file(tasks_file: str | None, msg: str = "Select a tasks file") ->
     ]
     all_files = [file for file in all_files if file[0] != "."]  # Remove all config files
+    if not all_files:
+        # No task files found - raise a clear exception
+        raise FileNotFoundError("No task JSON or JSONL files found in current directory")
     if len(all_files) == 1:
         return str(all_files[0])
     else:
         # Prompt user to select a file
         return hud_console.select(msg, choices=all_files)

hud-python 0.4.51__tar.gz → 0.4.52__tar.gz

Potentially problematic release.

hud-python 0.4.51.tar.gz → 0.4.52.tar.gz