PyPI - hud-python - Versions diffs - 0.4.53__tar.gz → 0.4.54__tar.gz - Mend

hud-python 0.4.53.tar.gz → 0.4.54.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (297)

{hud_python-0.4.53 → hud_python-0.4.54}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.53
+Version: 0.4.54
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -42,6 +42,7 @@ Requires-Dist: httpx<1,>=0.23.0
 Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
 Requires-Dist: hud-mcp-python-sdk>=3.13.2
 Requires-Dist: hud-mcp-use-python-sdk==2.3.20
+Requires-Dist: langchain==0.3.27
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: openai
 Requires-Dist: opentelemetry-api>=1.34.1
@@ -247,8 +248,8 @@ The above example let's the agent play 2048 ([See replay](https://hud.so/trace/6
 RL using GRPO a Qwen2.5-VL model on any hud dataset:
 ```bash
-hud get hud-evals/basic-2048 # from HF
-hud rl basic-2048.json
+hud get hud-evals/2048-basic # from HF
+hud rl 2048-basic.json
 ```
 > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
@@ -439,14 +440,14 @@ Train with the new interactive `hud rl` flow:
 uv tool install hud-python
 # Option A: Run directly from a HuggingFace dataset
-hud rl hud-evals/basic-2048
+hud rl hud-evals/2048-basic
 # Option B: Download first, modify, then train
-hud get hud-evals/basic-2048
-hud rl basic-2048.json
+hud get hud-evals/2048-basic
+hud rl 2048-basic.json
 # Optional: baseline evaluation
-hud eval basic-2048.json
+hud eval 2048-basic.json
 ```
 Supports multi‑turn RL for both:

{hud_python-0.4.53 → hud_python-0.4.54}/README.md RENAMED Viewed

@@ -109,8 +109,8 @@ The above example let's the agent play 2048 ([See replay](https://hud.so/trace/6
 RL using GRPO a Qwen2.5-VL model on any hud dataset:
 ```bash
-hud get hud-evals/basic-2048 # from HF
-hud rl basic-2048.json
+hud get hud-evals/2048-basic # from HF
+hud rl 2048-basic.json
 ```
 > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
@@ -301,14 +301,14 @@ Train with the new interactive `hud rl` flow:
 uv tool install hud-python
 # Option A: Run directly from a HuggingFace dataset
-hud rl hud-evals/basic-2048
+hud rl hud-evals/2048-basic
 # Option B: Download first, modify, then train
-hud get hud-evals/basic-2048
-hud rl basic-2048.json
+hud get hud-evals/2048-basic
+hud rl 2048-basic.json
 # Optional: baseline evaluation
-hud eval basic-2048.json
+hud eval 2048-basic.json
 ```
 Supports multi‑turn RL for both:

{hud_python-0.4.53 → hud_python-0.4.54}/environments/README.md RENAMED Viewed

@@ -804,9 +804,9 @@ class TodoCompleted:
 @problem("todo_basic", description="Complete two todo items", difficulty="easy")
 class TodoBasic:
     def get_setup(self):
-        return {"function": "todo_seed", "args": {"num_items": 5}}
+        return {"name": "todo_seed", "arguments": {"num_items": 5}}
     def get_evaluation(self):
-        return {"function": "todo_completed", "args": {"expected_count": 2}}
+        return {"name": "todo_completed", "arguments": {"expected_count": 2}}
 ```
 Decorators keep registration *next to the implementation* and avoid manual bookkeeping.  The server simply exposes the combined metadata through an MCP **resource**.  Follow `environments/browser/src/hud_controller/problems/registry.py` as a template and expose the JSON with `@mcp.resource("problems://registry")`.

{hud_python-0.4.53 → hud_python-0.4.54}/environments/blank/server/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "MCP server for blank environment"
 requires-python = ">=3.11"
 dependencies = [
-    "hud-python>=0.4.53",
+    "hud-python>=0.4.54",
     "httpx>=0.28.1",
 ]

{hud_python-0.4.53 → hud_python-0.4.54}/environments/browser/environment/todo/README.md RENAMED Viewed

@@ -47,8 +47,8 @@ await setup({"name": "todo_basic_usage"})
 await evaluate({"name": "todo_basic_usage"})
 # Direct function calls
-await setup({"function": "todo_reset", "args": {}})
-await evaluate({"function": "todo_completion_rate", "args": {"min_rate": 0.5}})
+await setup({"name": "todo_reset", "arguments": {}})
+await evaluate({"name": "todo_completion_rate", "arguments": {"min_rate": 0.5}})
 # MCP resource discovery
 todo_evaluators = await client.read_resource("evaluators://todo")

{hud_python-0.4.53 → hud_python-0.4.54}/environments/browser/server/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "HUD Browser MCP Server"
 requires-python = ">=3.11,<3.14"
 dependencies = [
-    "hud-python>=0.4.53",
+    "hud-python>=0.4.54",
     "httpx",
     "playwright",
     "pyautogui",

{hud_python-0.4.53 → hud_python-0.4.54}/environments/deepresearch/server/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "MCP server for DeepResearch environment"
 requires-python = ">=3.11"
 dependencies = [
-    "hud-python>=0.4.53",
+    "hud-python>=0.4.54",
     "httpx>=0.24.0",
 ]

{hud_python-0.4.53 → hud_python-0.4.54}/hud/cli/__init__.py RENAMED Viewed

@@ -12,6 +12,8 @@ from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
+from hud.types import AgentType
 from . import list_func as list_module
 from .analyze import (
     analyze_environment,
@@ -847,7 +849,7 @@ def eval(
     hud_console = HUDConsole()
     if integration_test:
-        agent = "integration_test"
+        agent = AgentType.INTEGRATION_TEST
     # If no source provided, reuse RL helper to find a tasks file interactively
     if source is None:
@@ -894,17 +896,17 @@ def eval(
         # Add standard agent choices
         choices.extend(
             [
-                {"name": "Claude 4 Sonnet", "value": "claude"},
-                {"name": "OpenAI Computer Use", "value": "openai"},
-                {"name": "vLLM (Local Server)", "value": "vllm"},
-                {"name": "LiteLLM (Multi-provider)", "value": "litellm"},
+                {"name": "Claude 4 Sonnet", "value": AgentType.CLAUDE},
+                {"name": "OpenAI Computer Use", "value": AgentType.OPENAI},
+                {"name": "vLLM (Local Server)", "value": AgentType.VLLM},
+                {"name": "LiteLLM (Multi-provider)", "value": AgentType.LITELLM},
             ]
         )
         agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
     # Handle HUD model selection
-    if agent and agent not in ["claude", "openai", "vllm", "litellm", "integration_test"]:
+    if agent and agent not in [e.value for e in AgentType]:
         # Find remote model name
         model = agent
         if not vllm_base_url:
@@ -921,20 +923,23 @@ def eval(
             hud_console.error(f"Model {model} not found")
             raise typer.Exit(1)
         model = base_model
-        agent = "vllm"  # Use vLLM backend for HUD models
+        agent = AgentType.VLLM  # Use vLLM backend for HUD models
         hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
     # Validate agent choice
-    valid_agents = ["claude", "openai", "vllm", "litellm", "integration_test"]
+    valid_agents = [e.value for e in AgentType]
     if agent not in valid_agents:
         hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
         raise typer.Exit(1)
+    # Type narrowing: agent is now guaranteed to be an AgentType value after validation
+    agent = AgentType(agent)
     # Run the command
     eval_command(
         source=source,
         full=full,
-        agent=agent,  # type: ignore
+        agent=agent,
         model=model,
         allowed_tools=allowed_tools,
         max_concurrent=max_concurrent,

{hud_python-0.4.53 → hud_python-0.4.54}/hud/cli/dev.py RENAMED Viewed

@@ -238,9 +238,9 @@ async def run_mcp_module(
         if env_dir.exists() and (env_dir / "server.py").exists():
             hud_console.info("")
             hud_console.info(
-                f"{hud_console.sym.FLOW} Don't forget to start the environment backend:"
+                f"{hud_console.sym.FLOW} Don't forget to start the environment backend in another terminal:"
             )
-            hud_console.info("   cd ../environment && uvicorn server:app --reload")
+            hud_console.info("   cd environment && uv run python uvicorn server:app --reload")
         # Launch inspector if requested (first run only)
         if inspector and transport == "http":

{hud_python-0.4.53 → hud_python-0.4.54}/hud/cli/eval.py RENAMED Viewed

@@ -5,13 +5,14 @@ from __future__ import annotations
 import asyncio
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any
 import typer
 import hud
 from hud.cli.utils.env_check import ensure_built, find_environment_dir
 from hud.settings import settings
+from hud.types import AgentType
 from hud.utils.group_eval import display_group_statistics, run_tasks_grouped
 from hud.utils.hud_console import HUDConsole
@@ -113,7 +114,7 @@ def _build_vllm_config(
 def build_agent(
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
+    agent_type: AgentType,
     *,
     model: str | None = None,
     allowed_tools: list[str] | None = None,
@@ -123,11 +124,11 @@ def build_agent(
     """Create and return the requested agent type."""
     # Import agents lazily to avoid dependency issues
-    if agent_type == "integration_test":
+    if agent_type == AgentType.INTEGRATION_TEST:
         from hud.agents.misc.integration_test_agent import IntegrationTestRunner
         return IntegrationTestRunner(verbose=verbose)
-    elif agent_type == "vllm":
+    elif agent_type == AgentType.VLLM:
         # Create a generic OpenAI agent for vLLM server
         try:
             from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
@@ -147,7 +148,7 @@ def build_agent(
         )
         return GenericOpenAIChatAgent(**config)
-    elif agent_type == "openai":
+    elif agent_type == AgentType.OPENAI:
         try:
             from hud.agents import OperatorAgent
         except ImportError as e:
@@ -165,7 +166,7 @@ def build_agent(
         else:
             return OperatorAgent(verbose=verbose)
-    elif agent_type == "litellm":
+    elif agent_type == AgentType.LITELLM:
         try:
             from hud.agents.lite_llm import LiteAgent
         except ImportError as e:
@@ -209,7 +210,7 @@ def build_agent(
 async def run_single_task(
     source: str,
     *,
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
+    agent_type: AgentType = AgentType.CLAUDE,
     model: str | None = None,
     allowed_tools: list[str] | None = None,
     max_steps: int = 10,
@@ -268,14 +269,14 @@ async def run_single_task(
     # Use grouped evaluation if group_size > 1
     agent_config: dict[str, Any] = {}
-    if agent_type == "integration_test":
+    if agent_type == AgentType.INTEGRATION_TEST:
         from hud.agents.misc.integration_test_agent import IntegrationTestRunner
         agent_class = IntegrationTestRunner
         agent_config = {"verbose": verbose}
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "vllm":
+    elif agent_type == AgentType.VLLM:
         # Special handling for vLLM
         from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
@@ -288,14 +289,14 @@ async def run_single_task(
             allowed_tools=allowed_tools,
             verbose=verbose,
         )
-    elif agent_type == "openai":
+    elif agent_type == AgentType.OPENAI:
         from hud.agents import OperatorAgent
         agent_class = OperatorAgent
         agent_config = {"verbose": verbose}
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "litellm":
+    elif agent_type == AgentType.LITELLM:
         from hud.agents.lite_llm import LiteAgent
         agent_class = LiteAgent
@@ -305,7 +306,7 @@ async def run_single_task(
         }
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "claude":
+    elif agent_type == AgentType.CLAUDE:
         from hud.agents import ClaudeAgent
         agent_class = ClaudeAgent
@@ -353,7 +354,7 @@ async def run_single_task(
 async def run_full_dataset(
     source: str,
     *,
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
+    agent_type: AgentType = AgentType.CLAUDE,
     model: str | None = None,
     allowed_tools: list[str] | None = None,
     max_concurrent: int = 30,
@@ -395,12 +396,12 @@ async def run_full_dataset(
     # Build agent class + config for run_dataset
     agent_config: dict[str, Any]
-    if agent_type == "integration_test":  # --integration-test mode
+    if agent_type == AgentType.INTEGRATION_TEST:  # --integration-test mode
         from hud.agents.misc.integration_test_agent import IntegrationTestRunner
         agent_class = IntegrationTestRunner
         agent_config = {"verbose": verbose}
-    elif agent_type == "vllm":
+    elif agent_type == AgentType.VLLM:
         try:
             from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
@@ -419,7 +420,7 @@ async def run_full_dataset(
             allowed_tools=allowed_tools,
             verbose=verbose,
         )
-    elif agent_type == "openai":
+    elif agent_type == AgentType.OPENAI:
         try:
             from hud.agents import OperatorAgent
@@ -435,7 +436,7 @@ async def run_full_dataset(
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "litellm":
+    elif agent_type == AgentType.LITELLM:
         try:
             from hud.agents.lite_llm import LiteAgent
@@ -539,8 +540,8 @@ def eval_command(
         "--full",
         help="Run the entire dataset (omit for single-task debug mode)",
     ),
-    agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option(
-        "claude",
+    agent: AgentType = typer.Option(  # noqa: B008
+        AgentType.CLAUDE,
         "--agent",
         help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
     ),
@@ -648,21 +649,21 @@ def eval_command(
     # We pass integration_test as the agent_type
     if integration_test:
-        agent = "integration_test"
+        agent = AgentType.INTEGRATION_TEST
     # Check for required API keys
-    if agent == "claude":
+    if agent == AgentType.CLAUDE:
         if not settings.anthropic_api_key:
             hud_console.error("ANTHROPIC_API_KEY is required for Claude agent")
             hud_console.info(
                 "Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here"
             )
             raise typer.Exit(1)
-    elif agent == "openai" and not settings.openai_api_key:
+    elif agent == AgentType.OPENAI and not settings.openai_api_key:
         hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
         hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here")
         raise typer.Exit(1)
-    elif agent == "vllm":
+    elif agent == AgentType.VLLM:
         if model:
             hud_console.info(f"Using vLLM with model: {model}")
         else:

{hud_python-0.4.53 → hud_python-0.4.54}/hud/cli/tests/test_eval.py RENAMED Viewed

@@ -11,7 +11,7 @@ from hud.cli.eval import (
     build_agent,
     run_single_task,
 )
-from hud.types import Task, Trace
+from hud.types import AgentType, Task, Trace
 class TestBuildAgent:
@@ -26,7 +26,7 @@ class TestBuildAgent:
             mock_runner.return_value = mock_instance
             # Test with verbose=False
-            result = build_agent("integration_test", verbose=False)
+            result = build_agent(AgentType.INTEGRATION_TEST, verbose=False)
             mock_runner.assert_called_once_with(verbose=False)
             assert result == mock_instance
@@ -40,7 +40,7 @@ class TestBuildAgent:
             mock_runner.return_value = mock_instance
             # Test with verbose=False
-            result = build_agent("claude", verbose=False)
+            result = build_agent(AgentType.CLAUDE, verbose=False)
             mock_runner.assert_called_once_with(model="claude-sonnet-4-20250514", verbose=False)
             assert result == mock_instance
@@ -55,7 +55,7 @@ class TestBuildAgent:
             # Test with verbose=False
             result = build_agent(
-                "claude",
+                AgentType.CLAUDE,
                 model="claude-sonnet-4-20250514",
                 allowed_tools=["act"],
                 verbose=True,
@@ -97,7 +97,7 @@ class TestRunSingleTask:
             patch("hud.cli.eval.find_environment_dir", return_value=None),
             patch("hud.cli.eval.hud.trace"),
         ):
-            await run_single_task("test.json", agent_type="integration_test", max_steps=10)
+            await run_single_task("test.json", agent_type=AgentType.INTEGRATION_TEST, max_steps=10)
             # Verify agent.run was called with the task containing agent_config
             mock_agent.run.assert_called_once()
@@ -119,7 +119,7 @@ class TestRunSingleTask:
             mock_grouped.return_value = [{"task": mock_task, "rewards": [1.0, 0.5]}]
             await run_single_task(
-                "test.json", agent_type="integration_test", group_size=3, max_steps=10
+                "test.json", agent_type=AgentType.INTEGRATION_TEST, group_size=3, max_steps=10
             )
             # Verify run_tasks_grouped was called with correct group_size

{hud_python-0.4.53 → hud_python-0.4.54}/hud/types.py RENAMED Viewed

@@ -5,6 +5,7 @@ import json
 import logging
 import uuid
 from collections import defaultdict
+from enum import Enum
 from string import Template
 from typing import Any, Literal
@@ -21,6 +22,14 @@ logger = logging.getLogger(__name__)
 _missing_api_key_error_logged: bool = False
+class AgentType(str, Enum):
+    CLAUDE = "claude"
+    OPENAI = "openai"
+    VLLM = "vllm"
+    LITELLM = "litellm"
+    INTEGRATION_TEST = "integration_test"
 class Task(BaseModel):
     """
     A task configuration that can be used to create a task.
@@ -325,6 +334,7 @@ class Trace(BaseModel):
 __all__ = [
     "AgentResponse",
+    "AgentType",
     "MCPToolCall",
     "MCPToolResult",
     "Trace",

{hud_python-0.4.53 → hud_python-0.4.54}/hud/utils/tests/test_version.py RENAMED Viewed

@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud
-    assert hud.__version__ == "0.4.53"
+    assert hud.__version__ == "0.4.54"

{hud_python-0.4.53 → hud_python-0.4.54}/hud/version.py RENAMED Viewed

@@ -4,4 +4,4 @@ Version information for the HUD SDK.
 from __future__ import annotations
-__version__ = "0.4.53"
+__version__ = "0.4.54"

{hud_python-0.4.53 → hud_python-0.4.54}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "hud-python"
-version = "0.4.53"
+version = "0.4.54"
 description = "SDK for the HUD platform."
 readme = "README.md"
 requires-python = ">=3.11, <3.13"
@@ -18,6 +18,7 @@ dependencies = [
     "hud-mcp-python-sdk>=3.13.2",
     "hud-fastmcp-python-sdk>=0.1.2",
     "hud-mcp-use-python-sdk==2.3.20",
+    "langchain==0.3.27",
     "pathspec>=0.12.1",
     "wrapt>=1.14.0",
     # CLI dependencies