PyPI - hud-python - Versions diffs - 0.4.53__tar.gz → 0.4.55__tar.gz - Mend

hud-python 0.4.53__tar.gz → 0.4.55__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (299) hide show

{hud_python-0.4.53 → hud_python-0.4.55}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.53
+Version: 0.4.55
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -42,6 +42,7 @@ Requires-Dist: httpx<1,>=0.23.0
 Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
 Requires-Dist: hud-mcp-python-sdk>=3.13.2
 Requires-Dist: hud-mcp-use-python-sdk==2.3.20
+Requires-Dist: langchain==0.3.27
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: openai
 Requires-Dist: opentelemetry-api>=1.34.1
@@ -247,8 +248,8 @@ The above example let's the agent play 2048 ([See replay](https://hud.so/trace/6
 RL using GRPO a Qwen2.5-VL model on any hud dataset:
 ```bash
-hud get hud-evals/basic-2048 # from HF
-hud rl basic-2048.json
+hud get hud-evals/2048-basic # from HF
+hud rl 2048-basic.json
 ```
 > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
@@ -439,14 +440,14 @@ Train with the new interactive `hud rl` flow:
 uv tool install hud-python
 # Option A: Run directly from a HuggingFace dataset
-hud rl hud-evals/basic-2048
+hud rl hud-evals/2048-basic
 # Option B: Download first, modify, then train
-hud get hud-evals/basic-2048
-hud rl basic-2048.json
+hud get hud-evals/2048-basic
+hud rl 2048-basic.json
 # Optional: baseline evaluation
-hud eval basic-2048.json
+hud eval 2048-basic.json
 ```
 Supports multi‑turn RL for both:

{hud_python-0.4.53 → hud_python-0.4.55}/README.md RENAMED Viewed

@@ -109,8 +109,8 @@ The above example let's the agent play 2048 ([See replay](https://hud.so/trace/6
 RL using GRPO a Qwen2.5-VL model on any hud dataset:
 ```bash
-hud get hud-evals/basic-2048 # from HF
-hud rl basic-2048.json
+hud get hud-evals/2048-basic # from HF
+hud rl 2048-basic.json
 ```
 > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
@@ -301,14 +301,14 @@ Train with the new interactive `hud rl` flow:
 uv tool install hud-python
 # Option A: Run directly from a HuggingFace dataset
-hud rl hud-evals/basic-2048
+hud rl hud-evals/2048-basic
 # Option B: Download first, modify, then train
-hud get hud-evals/basic-2048
-hud rl basic-2048.json
+hud get hud-evals/2048-basic
+hud rl 2048-basic.json
 # Optional: baseline evaluation
-hud eval basic-2048.json
+hud eval 2048-basic.json
 ```
 Supports multi‑turn RL for both:

{hud_python-0.4.53 → hud_python-0.4.55}/environments/README.md RENAMED Viewed

@@ -804,9 +804,9 @@ class TodoCompleted:
 @problem("todo_basic", description="Complete two todo items", difficulty="easy")
 class TodoBasic:
     def get_setup(self):
-        return {"function": "todo_seed", "args": {"num_items": 5}}
+        return {"name": "todo_seed", "arguments": {"num_items": 5}}
     def get_evaluation(self):
-        return {"function": "todo_completed", "args": {"expected_count": 2}}
+        return {"name": "todo_completed", "arguments": {"expected_count": 2}}
 ```
 Decorators keep registration *next to the implementation* and avoid manual bookkeeping.  The server simply exposes the combined metadata through an MCP **resource**.  Follow `environments/browser/src/hud_controller/problems/registry.py` as a template and expose the JSON with `@mcp.resource("problems://registry")`.

{hud_python-0.4.53 → hud_python-0.4.55}/environments/blank/server/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "MCP server for blank environment"
 requires-python = ">=3.11"
 dependencies = [
-    "hud-python>=0.4.53",
+    "hud-python>=0.4.54",
     "httpx>=0.28.1",
 ]

{hud_python-0.4.53 → hud_python-0.4.55}/environments/browser/environment/todo/README.md RENAMED Viewed

@@ -47,8 +47,8 @@ await setup({"name": "todo_basic_usage"})
 await evaluate({"name": "todo_basic_usage"})
 # Direct function calls
-await setup({"function": "todo_reset", "args": {}})
-await evaluate({"function": "todo_completion_rate", "args": {"min_rate": 0.5}})
+await setup({"name": "todo_reset", "arguments": {}})
+await evaluate({"name": "todo_completion_rate", "arguments": {"min_rate": 0.5}})
 # MCP resource discovery
 todo_evaluators = await client.read_resource("evaluators://todo")

{hud_python-0.4.53 → hud_python-0.4.55}/environments/browser/server/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "HUD Browser MCP Server"
 requires-python = ">=3.11,<3.14"
 dependencies = [
-    "hud-python>=0.4.53",
+    "hud-python>=0.4.54",
     "httpx",
     "playwright",
     "pyautogui",

{hud_python-0.4.53 → hud_python-0.4.55}/environments/deepresearch/server/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "MCP server for DeepResearch environment"
 requires-python = ">=3.11"
 dependencies = [
-    "hud-python>=0.4.53",
+    "hud-python>=0.4.54",
     "httpx>=0.24.0",
 ]

{hud_python-0.4.53 → hud_python-0.4.55}/hud/agents/base.py RENAMED Viewed

@@ -11,6 +11,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal
 import mcp.types as types
+from hud.agents.utils import log_agent_metadata_to_status, log_task_config_to_current_trace
 from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
 from hud.utils.hud_console import HUDConsole
 from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
@@ -62,6 +63,7 @@ class MCPAgent(ABC):
         initial_screenshot: bool = True,
         # Misc
         model_name: str = "mcp-agent",
+        checkpoint_name: str | None = None,
         response_agent: ResponseAgent | None = None,
         auto_trace: bool = True,
         verbose: bool = False,
@@ -92,6 +94,7 @@ class MCPAgent(ABC):
         self._auto_created_client = False  # Track if we created the client
         self.model_name = model_name
+        self.checkpoint_name = checkpoint_name
         self.console = HUDConsole(logger=logger)
         # Set verbose mode if requested
@@ -198,6 +201,8 @@ class MCPAgent(ABC):
             f"Agent initialized with {len(self.get_available_tools())} tools: {', '.join([t.name for t in self.get_available_tools()])}"  # noqa: E501
         )
+        await log_agent_metadata_to_status(self.model_name, self.checkpoint_name)
     async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
         """
         Run the agent with the given prompt or task.
@@ -223,6 +228,9 @@ class MCPAgent(ABC):
             # Handle Task objects with full lifecycle
             if isinstance(prompt_or_task, Task):
+                # Log a compact summary of task config to the current trace (async)
+                await log_task_config_to_current_trace(prompt_or_task)
                 return await self.run_task(prompt_or_task, max_steps)
             # Handle simple string prompts

{hud_python-0.4.53 → hud_python-0.4.55}/hud/agents/claude.py RENAMED Viewed

@@ -89,7 +89,8 @@ class ClaudeAgent(MCPAgent):
         self.use_computer_beta = use_computer_beta
         self.hud_console = HUDConsole(logger=logger)
-        self.model_name = self.model
+        self.model_name = "Claude"
+        self.checkpoint_name = self.model
         # Track mapping from Claude tool names to MCP tool names
         self._claude_to_mcp_tool_map: dict[str, str] = {}
@@ -98,14 +99,14 @@ class ClaudeAgent(MCPAgent):
         # Append Claude-specific instructions to the base system prompt
         claude_instructions = """
         You are Claude, an AI assistant created by Anthropic. You are helpful, harmless, and honest.
         When working on tasks:
         1. Be thorough and systematic in your approach
         2. Complete tasks autonomously without asking for confirmation
         3. Use available tools efficiently to accomplish your goals
         4. Verify your actions and ensure task completion
         5. Be precise and accurate in all operations
         Remember: You are expected to complete tasks autonomously. The user trusts you to accomplish what they asked.
         """.strip()  # noqa: E501

{hud_python-0.4.53 → hud_python-0.4.55}/hud/agents/openai.py RENAMED Viewed

@@ -70,6 +70,7 @@ class OperatorAgent(MCPAgent):
         self.openai_client = model_client
         self.model = model
+        self.checkpoint_name = self.model
         self.environment = environment
         # State tracking for OpenAI's stateful API
@@ -84,7 +85,7 @@ class OperatorAgent(MCPAgent):
             except Exception as e:
                 raise ValueError(f"OpenAI API key is invalid: {e}") from e
-        self.model_name = "openai-" + self.model
+        self.model_name = "Operator"
         # Append OpenAI-specific instructions to the base system prompt
         openai_instructions = """

{hud_python-0.4.53 → hud_python-0.4.55}/hud/agents/openai_chat_generic.py RENAMED Viewed

@@ -62,7 +62,8 @@ class GenericOpenAIChatAgent(MCPAgent):
         else:
             raise ValueError("Either openai_client or (api_key and base_url) must be provided")
-        self.model_name = model_name
+        self.model_name = "GenericOpenAI"
+        self.checkpoint_name = model_name
         self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
         self.mcp_schemas = []
         self.hud_console = HUDConsole(logger=logger)
@@ -194,7 +195,7 @@ class GenericOpenAIChatAgent(MCPAgent):
             raise ValueError("openai_client is required for GenericOpenAIChatAgent")
         # default transport = OpenAI SDK
         return await self.oai.chat.completions.create(
-            model=self.model_name,
+            model=self.checkpoint_name,
             messages=messages,
             tools=tools,  # type: ignore ready ChatCompletionToolParam-shaped
             **extra,

{hud_python-0.4.53 → hud_python-0.4.55}/hud/agents/tests/test_claude.py RENAMED Viewed

@@ -89,7 +89,7 @@ class TestClaudeAgent:
             validate_api_key=False,  # Skip validation in tests
         )
-        assert agent.model_name == "claude-3-opus-20240229"
+        assert agent.model_name == "Claude"
         assert agent.max_tokens == 1000
         assert agent.anthropic_client == mock_model_client
@@ -103,7 +103,7 @@ class TestClaudeAgent:
                 validate_api_key=False,  # Skip validation in tests
             )
-            assert agent.model_name == "claude-3-opus-20240229"
+            assert agent.model_name == "Claude"
             assert agent.anthropic_client is not None
     @pytest.mark.asyncio

{hud_python-0.4.53 → hud_python-0.4.55}/hud/agents/tests/test_openai.py RENAMED Viewed

@@ -50,7 +50,7 @@ class TestOperatorAgent:
             validate_api_key=False,  # Skip validation in tests
         )
-        assert agent.model_name == "openai-gpt-4"
+        assert agent.model_name == "Operator"
         assert agent.model == "gpt-4"
         assert agent.openai_client == mock_model_client

hud_python-0.4.55/hud/agents/utils.py ADDED Viewed

@@ -0,0 +1,50 @@
+from __future__ import annotations
+import contextlib
+from typing import TYPE_CHECKING
+from hud.otel.context import (
+    _update_task_status_async,
+    get_current_task_run_id,
+)
+if TYPE_CHECKING:
+    from hud.datasets import Task
+async def log_task_config_to_current_trace(task: Task) -> None:
+    with contextlib.suppress(Exception):
+        task_run_id = get_current_task_run_id()
+        if not task_run_id:
+            return
+        raw_config = task.model_dump()
+        await _update_task_status_async(
+            task_run_id,
+            "running",
+            task_id=task.id,
+            extra_metadata={"task_config": raw_config},
+        )
+async def log_agent_metadata_to_status(
+    model_name: str | None = None, checkpoint_name: str | None = None
+) -> None:
+    """Attach agent metadata (model/checkpoint) to current trace status metadata."""
+    with contextlib.suppress(Exception):
+        task_run_id = get_current_task_run_id()
+        if not task_run_id or (not model_name and not checkpoint_name):
+            return
+        agent_meta = {}
+        if model_name is not None:
+            agent_meta["model_name"] = model_name
+        if checkpoint_name is not None:
+            agent_meta["checkpoint_name"] = checkpoint_name
+        await _update_task_status_async(
+            task_run_id,
+            "running",
+            extra_metadata={"agent": agent_meta},
+        )

{hud_python-0.4.53 → hud_python-0.4.55}/hud/cli/__init__.py RENAMED Viewed

@@ -12,6 +12,8 @@ from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
+from hud.types import AgentType
 from . import list_func as list_module
 from .analyze import (
     analyze_environment,
@@ -380,6 +382,11 @@ def dev(
         "--watch",
         help="Additional directories to watch for changes (default: current directory)",
     ),
+    new: bool = typer.Option(
+        False,
+        "--new",
+        help="Show Cursor installation link for new server setup",
+    ),
 ) -> None:
     """🔥 Development mode - run MCP server with hot-reload.
@@ -420,6 +427,7 @@ def dev(
         watch,
         docker=docker,
         docker_args=docker_args,
+        new=new,
     )
@@ -847,7 +855,7 @@ def eval(
     hud_console = HUDConsole()
     if integration_test:
-        agent = "integration_test"
+        agent = AgentType.INTEGRATION_TEST
     # If no source provided, reuse RL helper to find a tasks file interactively
     if source is None:
@@ -894,17 +902,17 @@ def eval(
         # Add standard agent choices
         choices.extend(
             [
-                {"name": "Claude 4 Sonnet", "value": "claude"},
-                {"name": "OpenAI Computer Use", "value": "openai"},
-                {"name": "vLLM (Local Server)", "value": "vllm"},
-                {"name": "LiteLLM (Multi-provider)", "value": "litellm"},
+                {"name": "Claude 4 Sonnet", "value": AgentType.CLAUDE},
+                {"name": "OpenAI Computer Use", "value": AgentType.OPENAI},
+                {"name": "vLLM (Local Server)", "value": AgentType.VLLM},
+                {"name": "LiteLLM (Multi-provider)", "value": AgentType.LITELLM},
             ]
         )
         agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
     # Handle HUD model selection
-    if agent and agent not in ["claude", "openai", "vllm", "litellm", "integration_test"]:
+    if agent and agent not in [e.value for e in AgentType]:
         # Find remote model name
         model = agent
         if not vllm_base_url:
@@ -921,20 +929,23 @@ def eval(
             hud_console.error(f"Model {model} not found")
             raise typer.Exit(1)
         model = base_model
-        agent = "vllm"  # Use vLLM backend for HUD models
+        agent = AgentType.VLLM  # Use vLLM backend for HUD models
         hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
     # Validate agent choice
-    valid_agents = ["claude", "openai", "vllm", "litellm", "integration_test"]
+    valid_agents = [e.value for e in AgentType]
     if agent not in valid_agents:
         hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
         raise typer.Exit(1)
+    # Type narrowing: agent is now guaranteed to be an AgentType value after validation
+    agent = AgentType(agent)
     # Run the command
     eval_command(
         source=source,
         full=full,
-        agent=agent,  # type: ignore
+        agent=agent,
         model=model,
         allowed_tools=allowed_tools,
         max_concurrent=max_concurrent,
@@ -1074,6 +1085,51 @@ def rl(
     )
+@app.command()
+def convert(
+    tasks_file: str = typer.Argument(
+        ..., help="Path to tasks file (JSON/JSONL) to convert to remote MCP configuration"
+    ),
+) -> None:
+    """Convert local MCP task configs to remote (mcp.hud.so) format.
+    This mirrors the implicit conversion flow used by 'hud rl' and writes a new
+    remote_<name>.json next to the source file when needed.
+    """
+    from pathlib import Path
+    from hud.utils.hud_console import HUDConsole
+    hud_console = HUDConsole()
+    try:
+        from .flows.tasks import convert_tasks_to_remote
+        result_path = convert_tasks_to_remote(tasks_file)
+        # If nothing changed, inform the user
+        try:
+            if Path(result_path).resolve() == Path(tasks_file).resolve():
+                hud_console.success(
+                    "Tasks already reference remote MCP URLs. No conversion needed."
+                )
+                hud_console.hint("You can run them directly with: hud eval <tasks_file> --full")
+                return
+        except Exception as e:
+            # Best effort; continue with success message
+            hud_console.debug(f"Path comparison failed, continuing: {e}")
+        hud_console.success(f"Converted tasks written to: {result_path}")
+        hud_console.hint(
+            "You can now run remote flows: hud rl <converted_file> or hud eval <converted_file>"
+        )
+    except typer.Exit:
+        raise
+    except Exception as e:
+        hud_console.error(f"Failed to convert tasks: {e}")
+        raise typer.Exit(1) from e
 @app.command()
 def set(
     assignments: list[str] = typer.Argument(  # type: ignore[arg-type]  # noqa: B008

hud-python 0.4.53__tar.gz → 0.4.55__tar.gz

Potentially problematic release.

hud-python 0.4.53__tar.gz → 0.4.55__tar.gz