hud-python 0.4.28-py3-none-any.whl → 0.4.30-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- hud/__init__.py +2 -1
- hud/agents/base.py +81 -45
- hud/agents/claude.py +8 -4
- hud/agents/openai_chat_generic.py +66 -40
- hud/agents/tests/test_base.py +0 -4
- hud/agents/tests/test_openai.py +1 -1
- hud/cli/__init__.py +182 -52
- hud/cli/dev.py +8 -9
- hud/cli/eval.py +317 -119
- hud/cli/flows/__init__.py +0 -0
- hud/cli/flows/tasks.py +0 -0
- hud/cli/get.py +160 -0
- hud/cli/rl/__init__.py +567 -71
- hud/cli/rl/config.py +94 -0
- hud/cli/rl/display.py +133 -0
- hud/cli/rl/gpu.py +63 -0
- hud/cli/rl/gpu_utils.py +318 -0
- hud/cli/rl/presets.py +96 -0
- hud/cli/rl/remote_runner.py +347 -0
- hud/cli/rl/rl_api.py +150 -0
- hud/cli/rl/vllm.py +177 -0
- hud/cli/tests/test_analyze_metadata.py +0 -1
- hud/cli/utils/tasks.py +26 -0
- hud/clients/base.py +21 -23
- hud/clients/mcp_use.py +36 -44
- hud/clients/tests/test_mcp_use_retry.py +10 -10
- hud/datasets/__init__.py +4 -3
- hud/datasets/{execution/parallel.py → parallel.py} +1 -1
- hud/datasets/{execution/runner.py → runner.py} +1 -1
- hud/datasets/utils.py +1 -1
- hud/native/comparator.py +6 -6
- hud/native/tests/test_comparator.py +8 -8
- hud/native/tests/test_native_init.py +13 -11
- hud/otel/config.py +1 -1
- hud/otel/instrumentation.py +35 -0
- hud/rl/README.md +30 -0
- hud/rl/__init__.py +1 -0
- hud/rl/actor.py +174 -0
- hud/rl/buffer.py +371 -0
- hud/rl/chat_template.jinja +101 -0
- hud/rl/config.py +184 -0
- hud/rl/distributed.py +95 -0
- hud/rl/learner.py +589 -0
- hud/rl/tests/__init__.py +1 -0
- hud/rl/tests/test_learner.py +171 -0
- hud/rl/train.py +354 -0
- hud/rl/types.py +101 -0
- hud/rl/utils/start_vllm_server.sh +30 -0
- hud/rl/utils.py +524 -0
- hud/rl/vllm_adapter.py +125 -0
- hud/settings.py +6 -0
- hud/telemetry/__init__.py +2 -1
- hud/telemetry/job.py +46 -3
- hud/telemetry/tests/test_trace.py +3 -3
- hud/telemetry/trace.py +85 -13
- hud/tools/tests/test_computer.py +3 -3
- hud/tools/tests/test_computer_actions.py +1 -1
- hud/types.py +123 -2
- hud/utils/group_eval.py +223 -0
- hud/utils/hud_console.py +113 -13
- hud/utils/tasks.py +119 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/METADATA +20 -2
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/RECORD +68 -48
- hud/cli/hf.py +0 -406
- hud/cli/rl/README.md +0 -243
- hud/cli/rl/init.py +0 -370
- hud/cli/rl/pod.py +0 -501
- hud/cli/rl/ssh.py +0 -322
- hud/cli/rl/train.py +0 -562
- hud/cli/rl/utils.py +0 -165
- hud/datasets/execution/__init__.py +0 -13
- hud/datasets/task.py +0 -116
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/WHEEL +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.28.dist-info → hud_python-0.4.30.dist-info}/licenses/LICENSE +0 -0
```diff
@@ -11,7 +11,7 @@ from hud.native.comparator import (
     ComparisonResult,
     DataType,
     auto_select_mode,
-
+    comparator,
     detect_type,
     extract_boolean,
     extract_json,
@@ -321,10 +321,10 @@ class TestAliasTools:
     @pytest.mark.asyncio
     async def test_aliases_work(self):
         """Test that aliases are properly registered and work."""
-        from hud.native.comparator import
+        from hud.native.comparator import comparator

         # Check that aliases are registered
-        tool_names = [t.name for t in
+        tool_names = [t.name for t in comparator._tool_manager._tools.values()]

         expected_aliases = [
             "compare_exact",
@@ -433,7 +433,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_json_alias_preprocessing(self):
         """Test JSON extraction in compare_json tool."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         json_tool = tools["compare_json"]

         assert isinstance(json_tool, FunctionTool)
@@ -448,7 +448,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_numeric_alias_preprocessing(self):
         """Test number extraction in numeric tools."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}

         # Float tool
         float_tool = tools["compare_float"]
@@ -471,7 +471,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_boolean_alias_preprocessing(self):
         """Test boolean extraction in compare_boolean tool."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         bool_tool = tools["compare_boolean"]

         assert isinstance(bool_tool, FunctionTool)
@@ -485,7 +485,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_list_alias_preprocessing(self):
         """Test list extraction in compare_list tool."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         list_tool = tools["compare_list"]

         assert isinstance(list_tool, FunctionTool)
@@ -499,7 +499,7 @@ class TestAliasPreprocessing:
     @pytest.mark.asyncio
     async def test_complex_llm_output(self):
         """Test extraction from complex LLM outputs with reasoning."""
-        tools = {t.name: t for t in
+        tools = {t.name: t for t in comparator._tool_manager._tools.values()}
         json_tool = tools["compare_json"]

         llm_output = """
```
hud/native/tests/test_native_init.py
CHANGED
```diff
@@ -8,12 +8,12 @@ class TestNativeInit:

     def test_comparator_server_import(self):
         """Test that comparator server can be imported."""
-        from hud.native.comparator import
+        from hud.native.comparator import comparator
         from hud.server import MCPServer

         # Verify comparator is an MCPServer instance
-        assert isinstance(
-        assert
+        assert isinstance(comparator, MCPServer)
+        assert comparator.name == "comparator"

     def test_all_exports(self):
         """Test that __all__ is properly defined."""
@@ -31,17 +31,17 @@ class TestNativeInit:

     def test_comparator_tools_registered(self):
         """Test that comparator server has tools registered."""
-        from hud.native.comparator import
+        from hud.native.comparator import comparator

         # The server should have tools registered
         # We can check that the tool manager has tools
-        tool_names = [t.name for t in
+        tool_names = [t.name for t in comparator._tool_manager._tools.values()]

         # Should have the main compare tool
         assert "compare" in tool_names

         # Should have the submit tool
-        assert "
+        assert "response" in tool_names

         # Should have all the alias tools
         expected_aliases = [
@@ -64,16 +64,18 @@ class TestNativeInit:

     def test_comparator_tool_functionality(self):
         """Test that we can get the CompareTool from the comparator."""
-        from hud.native.comparator import
-        from hud.tools import BaseTool
+        from hud.native.comparator import comparator

         # Get the compare tool
         compare_tool = None
-        for tool in
+        for tool in comparator._tool_manager._tools.values():
             if tool.name == "compare":
                 compare_tool = tool
                 break

         assert compare_tool is not None
-
-        assert hasattr(compare_tool, "
+        # FastMCP wraps tools as FunctionTool instances
+        assert hasattr(compare_tool, "name")
+        assert compare_tool.name == "compare"
+        # FunctionTool has a 'fn' attribute for the callable
+        assert hasattr(compare_tool, "fn") or hasattr(compare_tool, "__call__")
```
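The hunks above lean on FastMCP's internal tool registry rather than a public listing API. As a rough illustration (not part of the diff), this is the lookup pattern the updated tests rely on; `comparator` and the `compare`/`response` tool names come from the hunks above, and `_tool_manager._tools` is the private FastMCP attribute the tests themselves reach into:

```python
# Sketch of the pattern used by the updated tests (illustrative, not from the diff).
from hud.native.comparator import comparator

# Enumerate every tool registered on the MCPServer via FastMCP's private registry.
tool_names = [t.name for t in comparator._tool_manager._tools.values()]

assert "compare" in tool_names    # main comparison tool
assert "response" in tool_names   # submit/response tool

# The same mapping the preprocessing tests build to grab individual alias tools.
tools = {t.name: t for t in comparator._tool_manager._tools.values()}
json_tool = tools["compare_json"]
```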
hud/otel/config.py
CHANGED
```diff
@@ -111,7 +111,7 @@ def configure_telemetry(
         # Error if no exporters are configured
         raise ValueError(
             "No telemetry backend configured. Either:\n"
-            "1. Set HUD_API_KEY environment variable for HUD telemetry\n"
+            "1. Set HUD_API_KEY environment variable for HUD telemetry (https://app.hud.so)\n"
             "2. Use enable_otlp=True with configure_telemetry() for alternative backends (e.g., Jaeger)\n"  # noqa: E501
         )
     elif not settings.telemetry_enabled:
```
hud/otel/instrumentation.py
CHANGED
```diff
@@ -55,6 +55,9 @@ def _patch_mcp_instrumentation() -> None:
     try:
         from opentelemetry.instrumentation.mcp.instrumentation import McpInstrumentor

+        # First, patch the get_error_type function to handle invalid HTTP status codes
+        _patch_get_error_type()
+
         def patched_transport_wrapper(self: Any, tracer: Any) -> Callable[..., Any]:
             @asynccontextmanager
             async def traced_method(
@@ -98,3 +101,35 @@ def _patch_mcp_instrumentation() -> None:

         logger = logging.getLogger(__name__)
         logger.warning("Failed to patch MCP instrumentation: %s", e)
+
+
+def _patch_get_error_type() -> None:
+    """Patch get_error_type to handle invalid HTTP status codes gracefully."""
+    import re
+    from http import HTTPStatus
+
+    try:
+        import opentelemetry.instrumentation.mcp.instrumentation as mcp_inst
+
+        def patched_get_error_type(error_message: str) -> str | None:
+            """Extract HTTP status from error message, handling invalid codes."""
+            if not isinstance(error_message, str):
+                return None
+            match = re.search(r"\b(4\d{2}|5\d{2})\b", error_message)
+            if match:
+                num = int(match.group())
+                try:
+                    # Only return if it's a valid HTTPStatus
+                    if 400 <= num <= 599:
+                        return HTTPStatus(num).name
+                except ValueError:
+                    # Not a valid HTTP status code
+                    logger.debug("Ignoring invalid HTTP status code: %s", num)
+            return None
+
+        # Apply the patch
+        mcp_inst.get_error_type = patched_get_error_type
+        logger.debug("Patched get_error_type to handle invalid HTTP status codes")
+
+    except Exception as e:
+        logger.warning("Failed to patch get_error_type: %s", e)
```
hud/rl/README.md
ADDED
````diff
@@ -0,0 +1,30 @@
+We suggest running hud rl (or with the --local flag) for optimal hyperparameters and native HuggingFace running.
+
+However, to run this independently, spin up an instance with at least 2 GPUs and run:
+```bash
+sudo apt-get update -y && sudo apt-get install -y cuda-toolkit-12-6
+uv pip install -e .[rl]
+uv pip install ninja
+uv pip install flash-attn --no-build-isolation
+```
+
+Launch a vLLM server with:
+```bash
+export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
+export TOKENIZERS_PARALLELISM=false
+export VLLM_LOGGING_LEVEL=INFO
+export CUDA_VISIBLE_DEVICES=7 # Set this to your last GPU
+
+uv run vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
+--api-key token-abc123 --host 0.0.0.0 --port 8000 --tensor-parallel-size 1 --trust-remote-code \
+--max-model-len 16384 --enable-lora --max-lora-rank 64 --max-cpu-loras 4 --enable-auto-tool-choice \
+--tool-call-parser hermes --disable-log-requests --dtype auto
+```
+
+And training with (replace 2 with your spare GPUs):
+```bash
+hud get hud-evals/2048-basic
+torchrun --nproc-per-node 2 -m hud.rl.train --tasks 2048-basic.json --verbose
+```
+
+Add a `--config path/to/config.json` flag to run a specific configuration (or change the defaults in config.py)
````
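The defaults referenced here live in `hud/rl/config.py`, which is new in this release. As a hedged sketch, the same knobs can also be adjusted in code; the field names below are taken from `hud/rl/actor.py` further down, while the exact JSON layout accepted by `--config` is not shown in this diff:

```python
# Sketch only: overriding a few training defaults in code rather than via --config.
# Field names come from hud/rl/actor.py below; the full schema (and the JSON layout
# accepted by --config) lives in hud/rl/config.py, which is not shown in this diff.
from hud.rl.config import Config

config = Config()
config.actor.max_parallel_episodes = 1   # episodes collected concurrently per batch
config.actor.max_steps_per_episode = 6   # agent steps before an episode is cut off
config.actor.episode_timeout_sec = 120   # per-episode wall-clock timeout (seconds)
config.verbose = True
```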
hud/rl/__init__.py
ADDED
```diff
@@ -0,0 +1 @@
+"""RL module for HUD."""
```
hud/rl/actor.py
ADDED
```diff
@@ -0,0 +1,174 @@
+"""Actor for episode collection using vLLM and HUD."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+
+import httpx
+from openai import AsyncOpenAI
+
+import hud
+from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
+from hud.clients.utils.retry_transport import create_retry_httpx_client
+from hud.types import Task, Trace
+from hud.utils.hud_console import HUDConsole
+
+from .config import Config
+
+logger = logging.getLogger(__name__)
+hud_console = HUDConsole(logger)
+
+
+class Actor:
+    """Collects episodes using vLLM-served models via HUD agents."""
+
+    def __init__(self, config: Config) -> None:
+        self.config = config
+        self.actor_config = config.actor
+        self.current_adapter = config.model.base_model
+
+        # Setup OpenAI client for vLLM
+        base_url = self.actor_config.vllm_base_url.replace("localhost", "127.0.0.1")
+        self.openai_client = self._create_openai_client(base_url)
+
+    def _create_openai_client(self, base_url: str) -> AsyncOpenAI:
+        """Create OpenAI client with optimized settings for vLLM."""
+        # Match connection limits to parallel_episodes to avoid bottlenecks
+        # Use shorter per-request timeout and keep retries modest to avoid long blocking
+        http_client = create_retry_httpx_client(
+            timeout=httpx.Timeout(30.0),
+        )
+        return AsyncOpenAI(
+            base_url=base_url,
+            api_key=self.actor_config.vllm_api_key,
+            http_client=http_client,
+            max_retries=2,
+        )
+
+    def create_agent(self) -> GenericOpenAIChatAgent:
+        """Create an agent with the current adapter."""
+        return GenericOpenAIChatAgent(
+            openai_client=self.openai_client,
+            model_name=self.current_adapter,
+            allowed_tools=self.actor_config.allowed_tools,
+            append_setup_output=False,
+            system_prompt=self.actor_config.system_prompt,
+            verbose=self.config.verbose,
+            completion_kwargs={
+                "temperature": self.actor_config.temperature,
+                "max_tokens": self.actor_config.max_new_tokens,
+                "tool_choice": "required" if self.actor_config.force_tool_choice else "auto",
+            },
+        )
+
+    def update_adapter(self, adapter_name: str) -> None:
+        """Update the current adapter being used."""
+        self.current_adapter = adapter_name
+        hud_console.info(f"[Actor] Using adapter: {adapter_name}")
+
+    async def run_tasks(self, tasks: list[Task], job_id: str) -> list[Trace]:
+        """Run tasks and collect traces."""
+        traces = []
+
+        # Process tasks in batches respecting max_parallel_episodes limit
+        for batch_start in range(0, len(tasks), self.actor_config.max_parallel_episodes):
+            batch_end = min(batch_start + self.actor_config.max_parallel_episodes, len(tasks))
+            batch = tasks[batch_start:batch_end]
+
+            # Run batch in parallel with per-episode timeout protection
+            async def run_with_timeout(t: Task) -> Trace:
+                try:
+                    return await asyncio.wait_for(
+                        self._run_task(t, job_id),
+                        timeout=self.actor_config.episode_timeout_sec,
+                    )
+                except TimeoutError:
+                    hud_console.warning_log(f"Episode timed out for task {t.id}")
+                    return Trace(isError=True, content="Episode timeout")
+
+            results = await asyncio.gather(
+                *[run_with_timeout(t) for t in batch],
+                return_exceptions=True,
+            )
+
+            # Normalize exceptions to error traces
+            for res in results:
+                if isinstance(res, Exception):
+                    hud_console.warning_log(f"Episode error: {res}")
+                    traces.append(Trace(isError=True, content=str(res)))
+                else:
+                    traces.append(res)
+
+        return traces
+
+    async def _run_task(self, task: Task, job_id: str) -> Trace:
+        """Run a single task."""
+        agent = self.create_agent()
+
+        # Run the task
+        try:
+            with hud.trace(f"Training | {task.id}", job_id=job_id):
+                result = await agent.run(task, max_steps=self.actor_config.max_steps_per_episode)
+
+        except Exception:
+            logger.info("GOT EXCEPTION")
+            return Trace(isError=True)
+
+        result.info["tool_spec"] = agent.get_tool_schemas()
+
+        return result
+
+
+if __name__ == "__main__":
+    from hud.types import Task
+
+    async def test_actor() -> None:
+        """Test the actor with a single 2048 task using local hud-browser image."""
+        config = Config()
+        config.actor.max_parallel_episodes = 1
+        config.actor.max_steps_per_episode = 6
+        config.actor.episode_timeout_sec = 120
+        config.verbose = True
+
+        # Create test task with local hud-browser image
+        task_data = {
+            "id": "test_2048_128",
+            "prompt": "Play the browser-based 2048 game and try to reach the 128 tile. Start by taking a screenshot, then make strategic moves using arrow keys.",  # noqa: E501
+            "mcp_config": {
+                "local": {
+                    "command": "sh",
+                    "args": [
+                        "-c",
+                        "docker run --rm --platform linux/amd64 -i hud-browser:latest 2>/dev/null",
+                    ],
+                }
+            },
+            "setup_tool": {"name": "launch_app", "arguments": {"app_name": "2048"}},
+            "evaluate_tool": {
+                "name": "evaluate",
+                "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
+            },
+            "system_prompt": "You are an expert 2048 game player. Use arrow keys to reach the target tile. First take a screenshot, then make strategic moves.",  # noqa: E501
+        }
+
+        task = Task(**task_data)
+        actor = Actor(config)
+
+        logger.info("Testing actor with task: %s", task.id)
+        logger.info("Model: %s", config.model.base_model)
+        logger.info("VLLM: %s", config.actor.vllm_base_url)
+
+        traces = await actor.run_tasks([task], job_id="test_2048")
+
+        for trace in traces:
+            if trace.isError:
+                logger.info("Error: %s", trace.content)
+            else:
+                logger.info("Success!")
+                logger.info("Trace info: %s", trace.info if hasattr(trace, "info") else "No info")
+                # Check for evaluation in the trace info
+                if hasattr(trace, "info") and "evaluation" in trace.info:
+                    logger.info("  Evaluation: %s", trace.info["evaluation"])
+
+    asyncio.run(test_actor())
```