hud-python 0.4.52__py3-none-any.whl → 0.4.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python has been flagged as potentially problematic; consult the registry's advisory page for details.

Files changed (70):
  1. hud/agents/base.py +9 -2
  2. hud/agents/openai_chat_generic.py +15 -3
  3. hud/agents/tests/test_base.py +15 -0
  4. hud/agents/tests/test_base_runtime.py +164 -0
  5. hud/cli/__init__.py +20 -12
  6. hud/cli/build.py +35 -27
  7. hud/cli/dev.py +13 -31
  8. hud/cli/eval.py +85 -84
  9. hud/cli/tests/test_analyze_module.py +120 -0
  10. hud/cli/tests/test_build.py +24 -2
  11. hud/cli/tests/test_build_failure.py +41 -0
  12. hud/cli/tests/test_build_module.py +50 -0
  13. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  14. hud/cli/tests/test_cli_root.py +134 -0
  15. hud/cli/tests/test_eval.py +6 -6
  16. hud/cli/tests/test_mcp_server.py +8 -7
  17. hud/cli/tests/test_push_happy.py +74 -0
  18. hud/cli/tests/test_push_wrapper.py +23 -0
  19. hud/cli/utils/docker.py +120 -1
  20. hud/cli/utils/runner.py +1 -1
  21. hud/cli/utils/tests/__init__.py +0 -0
  22. hud/cli/utils/tests/test_config.py +58 -0
  23. hud/cli/utils/tests/test_docker.py +93 -0
  24. hud/cli/utils/tests/test_docker_hints.py +71 -0
  25. hud/cli/utils/tests/test_env_check.py +74 -0
  26. hud/cli/utils/tests/test_environment.py +42 -0
  27. hud/cli/utils/tests/test_interactive_module.py +60 -0
  28. hud/cli/utils/tests/test_local_runner.py +50 -0
  29. hud/cli/utils/tests/test_logging_utils.py +23 -0
  30. hud/cli/utils/tests/test_metadata.py +49 -0
  31. hud/cli/utils/tests/test_package_runner.py +35 -0
  32. hud/cli/utils/tests/test_registry_utils.py +49 -0
  33. hud/cli/utils/tests/test_remote_runner.py +25 -0
  34. hud/cli/utils/tests/test_runner_modules.py +52 -0
  35. hud/cli/utils/tests/test_source_hash.py +36 -0
  36. hud/cli/utils/tests/test_tasks.py +80 -0
  37. hud/cli/utils/version_check.py +2 -2
  38. hud/datasets/tests/__init__.py +0 -0
  39. hud/datasets/tests/test_runner.py +106 -0
  40. hud/datasets/tests/test_utils.py +228 -0
  41. hud/otel/tests/__init__.py +0 -1
  42. hud/otel/tests/test_instrumentation.py +207 -0
  43. hud/server/tests/test_server_extra.py +2 -0
  44. hud/shared/exceptions.py +35 -4
  45. hud/shared/hints.py +25 -0
  46. hud/shared/requests.py +15 -3
  47. hud/shared/tests/test_exceptions.py +31 -23
  48. hud/shared/tests/test_hints.py +167 -0
  49. hud/telemetry/tests/test_async_context.py +242 -0
  50. hud/telemetry/tests/test_instrument.py +414 -0
  51. hud/telemetry/tests/test_job.py +609 -0
  52. hud/telemetry/tests/test_trace.py +183 -5
  53. hud/tools/computer/settings.py +2 -2
  54. hud/tools/tests/test_submit.py +85 -0
  55. hud/tools/tests/test_types.py +193 -0
  56. hud/types.py +17 -1
  57. hud/utils/agent_factories.py +1 -3
  58. hud/utils/mcp.py +1 -1
  59. hud/utils/tests/test_agent_factories.py +60 -0
  60. hud/utils/tests/test_mcp.py +4 -6
  61. hud/utils/tests/test_pretty_errors.py +186 -0
  62. hud/utils/tests/test_tasks.py +187 -0
  63. hud/utils/tests/test_tool_shorthand.py +154 -0
  64. hud/utils/tests/test_version.py +1 -1
  65. hud/version.py +1 -1
  66. {hud_python-0.4.52.dist-info → hud_python-0.4.54.dist-info}/METADATA +49 -49
  67. {hud_python-0.4.52.dist-info → hud_python-0.4.54.dist-info}/RECORD +70 -32
  68. {hud_python-0.4.52.dist-info → hud_python-0.4.54.dist-info}/WHEEL +0 -0
  69. {hud_python-0.4.52.dist-info → hud_python-0.4.54.dist-info}/entry_points.txt +0 -0
  70. {hud_python-0.4.52.dist-info → hud_python-0.4.54.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,186 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from unittest.mock import MagicMock, patch
5
+
6
+ from hud.utils.pretty_errors import (
7
+ _async_exception_handler,
8
+ _render_and_fallback,
9
+ install_pretty_errors,
10
+ )
11
+
12
+
13
+ def test_render_and_fallback_hud_exception():
14
+ """Test _render_and_fallback with HudException."""
15
+ from hud.shared.exceptions import HudException
16
+
17
+ exc = HudException("Test error")
18
+
19
+ with (
20
+ patch("sys.__excepthook__") as mock_excepthook,
21
+ patch("hud.utils.pretty_errors.hud_console") as mock_console,
22
+ patch("sys.stderr.flush"),
23
+ ):
24
+ _render_and_fallback(HudException, exc, None)
25
+
26
+ mock_excepthook.assert_called_once()
27
+ mock_console.render_exception.assert_called_once_with(exc)
28
+
29
+
30
+ def test_render_and_fallback_non_hud_exception():
31
+ """Test _render_and_fallback with non-HudException."""
32
+ exc = ValueError("Test error")
33
+
34
+ with (
35
+ patch("sys.__excepthook__") as mock_excepthook,
36
+ patch("hud.utils.pretty_errors.hud_console") as mock_console,
37
+ ):
38
+ _render_and_fallback(ValueError, exc, None)
39
+
40
+ mock_excepthook.assert_called_once()
41
+ # Should not render for non-HudException
42
+ mock_console.render_exception.assert_not_called()
43
+
44
+
45
+ def test_render_and_fallback_rendering_error():
46
+ """Test _render_and_fallback handles rendering errors gracefully."""
47
+ from hud.shared.exceptions import HudException
48
+
49
+ exc = HudException("Test error")
50
+
51
+ with (
52
+ patch("sys.__excepthook__") as mock_excepthook,
53
+ patch("hud.utils.pretty_errors.hud_console") as mock_console,
54
+ ):
55
+ mock_console.render_exception.side_effect = Exception("Render failed")
56
+
57
+ # Should not raise
58
+ _render_and_fallback(HudException, exc, None)
59
+
60
+ mock_excepthook.assert_called_once()
61
+
62
+
63
+ def test_async_exception_handler_with_exception():
64
+ """Test _async_exception_handler with exception in context."""
65
+ mock_loop = MagicMock()
66
+ context = {"exception": ValueError("Test error")}
67
+
68
+ with patch("hud.utils.pretty_errors.hud_console") as mock_console:
69
+ _async_exception_handler(mock_loop, context)
70
+
71
+ mock_console.render_exception.assert_called_once()
72
+ mock_loop.default_exception_handler.assert_called_once_with(context)
73
+
74
+
75
+ def test_async_exception_handler_with_message():
76
+ """Test _async_exception_handler with message only."""
77
+ mock_loop = MagicMock()
78
+ context = {"message": "Error message"}
79
+
80
+ with patch("hud.utils.pretty_errors.hud_console") as mock_console:
81
+ _async_exception_handler(mock_loop, context)
82
+
83
+ mock_console.error.assert_called_once_with("Error message")
84
+ mock_console.render_support_hint.assert_called_once()
85
+ mock_loop.default_exception_handler.assert_called_once()
86
+
87
+
88
+ def test_async_exception_handler_rendering_error():
89
+ """Test _async_exception_handler handles rendering errors."""
90
+ mock_loop = MagicMock()
91
+ context = {"exception": ValueError("Test")}
92
+
93
+ with patch("hud.utils.pretty_errors.hud_console") as mock_console:
94
+ mock_console.render_exception.side_effect = Exception("Render failed")
95
+
96
+ # Should not raise, should call default handler
97
+ _async_exception_handler(mock_loop, context)
98
+
99
+ mock_loop.default_exception_handler.assert_called_once()
100
+
101
+
102
+ def test_install_pretty_errors_with_running_loop():
103
+ """Test install_pretty_errors with a running event loop."""
104
+ mock_loop = MagicMock()
105
+
106
+ with patch("asyncio.get_running_loop", return_value=mock_loop):
107
+ install_pretty_errors()
108
+
109
+ assert sys.excepthook == _render_and_fallback
110
+ mock_loop.set_exception_handler.assert_called_once_with(_async_exception_handler)
111
+
112
+
113
+ def test_install_pretty_errors_no_running_loop():
114
+ """Test install_pretty_errors without a running loop."""
115
+ with (
116
+ patch("asyncio.get_running_loop", side_effect=RuntimeError("No running loop")),
117
+ patch("asyncio.new_event_loop") as mock_new_loop,
118
+ ):
119
+ mock_loop = MagicMock()
120
+ mock_new_loop.return_value = mock_loop
121
+
122
+ install_pretty_errors()
123
+
124
+ assert sys.excepthook == _render_and_fallback
125
+ mock_loop.set_exception_handler.assert_called_once()
126
+
127
+
128
+ def test_install_pretty_errors_new_loop_fails():
129
+ """Test install_pretty_errors when creating new loop fails."""
130
+ with (
131
+ patch("asyncio.get_running_loop", side_effect=RuntimeError("No running loop")),
132
+ patch("asyncio.new_event_loop", side_effect=Exception("Can't create loop")),
133
+ ):
134
+ # Should not raise
135
+ install_pretty_errors()
136
+
137
+ assert sys.excepthook == _render_and_fallback
138
+
139
+
140
+ def test_install_pretty_errors_set_handler_fails():
141
+ """Test install_pretty_errors when set_exception_handler fails."""
142
+ mock_loop = MagicMock()
143
+ mock_loop.set_exception_handler.side_effect = Exception("Can't set handler")
144
+
145
+ with patch("asyncio.get_running_loop", return_value=mock_loop):
146
+ # Should not raise
147
+ install_pretty_errors()
148
+
149
+ assert sys.excepthook == _render_and_fallback
150
+
151
+
152
+ def test_async_exception_handler_no_exception_or_message():
153
+ """Test _async_exception_handler with empty context."""
154
+ mock_loop = MagicMock()
155
+ context = {}
156
+
157
+ with patch("hud.utils.pretty_errors.hud_console") as mock_console:
158
+ _async_exception_handler(mock_loop, context)
159
+
160
+ mock_console.render_exception.assert_not_called()
161
+ mock_console.error.assert_not_called()
162
+ mock_loop.default_exception_handler.assert_called_once()
163
+
164
+
165
+ def test_render_and_fallback_with_traceback():
166
+ """Test _render_and_fallback includes traceback."""
167
+ from hud.shared.exceptions import HudException
168
+
169
+ exc = HudException("Test error")
170
+
171
+ # Create a fake traceback
172
+ try:
173
+ raise exc
174
+ except HudException as e:
175
+ tb = e.__traceback__
176
+
177
+ with (
178
+ patch("sys.__excepthook__") as mock_excepthook,
179
+ patch("hud.utils.pretty_errors.hud_console"),
180
+ patch("sys.stderr.flush"),
181
+ ):
182
+ _render_and_fallback(HudException, exc, tb)
183
+
184
+ # Should call excepthook with traceback
185
+ call_args = mock_excepthook.call_args[0]
186
+ assert call_args[2] == tb
@@ -0,0 +1,187 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from hud.types import Task
10
+ from hud.utils.tasks import load_tasks
11
+
12
+
13
+ def test_load_tasks_from_list():
14
+ """Test loading tasks from a list of dictionaries."""
15
+ task_dicts = [
16
+ {"id": "1", "prompt": "Test task 1", "mcp_config": {}},
17
+ {"id": "2", "prompt": "Test task 2", "mcp_config": {}},
18
+ ]
19
+
20
+ tasks = load_tasks(task_dicts)
21
+
22
+ assert len(tasks) == 2
23
+ assert all(isinstance(t, Task) for t in tasks)
24
+ assert tasks[0].prompt == "Test task 1" # type: ignore
25
+ assert tasks[1].prompt == "Test task 2" # type: ignore
26
+
27
+
28
+ def test_load_tasks_from_list_raw():
29
+ """Test loading tasks from a list in raw mode."""
30
+ task_dicts = [
31
+ {"id": "1", "prompt": "Test task 1", "mcp_config": {}},
32
+ {"id": "2", "prompt": "Test task 2", "mcp_config": {}},
33
+ ]
34
+
35
+ tasks = load_tasks(task_dicts, raw=True)
36
+
37
+ assert len(tasks) == 2
38
+ assert all(isinstance(t, dict) for t in tasks)
39
+ assert tasks[0]["prompt"] == "Test task 1" # type: ignore
40
+
41
+
42
+ def test_load_tasks_from_json_file():
43
+ """Test loading tasks from a JSON file."""
44
+ task_dicts = [
45
+ {"id": "1", "prompt": "Test task 1", "mcp_config": {}},
46
+ {"id": "2", "prompt": "Test task 2", "mcp_config": {}},
47
+ ]
48
+
49
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
50
+ json.dump(task_dicts, f)
51
+ temp_path = f.name
52
+
53
+ try:
54
+ tasks = load_tasks(temp_path)
55
+
56
+ assert len(tasks) == 2
57
+ assert all(isinstance(t, Task) for t in tasks)
58
+ assert tasks[0].prompt == "Test task 1" # type: ignore
59
+ finally:
60
+ Path(temp_path).unlink()
61
+
62
+
63
+ def test_load_tasks_from_json_file_raw():
64
+ """Test loading tasks from a JSON file in raw mode."""
65
+ task_dicts = [
66
+ {"id": "1", "prompt": "Test task 1", "mcp_config": {}},
67
+ {"id": "2", "prompt": "Test task 2", "mcp_config": {}},
68
+ ]
69
+
70
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
71
+ json.dump(task_dicts, f)
72
+ temp_path = f.name
73
+
74
+ try:
75
+ tasks = load_tasks(temp_path, raw=True)
76
+
77
+ assert len(tasks) == 2
78
+ assert all(isinstance(t, dict) for t in tasks)
79
+ finally:
80
+ Path(temp_path).unlink()
81
+
82
+
83
+ def test_load_tasks_from_jsonl_file():
84
+ """Test loading tasks from a JSONL file."""
85
+ task_dicts = [
86
+ {"id": "1", "prompt": "Test task 1", "mcp_config": {}},
87
+ {"id": "2", "prompt": "Test task 2", "mcp_config": {}},
88
+ ]
89
+
90
+ with tempfile.NamedTemporaryFile(
91
+ mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
92
+ ) as f:
93
+ for task_dict in task_dicts:
94
+ f.write(json.dumps(task_dict) + "\n")
95
+ temp_path = f.name
96
+
97
+ try:
98
+ tasks = load_tasks(temp_path)
99
+
100
+ assert len(tasks) == 2
101
+ assert all(isinstance(t, Task) for t in tasks)
102
+ assert tasks[0].prompt == "Test task 1" # type: ignore
103
+ finally:
104
+ Path(temp_path).unlink()
105
+
106
+
107
+ def test_load_tasks_from_jsonl_file_with_empty_lines():
108
+ """Test loading tasks from a JSONL file with empty lines."""
109
+ task_dicts = [
110
+ {"id": "1", "prompt": "Test task 1", "mcp_config": {}},
111
+ {"id": "2", "prompt": "Test task 2", "mcp_config": {}},
112
+ ]
113
+
114
+ with tempfile.NamedTemporaryFile(
115
+ mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
116
+ ) as f:
117
+ f.write(json.dumps(task_dicts[0]) + "\n")
118
+ f.write("\n") # Empty line
119
+ f.write(json.dumps(task_dicts[1]) + "\n")
120
+ temp_path = f.name
121
+
122
+ try:
123
+ tasks = load_tasks(temp_path)
124
+
125
+ assert len(tasks) == 2
126
+ assert all(isinstance(t, Task) for t in tasks)
127
+ finally:
128
+ Path(temp_path).unlink()
129
+
130
+
131
+ def test_load_tasks_from_jsonl_file_with_list():
132
+ """Test loading tasks from a JSONL file where a line contains a list."""
133
+ task_dict = {"id": "1", "prompt": "Test task 1", "mcp_config": {}}
134
+
135
+ with tempfile.NamedTemporaryFile(
136
+ mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
137
+ ) as f:
138
+ f.write(json.dumps([task_dict, task_dict]) + "\n")
139
+ temp_path = f.name
140
+
141
+ try:
142
+ tasks = load_tasks(temp_path)
143
+
144
+ assert len(tasks) == 2
145
+ assert all(isinstance(t, Task) for t in tasks)
146
+ finally:
147
+ Path(temp_path).unlink()
148
+
149
+
150
+ def test_load_tasks_json_not_array_error():
151
+ """Test that loading from JSON file with non-array raises error."""
152
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
153
+ json.dump({"not": "an array"}, f)
154
+ temp_path = f.name
155
+
156
+ try:
157
+ with pytest.raises(ValueError, match="JSON file must contain an array"):
158
+ load_tasks(temp_path)
159
+ finally:
160
+ Path(temp_path).unlink()
161
+
162
+
163
+ def test_load_tasks_invalid_jsonl_format():
164
+ """Test that loading from JSONL with invalid format raises error."""
165
+ with tempfile.NamedTemporaryFile(
166
+ mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
167
+ ) as f:
168
+ f.write(json.dumps("invalid") + "\n")
169
+ temp_path = f.name
170
+
171
+ try:
172
+ with pytest.raises(ValueError, match="Invalid JSONL format"):
173
+ load_tasks(temp_path)
174
+ finally:
175
+ Path(temp_path).unlink()
176
+
177
+
178
+ def test_load_tasks_invalid_input_type():
179
+ """Test that invalid input type raises TypeError."""
180
+ with pytest.raises(TypeError, match="tasks_input must be str or list"):
181
+ load_tasks(123) # type: ignore
182
+
183
+
184
+ def test_load_tasks_nonexistent_file():
185
+ """Test that loading from nonexistent file raises error."""
186
+ with pytest.raises(ValueError, match="neither a file path nor a HuggingFace dataset"):
187
+ load_tasks("nonexistent_file_without_slash")
@@ -0,0 +1,154 @@
1
+ from __future__ import annotations
2
+
3
+ from hud.utils.tool_shorthand import (
4
+ _is_call_like,
5
+ _to_call_dict,
6
+ normalize_to_tool_call_dict,
7
+ )
8
+
9
+
10
+ def test_is_call_like_with_name_and_arguments():
11
+ """Test _is_call_like with name and arguments keys."""
12
+ obj = {"name": "test_tool", "arguments": {"key": "value"}}
13
+ assert _is_call_like(obj) is True
14
+
15
+
16
+ def test_is_call_like_with_single_key_dict_value():
17
+ """Test _is_call_like with single key dict containing dict value."""
18
+ obj = {"tool": {"name": "test"}}
19
+ assert _is_call_like(obj) is True
20
+
21
+
22
+ def test_is_call_like_with_nested_single_key():
23
+ """Test _is_call_like with nested single key dict."""
24
+ obj = {"tool": {"inner": {"key": "value"}}}
25
+ assert _is_call_like(obj) is True
26
+
27
+
28
+ def test_is_call_like_not_dict():
29
+ """Test _is_call_like returns False for non-dict."""
30
+ assert _is_call_like("string") is False
31
+ assert _is_call_like(123) is False
32
+ assert _is_call_like(None) is False
33
+ assert _is_call_like([]) is False
34
+
35
+
36
+ def test_is_call_like_empty_dict():
37
+ """Test _is_call_like returns False for empty dict."""
38
+ assert _is_call_like({}) is False
39
+
40
+
41
+ def test_is_call_like_multi_key_dict():
42
+ """Test _is_call_like returns False for multi-key dict without name/arguments."""
43
+ obj = {"key1": "value1", "key2": "value2"}
44
+ assert _is_call_like(obj) is False
45
+
46
+
47
+ def test_to_call_dict_with_name_arguments():
48
+ """Test _to_call_dict preserves name and arguments."""
49
+ obj = {"name": "test_tool", "arguments": {"param": "value"}}
50
+ result = _to_call_dict(obj)
51
+ assert result == {"name": "test_tool", "arguments": {"param": "value"}}
52
+
53
+
54
+ def test_to_call_dict_with_nested_call():
55
+ """Test _to_call_dict with nested call-like arguments."""
56
+ obj = {"name": "outer", "arguments": {"name": "inner", "arguments": {"x": 1}}}
57
+ result = _to_call_dict(obj)
58
+ assert result == {"name": "outer", "arguments": {"name": "inner", "arguments": {"x": 1}}}
59
+
60
+
61
+ def test_to_call_dict_shorthand_single_key():
62
+ """Test _to_call_dict converts shorthand single-key dict."""
63
+ obj = {"tool_name": {"name": "inner", "arguments": {}}}
64
+ result = _to_call_dict(obj)
65
+ assert result == {"name": "tool_name", "arguments": {"name": "inner", "arguments": {}}}
66
+
67
+
68
+ def test_to_call_dict_non_call_arguments():
69
+ """Test _to_call_dict with non-call-like arguments."""
70
+ obj = {"name": "test", "arguments": {"simple": "value"}}
71
+ result = _to_call_dict(obj)
72
+ assert result == {"name": "test", "arguments": {"simple": "value"}}
73
+
74
+
75
+ def test_to_call_dict_non_dict():
76
+ """Test _to_call_dict returns non-dict unchanged."""
77
+ assert _to_call_dict("string") == "string"
78
+ assert _to_call_dict(123) == 123
79
+ assert _to_call_dict(None) is None
80
+
81
+
82
+ def test_to_call_dict_single_key_non_call():
83
+ """Test _to_call_dict with single key but non-call value."""
84
+ obj = {"key": "simple_value"}
85
+ result = _to_call_dict(obj)
86
+ assert result == {"key": "simple_value"}
87
+
88
+
89
+ def test_normalize_to_tool_call_dict_none():
90
+ """Test normalize_to_tool_call_dict with None."""
91
+ assert normalize_to_tool_call_dict(None) is None
92
+
93
+
94
+ def test_normalize_to_tool_call_dict_simple_dict():
95
+ """Test normalize_to_tool_call_dict with simple dict."""
96
+ obj = {"name": "tool", "arguments": {"x": 1}}
97
+ result = normalize_to_tool_call_dict(obj)
98
+ assert result == {"name": "tool", "arguments": {"x": 1}}
99
+
100
+
101
+ def test_normalize_to_tool_call_dict_shorthand():
102
+ """Test normalize_to_tool_call_dict with shorthand notation."""
103
+ obj = {"tool_name": {"name": "inner", "arguments": {}}}
104
+ result = normalize_to_tool_call_dict(obj)
105
+ assert result == {"name": "tool_name", "arguments": {"name": "inner", "arguments": {}}}
106
+
107
+
108
+ def test_normalize_to_tool_call_dict_list():
109
+ """Test normalize_to_tool_call_dict with list of dicts."""
110
+ obj = [
111
+ {"name": "tool1", "arguments": {"a": 1}},
112
+ {"name": "tool2", "arguments": {"b": 2}},
113
+ ]
114
+ result = normalize_to_tool_call_dict(obj)
115
+ assert len(result) == 2
116
+ assert result[0] == {"name": "tool1", "arguments": {"a": 1}}
117
+ assert result[1] == {"name": "tool2", "arguments": {"b": 2}}
118
+
119
+
120
+ def test_normalize_to_tool_call_dict_list_shorthand():
121
+ """Test normalize_to_tool_call_dict with list of shorthand dicts."""
122
+ obj = [
123
+ {"tool1": {"name": "inner1", "arguments": {}}},
124
+ {"tool2": {"name": "inner2", "arguments": {}}},
125
+ ]
126
+ result = normalize_to_tool_call_dict(obj)
127
+ assert len(result) == 2
128
+ assert result[0]["name"] == "tool1"
129
+ assert result[1]["name"] == "tool2"
130
+
131
+
132
+ def test_normalize_to_tool_call_dict_non_dict_non_list():
133
+ """Test normalize_to_tool_call_dict with non-dict, non-list value."""
134
+ assert normalize_to_tool_call_dict("string") == "string"
135
+ assert normalize_to_tool_call_dict(123) == 123
136
+
137
+
138
+ def test_normalize_to_tool_call_dict_empty_list():
139
+ """Test normalize_to_tool_call_dict with empty list."""
140
+ assert normalize_to_tool_call_dict([]) == []
141
+
142
+
143
+ def test_normalize_to_tool_call_dict_complex_nested():
144
+ """Test normalize_to_tool_call_dict with complex nested structure."""
145
+ obj = {
146
+ "outer_tool": {
147
+ "name": "middle_tool",
148
+ "arguments": {"name": "inner_tool", "arguments": {"x": 1}},
149
+ }
150
+ }
151
+ result = normalize_to_tool_call_dict(obj)
152
+ assert result["name"] == "outer_tool"
153
+ assert result["arguments"]["name"] == "middle_tool"
154
+ assert result["arguments"]["arguments"]["name"] == "inner_tool"
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.4.52"
8
+ assert hud.__version__ == "0.4.54"
hud/version.py CHANGED
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.4.52"
7
+ __version__ = "0.4.54"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.52
3
+ Version: 0.4.54
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -42,6 +42,7 @@ Requires-Dist: httpx<1,>=0.23.0
42
42
  Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
43
43
  Requires-Dist: hud-mcp-python-sdk>=3.13.2
44
44
  Requires-Dist: hud-mcp-use-python-sdk==2.3.20
45
+ Requires-Dist: langchain==0.3.27
45
46
  Requires-Dist: numpy>=1.24.0
46
47
  Requires-Dist: openai
47
48
  Requires-Dist: opentelemetry-api>=1.34.1
@@ -160,12 +161,12 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
160
161
 
161
162
  ## Highlights
162
163
 
163
- - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
164
164
  - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
165
165
  - ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
166
166
  - 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
167
167
  - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
168
168
  - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
169
+ - 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
169
170
 
170
171
  > We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
171
172
 
@@ -186,29 +187,6 @@ uv tool install hud-python
186
187
  Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
187
188
 
188
189
 
189
- ## Quickstart: Training
190
-
191
- RL using GRPO a Qwen2.5-VL model on any hud dataset:
192
-
193
- ```bash
194
- hud get hud-evals/basic-2048 # from HF
195
- hud rl basic-2048.json
196
- ```
197
-
198
- > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
199
-
200
- Or make your own environment and dataset:
201
-
202
- ```bash
203
- hud init my-env && cd my-env
204
- hud dev --interactive
205
- # When ready to run:
206
- hud rl
207
- ```
208
-
209
- > See [environment design docs](https://docs.hud.so/build-environments)
210
-
211
-
212
190
  ## Quickstart: Evals
213
191
 
214
192
  For a tutorial that explains the agent and evaluation design, run:
@@ -265,38 +243,27 @@ The above example let's the agent play 2048 ([See replay](https://hud.so/trace/6
265
243
 
266
244
  ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
267
245
 
268
- ## Reinforcement Learning with GRPO
269
-
270
- This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
271
-
272
- ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
246
+ ## Quickstart: Training
273
247
 
274
- Train with the new interactive `hud rl` flow:
248
+ RL using GRPO a Qwen2.5-VL model on any hud dataset:
275
249
 
276
250
  ```bash
277
- # Install CLI
278
- uv tool install hud-python
279
-
280
- # Option A: Run directly from a HuggingFace dataset
281
- hud rl hud-evals/basic-2048
282
-
283
- # Option B: Download first, modify, then train
284
- hud get hud-evals/basic-2048
285
- hud rl basic-2048.json
286
-
287
- # Optional: baseline evaluation
288
- hud eval basic-2048.json
251
+ hud get hud-evals/2048-basic # from HF
252
+ hud rl 2048-basic.json
289
253
  ```
290
254
 
291
- Supports multi‑turn RL for both:
292
- - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
293
- - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
255
+ > See [agent training docs](https://docs.hud.so/train-agents/quickstart)
294
256
 
295
- By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
257
+ Or make your own environment and dataset:
296
258
 
297
- Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
259
+ ```bash
260
+ hud init my-env && cd my-env
261
+ hud dev --interactive
262
+ # When ready to run:
263
+ hud rl
264
+ ```
298
265
 
299
- Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
266
+ > See [environment design docs](https://docs.hud.so/build-environments)
300
267
 
301
268
  ## Benchmarking Agents
302
269
 
@@ -460,6 +427,39 @@ We highly suggest running 3-5 evaluations per dataset for the most consistent re
460
427
 
461
428
  Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
462
429
 
430
+ ## Reinforcement Learning with GRPO
431
+
432
+ This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
433
+
434
+ ![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
435
+
436
+ Train with the new interactive `hud rl` flow:
437
+
438
+ ```bash
439
+ # Install CLI
440
+ uv tool install hud-python
441
+
442
+ # Option A: Run directly from a HuggingFace dataset
443
+ hud rl hud-evals/2048-basic
444
+
445
+ # Option B: Download first, modify, then train
446
+ hud get hud-evals/2048-basic
447
+ hud rl 2048-basic.json
448
+
449
+ # Optional: baseline evaluation
450
+ hud eval 2048-basic.json
451
+ ```
452
+
453
+ Supports multi‑turn RL for both:
454
+ - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
455
+ - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
456
+
457
+ By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
458
+
459
+ Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
460
+
461
+ Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
462
+
463
463
  ## Architecture
464
464
 
465
465
  ```mermaid