PyPI - hud-python - Versions diffs - 0.4.35__py3-none-any.whl → 0.4.37__py3-none-any.whl - Mend

hud-python 0.4.35__py3-none-any.whl → 0.4.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (63) hide show

hud/agents/__init__.py +2 -0
hud/agents/lite_llm.py +72 -0
hud/agents/openai_chat_generic.py +21 -7
hud/agents/tests/test_claude.py +32 -7
hud/agents/tests/test_openai.py +29 -6
hud/cli/__init__.py +228 -79
hud/cli/build.py +26 -6
hud/cli/dev.py +21 -40
hud/cli/eval.py +96 -15
hud/cli/flows/tasks.py +198 -65
hud/cli/init.py +222 -629
hud/cli/pull.py +6 -0
hud/cli/push.py +11 -1
hud/cli/rl/__init__.py +14 -4
hud/cli/rl/celebrate.py +187 -0
hud/cli/rl/config.py +15 -8
hud/cli/rl/local_runner.py +44 -20
hud/cli/rl/remote_runner.py +166 -87
hud/cli/rl/viewer.py +141 -0
hud/cli/rl/wait_utils.py +89 -0
hud/cli/tests/test_build.py +3 -27
hud/cli/tests/test_mcp_server.py +1 -12
hud/cli/utils/config.py +85 -0
hud/cli/utils/docker.py +21 -39
hud/cli/utils/env_check.py +196 -0
hud/cli/utils/environment.py +4 -3
hud/cli/utils/interactive.py +2 -1
hud/cli/utils/local_runner.py +204 -0
hud/cli/utils/metadata.py +3 -1
hud/cli/utils/package_runner.py +292 -0
hud/cli/utils/remote_runner.py +4 -1
hud/cli/utils/source_hash.py +108 -0
hud/clients/base.py +1 -1
hud/clients/fastmcp.py +1 -1
hud/clients/mcp_use.py +30 -7
hud/datasets/parallel.py +3 -1
hud/datasets/runner.py +4 -1
hud/otel/config.py +1 -1
hud/otel/context.py +40 -6
hud/rl/buffer.py +3 -0
hud/rl/tests/test_learner.py +1 -1
hud/rl/vllm_adapter.py +1 -1
hud/server/server.py +234 -7
hud/server/tests/test_add_tool.py +60 -0
hud/server/tests/test_context.py +128 -0
hud/server/tests/test_mcp_server_handlers.py +44 -0
hud/server/tests/test_mcp_server_integration.py +405 -0
hud/server/tests/test_mcp_server_more.py +247 -0
hud/server/tests/test_run_wrapper.py +53 -0
hud/server/tests/test_server_extra.py +166 -0
hud/server/tests/test_sigterm_runner.py +78 -0
hud/settings.py +38 -0
hud/shared/hints.py +2 -2
hud/telemetry/job.py +2 -2
hud/types.py +9 -2
hud/utils/tasks.py +32 -24
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/METADATA +43 -23
{hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/RECORD +63 -46
{hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/WHEEL +0 -0
{hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/licenses/LICENSE +0 -0

hud/server/tests/test_server_extra.py ADDED Viewed

@@ -0,0 +1,166 @@
+# filename: hud/server/tests/test_server_extra.py
+from __future__ import annotations
+import asyncio
+from contextlib import asynccontextmanager, suppress
+import anyio
+import pytest
+from hud.server import MCPServer
+from hud.server import server as server_mod
+@asynccontextmanager
+async def _fake_stdio_server():
+    """
+    Stand-in for stdio_server that avoids reading real stdin.
+    It yields a pair of in-memory streams (receive, send) so the low-level server
+    can start and idle without touching sys.stdin/sys.stdout.
+    """
+    send_in, recv_in = anyio.create_memory_object_stream(100)
+    send_out, recv_out = anyio.create_memory_object_stream(100)
+    try:
+        yield recv_in, send_out
+    finally:
+        # best-effort close across anyio versions
+        for s in (send_in, recv_in, send_out, recv_out):
+            close = getattr(s, "close", None) or getattr(s, "aclose", None)
+            try:
+                if close is not None:
+                    res = close()
+                    if asyncio.iscoroutine(res):
+                        await res
+            except Exception:
+                pass
+@pytest.fixture
+def patch_stdio(monkeypatch: pytest.MonkeyPatch):
+    """Patch stdio server to avoid stdin issues during tests."""
+    monkeypatch.setenv("FASTMCP_DISABLE_BANNER", "1")
+    monkeypatch.setattr("mcp.server.stdio.stdio_server", _fake_stdio_server)
+    monkeypatch.setattr("fastmcp.server.server.stdio_server", _fake_stdio_server)
+@pytest.mark.asyncio
+async def test_sigterm_flag_remains_true_without_shutdown_handler(patch_stdio):
+    """
+    When no @mcp.shutdown is registered, neither the lifespan.finally nor run_async.finally
+    should reset the global SIGTERM flag. This exercises the 'no handler' branches.
+    """
+    mcp = MCPServer(name="NoShutdownHandler")
+    task = asyncio.create_task(mcp.run_async(transport="stdio", show_banner=False))
+    try:
+        await asyncio.sleep(0.05)
+        # Simulate SIGTERM path
+        server_mod._sigterm_received = True  # type: ignore[attr-defined]
+    finally:
+        with suppress(asyncio.CancelledError):
+            task.cancel()
+            await task
+    # Flag must remain set since no shutdown handler was installed
+    assert getattr(server_mod, "_sigterm_received") is True
+    # Always reset for other tests
+    server_mod._sigterm_received = False  # type: ignore[attr-defined]
+@pytest.mark.asyncio
+async def test_last_shutdown_handler_wins(patch_stdio):
+    """
+    If multiple @mcp.shutdown decorators are applied, the last one should be the one that runs.
+    """
+    mcp = MCPServer(name="ShutdownOverride")
+    calls: list[str] = []
+    @mcp.shutdown
+    async def _first() -> None:
+        calls.append("first")
+    @mcp.shutdown
+    async def _second() -> None:
+        calls.append("second")
+    task = asyncio.create_task(mcp.run_async(transport="stdio", show_banner=False))
+    try:
+        await asyncio.sleep(0.05)
+        server_mod._sigterm_received = True  # type: ignore[attr-defined]
+    finally:
+        with suppress(asyncio.CancelledError):
+            task.cancel()
+            await task
+    assert calls == ["second"], "Only the last registered shutdown handler should run"
+    server_mod._sigterm_received = False  # type: ignore[attr-defined]
+def test__run_with_sigterm_registers_handlers_when_enabled(monkeypatch: pytest.MonkeyPatch):
+    """
+    Verify that _run_with_sigterm attempts to register SIGTERM/SIGINT handlers
+    when the env var does NOT disable the handler. We stub AnyIO's TaskGroup so
+    the watcher doesn't block and the test returns immediately.
+    """
+    # Ensure handler is enabled
+    monkeypatch.delenv("FASTMCP_DISABLE_SIGTERM_HANDLER", raising=False)
+    # Record what the server tries to register
+    added_signals: list[int] = []
+    import asyncio as _asyncio
+    orig_get_running_loop = _asyncio.get_running_loop
+    def proxy_get_running_loop():
+        real = orig_get_running_loop()
+        class _LoopProxy:
+            __slots__ = ("_inner",)
+            def __init__(self, inner):
+                self._inner = inner
+            def add_signal_handler(self, signum, callback, *args):
+                added_signals.append(signum)  # don't actually install
+                # no-op: skip calling inner.add_signal_handler to avoid OS constraints
+            def __getattr__(self, name):
+                # delegate everything else (create_task, call_soon, etc.)
+                return getattr(self._inner, name)
+        return _LoopProxy(real)
+    # Patch globally so both the test and hud.server.server see the proxy
+    monkeypatch.setattr(_asyncio, "get_running_loop", proxy_get_running_loop)
+    # Dummy TaskGroup that runs the work but skips _watch
+    class _DummyTG:
+        async def __aenter__(self):
+            return self
+        async def __aexit__(self, exc_type, exc, tb):
+            return False
+        def start_soon(self, fn, *args, **kwargs):
+            if getattr(fn, "__name__", "") == "_watch":
+                return
+            _asyncio.get_running_loop().create_task(fn(*args, **kwargs))
+    monkeypatch.setattr("anyio.create_task_group", lambda: _DummyTG())
+    # Simple coroutine that should run to completion
+    hit = {"v": False}
+    async def work():
+        hit["v"] = True
+    server_mod._run_with_sigterm(work)
+    assert hit["v"] is True
+    import signal as _signal
+    assert _signal.SIGTERM in added_signals
+    assert _signal.SIGINT in added_signals

hud/server/tests/test_sigterm_runner.py ADDED Viewed

@@ -0,0 +1,78 @@
+from __future__ import annotations
+import asyncio
+from contextlib import asynccontextmanager, suppress
+import anyio
+import pytest
+from hud.server import MCPServer
+from hud.server import server as server_mod
+def test__run_with_sigterm_executes_coro_when_handler_disabled(monkeypatch: pytest.MonkeyPatch):
+    """With FASTMCP_DISABLE_SIGTERM_HANDLER=1, _run_with_sigterm should just run the task."""
+    monkeypatch.setenv("FASTMCP_DISABLE_SIGTERM_HANDLER", "1")
+    hit = {"v": False}
+    async def work(arg, *, kw=None):
+        assert arg == 123 and kw == "ok"
+        hit["v"] = True
+    # Wrapper to exercise kwargs since TaskGroup.start_soon only accepts positional args
+    async def wrapper(arg):
+        await work(arg, kw="ok")
+    # Should return cleanly and mark hit
+    server_mod._run_with_sigterm(wrapper, 123)
+    assert hit["v"] is True
+@asynccontextmanager
+async def _fake_stdio_server():
+    """Stand-in for stdio_server that avoids reading real stdin."""
+    send_in, recv_in = anyio.create_memory_object_stream(100)
+    send_out, recv_out = anyio.create_memory_object_stream(100)
+    try:
+        yield recv_in, send_out
+    finally:
+        for s in (send_in, recv_in, send_out, recv_out):
+            close = getattr(s, "close", None) or getattr(s, "aclose", None)
+            try:
+                if close is not None:
+                    res = close()
+                    if asyncio.iscoroutine(res):
+                        await res
+            except Exception:
+                pass
+@pytest.fixture
+def patch_stdio(monkeypatch: pytest.MonkeyPatch):
+    """Patch stdio server to avoid stdin issues during tests."""
+    monkeypatch.setenv("FASTMCP_DISABLE_BANNER", "1")
+    monkeypatch.setattr("mcp.server.stdio.stdio_server", _fake_stdio_server)
+    monkeypatch.setattr("fastmcp.server.server.stdio_server", _fake_stdio_server)
+@pytest.mark.asyncio
+async def test_shutdown_handler_exception_does_not_crash_and_resets_flag(patch_stdio):
+    """If @shutdown raises, run_async must swallow it and still reset the SIGTERM flag."""
+    mcp = MCPServer(name="ShutdownRaises")
+    @mcp.shutdown
+    async def _boom() -> None:
+        raise RuntimeError("kaboom")
+    task = asyncio.create_task(mcp.run_async(transport="stdio", show_banner=False))
+    try:
+        await asyncio.sleep(0.05)
+        server_mod._sigterm_received = True  # trigger shutdown path
+    finally:
+        with suppress(asyncio.CancelledError):
+            task.cancel()
+            await task
+    # No exception propagated; flag must be reset
+    assert not getattr(server_mod, "_sigterm_received")

hud/settings.py CHANGED Viewed

@@ -1,7 +1,10 @@
 from __future__ import annotations
+from pathlib import Path
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
+from pydantic_settings.sources import DotEnvSettingsSource, PydanticBaseSettingsSource
 class Settings(BaseSettings):
@@ -14,6 +17,41 @@ class Settings(BaseSettings):
     model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="allow")
+    @classmethod
+    def settings_customise_sources(
+        cls,
+        settings_cls: type[BaseSettings],
+        init_settings: PydanticBaseSettingsSource,
+        env_settings: PydanticBaseSettingsSource,
+        dotenv_settings: PydanticBaseSettingsSource,
+        file_secret_settings: PydanticBaseSettingsSource,
+    ) -> tuple[PydanticBaseSettingsSource, ...]:
+        """
+        Customize settings source precedence to include a user-level env file.
+        Precedence (highest to lowest):
+        - init_settings (explicit kwargs)
+        - env_settings (process environment)
+        - dotenv_settings (project .env)
+        - user_dotenv_settings (~/.hud/.env)  ← added
+        - file_secret_settings
+        """
+        user_env_path = Path.home() / ".hud" / ".env"
+        user_dotenv_settings = DotEnvSettingsSource(
+            settings_cls,
+            env_file=user_env_path,
+            env_file_encoding="utf-8",
+        )
+        return (
+            init_settings,
+            env_settings,
+            dotenv_settings,
+            user_dotenv_settings,
+            file_secret_settings,
+        )
     hud_telemetry_url: str = Field(
         default="https://telemetry.hud.so/v3/api",
         description="Base URL for the HUD API",

hud/shared/hints.py CHANGED Viewed

@@ -37,8 +37,8 @@ HUD_API_KEY_MISSING = Hint(
     title="HUD API key required",
     message="Missing or invalid HUD_API_KEY.",
     tips=[
-        "Set HUD_API_KEY environment variable",
-        "Get a key at https://app.hud.so",
+        "Set HUD_API_KEY in your environment or run: hud set HUD_API_KEY=your-key-here",
+        "Get a key at https://hud.so",
         "Check for whitespace or truncation",
     ],
     docs_url=None,

hud/telemetry/job.py CHANGED Viewed

@@ -143,7 +143,7 @@ def _print_job_url(job_id: str, job_name: str) -> None:
     if not (settings.telemetry_enabled and settings.api_key):
         return
-    url = f"https://app.hud.so/jobs/{job_id}"
+    url = f"https://hud.so/jobs/{job_id}"
     header = f"🚀 Job '{job_name}' started:"
     # ANSI color codes
@@ -182,7 +182,7 @@ def _print_job_complete_url(job_id: str, job_name: str, error_occurred: bool = F
     if not (settings.telemetry_enabled and settings.api_key):
         return
-    url = f"https://app.hud.so/jobs/{job_id}"
+    url = f"https://hud.so/jobs/{job_id}"
     # ANSI color codes
     GREEN = "\033[92m"

hud/types.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import contextlib
 import json
 import logging
 import uuid
@@ -107,7 +108,13 @@ class Task(BaseModel):
         # Start with current environment variables
         mapping = dict(os.environ)
-        mapping.update(settings.model_dump())
+        # Include settings (from process env, project .env, and user .env)
+        settings_dict = settings.model_dump()
+        mapping.update(settings_dict)
+        # Add UPPERCASE aliases for settings keys
+        for _key, _val in settings_dict.items():
+            with contextlib.suppress(Exception):
+                mapping[_key.upper()] = _val
         if settings.api_key:
             mapping["HUD_API_KEY"] = settings.api_key
@@ -208,7 +215,7 @@ class AgentResponse(BaseModel):
     tool_calls: list[MCPToolCall] = Field(default_factory=list)
     done: bool = Field(default=False)
-    # --- TELEMETRY [app.hud.so] ---
+    # --- TELEMETRY [hud.so] ---
     # Responses
     content: str | None = Field(default=None)
     reasoning: str | None = Field(default=None)

hud/utils/tasks.py CHANGED Viewed

@@ -9,7 +9,7 @@ from hud.utils.hud_console import HUDConsole
 hud_console = HUDConsole()
-def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
+def load_tasks(tasks_input: str | list[dict], *, raw: bool = False) -> list[Task] | list[dict]:
     """Load tasks from various sources.
     Args:
@@ -18,16 +18,19 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
             - Path to a JSONL file (one task per line)
             - HuggingFace dataset name (format: "username/dataset" or "username/dataset:split")
             - List of task dictionaries
-        system_prompt: Default system prompt to use if not specified in task
+        raw: If True, return raw dicts without validation or env substitution
     Returns:
-        List of validated HUD Task objects
+        - If raw=False (default): list[Task]
+        - If raw=True: list[dict]
     """
-    tasks = []
+    tasks: list[Task] | list[dict] = []
     if isinstance(tasks_input, list):
         # Direct list of task dicts
         hud_console.info(f"Loading {len(tasks_input)} tasks from provided list")
+        if raw:
+            return [item for item in tasks_input if isinstance(item, dict)]
         for item in tasks_input:
             task = Task(**item)
             tasks.append(task)
@@ -36,7 +39,6 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
         # Check if it's a file path
         if Path(tasks_input).exists():
             file_path = Path(tasks_input)
-            hud_console.info(f"Loading tasks from file: {tasks_input}")
             with open(file_path) as f:
                 # Handle JSON files (array of tasks)
@@ -46,31 +48,33 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
                         raise ValueError(
                             f"JSON file must contain an array of tasks, got {type(data)}"
                         )
+                    if raw:
+                        return [item for item in data if isinstance(item, dict)]
                     for item in data:
                         task = Task(**item)
                         tasks.append(task)
                 # Handle JSONL files (one task per line)
                 else:
+                    raw_items: list[dict] = []
                     for line in f:
                         line = line.strip()
-                        if line:  # Skip empty lines
-                            item = json.loads(line)
-                            # Handle case where line contains an array of tasks
-                            if isinstance(item, list):
-                                for task_item in item:
-                                    task = Task(**task_item)
-                                    tasks.append(task)
-                            # Handle normal case where line contains a single task object
-                            elif isinstance(item, dict):
-                                task = Task(**item)
-                                tasks.append(task)
-                            else:
-                                raise ValueError(
-                                    f"Invalid JSONL format: expected dict or list of dicts, got {type(item)}"  # noqa: E501
-                                )
+                        if not line:
+                            continue
+                        item = json.loads(line)
+                        if isinstance(item, list):
+                            raw_items.extend([it for it in item if isinstance(it, dict)])
+                        elif isinstance(item, dict):
+                            raw_items.append(item)
+                        else:
+                            raise ValueError(
+                                f"Invalid JSONL format: expected dict or list of dicts, got {type(item)}"  # noqa: E501
+                            )
+                    if raw:
+                        return raw_items
+                    for it in raw_items:
+                        task = Task(**it)
+                        tasks.append(task)
         # Check if it's a HuggingFace dataset
         elif "/" in tasks_input:
@@ -88,6 +92,7 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
                 dataset = load_dataset(dataset_name, split=split)
                 # Convert dataset rows to Task objects
+                raw_rows: list[dict] = []
                 for item in dataset:
                     if not isinstance(item, dict):
                         raise ValueError(
@@ -97,7 +102,11 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
                         raise ValueError(
                             f"Invalid HuggingFace dataset: expected mcp_config and prompt, got {item}"  # noqa: E501
                         )
-                    task = Task(**item)
+                    raw_rows.append(item)
+                if raw:
+                    return raw_rows
+                for row in raw_rows:
+                    task = Task(**row)
                     tasks.append(task)
             except ImportError as e:
@@ -115,5 +124,4 @@ def load_tasks(tasks_input: str | list[dict]) -> list[Task]:
     else:
         raise TypeError(f"tasks_input must be str or list, got {type(tasks_input)}")
-    hud_console.info(f"Loaded {len(tasks)} tasks")
     return tasks

hud/utils/tests/test_version.py CHANGED Viewed

@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud
-    assert hud.__version__ == "0.4.35"
+    assert hud.__version__ == "0.4.37"

hud/version.py CHANGED Viewed

@@ -4,4 +4,4 @@ Version information for the HUD SDK.
 from __future__ import annotations
-__version__ = "0.4.35"
+__version__ = "0.4.37"

{hud_python-0.4.35.dist-info → hud_python-0.4.37.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.35
+Version: 0.4.37
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -36,11 +36,13 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Python: <3.13,>=3.11
 Requires-Dist: anthropic
+Requires-Dist: blessed>=1.20.0
 Requires-Dist: datasets>=2.14.0
 Requires-Dist: httpx<1,>=0.23.0
 Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
 Requires-Dist: hud-mcp-python-sdk>=3.13.2
-Requires-Dist: hud-mcp-use-python-sdk>=2.3.16
+Requires-Dist: hud-mcp-use-python-sdk==2.3.19
+Requires-Dist: litellm>=1.55.0
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: openai
 Requires-Dist: opentelemetry-api>=1.34.1
@@ -50,8 +52,8 @@ Requires-Dist: opentelemetry-sdk>=1.34.1
 Requires-Dist: pathspec>=0.12.1
 Requires-Dist: pillow>=11.1.0
 Requires-Dist: prompt-toolkit==3.0.51
-Requires-Dist: pydantic-settings<3,>=2
-Requires-Dist: pydantic<3,>=2
+Requires-Dist: pydantic-settings<3,>=2.2
+Requires-Dist: pydantic<3,>=2.6
 Requires-Dist: questionary==2.1.0
 Requires-Dist: rich>=13.0.0
 Requires-Dist: toml>=0.10.2
@@ -59,7 +61,9 @@ Requires-Dist: typer>=0.9.0
 Requires-Dist: watchfiles>=0.21.0
 Requires-Dist: wrapt>=1.14.0
 Provides-Extra: agent
+Requires-Dist: aiodocker>=0.24.0; extra == 'agent'
 Requires-Dist: dotenv>=0.9.9; extra == 'agent'
+Requires-Dist: inspect-ai>=0.3.80; extra == 'agent'
 Requires-Dist: ipykernel; extra == 'agent'
 Requires-Dist: ipython<9; extra == 'agent'
 Requires-Dist: jupyter-client; extra == 'agent'
@@ -67,8 +71,21 @@ Requires-Dist: jupyter-core; extra == 'agent'
 Requires-Dist: langchain; extra == 'agent'
 Requires-Dist: langchain-anthropic; extra == 'agent'
 Requires-Dist: langchain-openai; extra == 'agent'
+Requires-Dist: pillow>=11.1.0; extra == 'agent'
+Requires-Dist: playwright; extra == 'agent'
+Requires-Dist: pyautogui>=0.9.54; extra == 'agent'
+Requires-Dist: pyright==1.1.401; extra == 'agent'
+Requires-Dist: pytest-asyncio; extra == 'agent'
+Requires-Dist: pytest-cov; extra == 'agent'
+Requires-Dist: pytest-mock; extra == 'agent'
+Requires-Dist: pytest<9,>=8.1.1; extra == 'agent'
+Requires-Dist: ruff>=0.11.8; extra == 'agent'
+Requires-Dist: setuptools; extra == 'agent'
+Requires-Dist: textdistance<5,>=4.5.0; extra == 'agent'
 Provides-Extra: agents
+Requires-Dist: aiodocker>=0.24.0; extra == 'agents'
 Requires-Dist: dotenv>=0.9.9; extra == 'agents'
+Requires-Dist: inspect-ai>=0.3.80; extra == 'agents'
 Requires-Dist: ipykernel; extra == 'agents'
 Requires-Dist: ipython<9; extra == 'agents'
 Requires-Dist: jupyter-client; extra == 'agents'
@@ -76,6 +93,17 @@ Requires-Dist: jupyter-core; extra == 'agents'
 Requires-Dist: langchain; extra == 'agents'
 Requires-Dist: langchain-anthropic; extra == 'agents'
 Requires-Dist: langchain-openai; extra == 'agents'
+Requires-Dist: pillow>=11.1.0; extra == 'agents'
+Requires-Dist: playwright; extra == 'agents'
+Requires-Dist: pyautogui>=0.9.54; extra == 'agents'
+Requires-Dist: pyright==1.1.401; extra == 'agents'
+Requires-Dist: pytest-asyncio; extra == 'agents'
+Requires-Dist: pytest-cov; extra == 'agents'
+Requires-Dist: pytest-mock; extra == 'agents'
+Requires-Dist: pytest<9,>=8.1.1; extra == 'agents'
+Requires-Dist: ruff>=0.11.8; extra == 'agents'
+Requires-Dist: setuptools; extra == 'agents'
+Requires-Dist: textdistance<5,>=4.5.0; extra == 'agents'
 Provides-Extra: dev
 Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
 Requires-Dist: dotenv>=0.9.9; extra == 'dev'
@@ -100,14 +128,6 @@ Requires-Dist: setuptools; extra == 'dev'
 Requires-Dist: textdistance<5,>=4.5.0; extra == 'dev'
 Provides-Extra: rl
 Requires-Dist: bitsandbytes>=0.41.0; (sys_platform == 'linux') and extra == 'rl'
-Requires-Dist: dotenv>=0.9.9; extra == 'rl'
-Requires-Dist: ipykernel; extra == 'rl'
-Requires-Dist: ipython<9; extra == 'rl'
-Requires-Dist: jupyter-client; extra == 'rl'
-Requires-Dist: jupyter-core; extra == 'rl'
-Requires-Dist: langchain; extra == 'rl'
-Requires-Dist: langchain-anthropic; extra == 'rl'
-Requires-Dist: langchain-openai; extra == 'rl'
 Requires-Dist: liger-kernel>=0.5.0; (sys_platform == 'linux') and extra == 'rl'
 Requires-Dist: peft>=0.17.1; extra == 'rl'
 Requires-Dist: vllm==0.10.1.1; extra == 'rl'
@@ -138,8 +158,8 @@ OSS RL environment + evals toolkit. Wrap software as environments, run benchmark
 ## Highlights
 - 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
-- ⚡️ **[Live telemetry](https://app.hud.so)** – inspect every tool call, observation, and reward in real time.
-- 🗂️ **[Public benchmarks](https://app.hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
+- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
+- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
 - 🌱 **[Reinforcement learning built-in](rl/)** – Verifiers gym pipelines for GRPO on any environment.
 - 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
 - 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
@@ -185,14 +205,14 @@ from hud.agents import ClaudeAgent
 from hud.datasets import Task  # See docs: https://docs.hud.so/reference/tasks
 async def main() -> None:
-    with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://app.hud.so)
+    with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
         task = {
             "prompt": "Reach 64 in 2048.",
             "mcp_config": {
                 "hud": {
                     "url": "https://mcp.hud.so/v3/mcp",  # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
                     "headers": {
-                        "Authorization": f"Bearer {settings.api_key}",  # Get your key at https://app.hud.so
+                        "Authorization": f"Bearer {settings.api_key}",  # Get your key at https://hud.so
                         "Mcp-Image": "hudpython/hud-text-2048:v1.2"  # Docker image from https://hub.docker.com/u/hudpython
                     }
                 }
@@ -219,7 +239,7 @@ async def main() -> None:
 asyncio.run(main())
 ```
-The above example let's the agent play 2048 ([See replay](https://app.hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
+The above example let's the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
 ![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
@@ -250,7 +270,7 @@ Supports multi‑turn RL for both:
 - Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
 - Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
-By default, `hud rl` provisions a persistant server and trainer in the cloud, streams telemetry to `app.hud.so`, and lets you monitor/manage models at `app.hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
+By default, `hud rl` provisions a persistant server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
 Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
@@ -260,7 +280,7 @@ This is Claude Computer Use running on our proprietary financial analyst benchma
 ![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
-> [See this trace on _app.hud.so_](https://app.hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
+> [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
 This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
@@ -286,7 +306,7 @@ results = await run_dataset(
 print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
 ```
-> Running a dataset creates a job and streams results to the [app.hud.so](https://app.hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
+> Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
 ## Building Environments (MCP)
@@ -377,7 +397,7 @@ Tools
 hud push # needs docker login, hud api key
 ```
-5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [app.hud.so](https://app.hud.so):
+5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
 ```python
 from hud.agents import ClaudeAgent
@@ -408,7 +428,7 @@ result = await ClaudeAgent().run({  # See all agents: https://docs.hud.so/refere
 ## Leaderboards & benchmarks
-All leaderboards are publicly available on [app.hud.so/leaderboards](https://app.hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
+All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
 ![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
@@ -422,7 +442,7 @@ Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) funct
 %%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
 graph LR
     subgraph "Platform"
-        Dashboard["📊 app.hud.so"]
+        Dashboard["📊 hud.so"]
         API["🔌 mcp.hud.so"]
     end

hud-python 0.4.35__py3-none-any.whl → 0.4.37__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.35__py3-none-any.whl → 0.4.37__py3-none-any.whl