PyPI - hud-python - Versions diffs - 0.6.3__tar.gz → 0.6.4__tar.gz - Mend

hud-python 0.6.3__tar.gz → 0.6.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (240) hide show

{hud_python-0.6.3 → hud_python-0.6.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.6.3
+Version: 0.6.4
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

hud_python-0.6.4/cookbooks/connect4-selfplay/README.md ADDED Viewed

@@ -0,0 +1,57 @@
+# Connect Four self-play
+Symmetric self-play RL on a 6×7 Connect Four board. Draws are rare (you need a
+full 42-cell board with no four-in-a-row), so the win/loss reward signal
+persists as the policy improves and the GRPO advantage stays non-zero.
+## How it works
+- One agent ("outer") plays a full game against an inner model on the **same
+  slug** — true self-play. `seed % 2` decides who drops first, for symmetric
+  first-move coverage.
+- Each game trains **both sides at once**: the outer agent's `Run` (reward from
+  its perspective) plus a hand-built `TrajectoryPayload` for the inner model
+  with the flipped reward (`1 - outer_reward`).
+- `group_size=2` pairs each game's two trajectories so the GRPO advantage is
+  `reward - 0.5` per game.
+- `loss_fn="ppo"` clips the importance-sampling ratio, so a single lucky game
+  can't blow up the update.
+The training loop uses the public API directly — `forward_backward` accepts
+`Run` and `TrajectoryPayload` mixed, so no private helpers are needed.
+## Setup
+```bash
+hud models fork Qwen/Qwen3.5-4B --name c4-selfplay   # prints a slug like c4-selfplay-<id>
+```
+Put your `HUD_API_KEY` in a `.env` here (or the environment).
+## Run
+Local sanity check (one game, cheap external model as the outer agent):
+```bash
+hud eval env.py claude --model claude-haiku-4-5
+```
+Train:
+```bash
+python train.py --model c4-selfplay-<id> --steps 20 --group 4 --lr 1e-5
+```
+## Tuning notes
+- **Memory scales with `tasks × group`.** Each task×rollout is a fresh `env.py`
+  subprocess. With 8 tasks and `--group 4` that's 32 concurrent games. Connect
+  Four games can run up to 42 plies, so they cost more tokens and time per game —
+  start at `--group 4` and raise only if you have RAM headroom.
+- **Watch the server-side metrics.** The loop prints local win/draw/loss counts
+  each step and the last few checkpoints' `mean_reward` / `reward_std` via
+  `trainer.checkpoints()` at the end. A healthy run keeps non-trivial
+  `reward_std` (within-group spread); if it collapses, the policy has saturated.
+- **Reset on changes.** If you edit the reward or the board, roll the head back
+  to a clean checkpoint (`hud models head <slug> --set <id>`) or fork fresh —
+  don't keep training a policy shaped by the old objective.

{hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/__init__.py RENAMED Viewed

@@ -8,7 +8,12 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Any, cast
 from hud.types import AgentType
-from hud.utils.gateway import build_gateway_client, list_gateway_models
+from hud.utils.gateway import (
+    build_gateway_client,
+    gateway_model_aliases,
+    list_gateway_models,
+    normalize_gateway_model_id,
+)
 if TYPE_CHECKING:
     from typing import TypeAlias
@@ -27,6 +32,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
     For direct API access with provider API keys, instantiate the agent classes directly.
     """
+    requested_model = model
+    model = normalize_gateway_model_id(model)
     agent_type = next((candidate for candidate in AgentType if candidate.value == model), None)
     if agent_type is not None:
         model_id = model
@@ -73,7 +80,8 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
                 for n in (gm.id, gm.name, gm.model_name)
                 if isinstance(n, str)
             ]
-            near = difflib.get_close_matches(model, known, n=3, cutoff=0.5)
+            known.extend(gateway_model_aliases())
+            near = difflib.get_close_matches(requested_model, known, n=3, cutoff=0.5)
             hint = (
                 f" Did you mean: {', '.join(near)}?"
                 if near
@@ -84,7 +92,7 @@ def create_agent(model: str, **kwargs: Any) -> GatewayAgent:
                 if gateway_models
                 else "the HUD gateway registry (empty — is HUD_API_KEY set?)"
             )
-            raise ValueError(f"Model {model!r} not found in {source}.{hint}")
+            raise ValueError(f"Model {requested_model!r} not found in {source}.{hint}")
     kwargs.setdefault("model", model_id)
     kwargs.setdefault("model_client", build_gateway_client(provider_name))

{hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/openai_compatible/agent.py RENAMED Viewed

@@ -193,16 +193,27 @@ class OpenAIChatAgent(ToolAgent[ChatCompletionMessageParam, OpenAIChatConfig]):
         sample: Sample | None = None
         if return_token_ids:
             prompt_token_ids = getattr(choice, "prompt_token_ids", None)
+            # Multimodal prompt (text + image chunks): the only prompt representation
+            # that survives image inputs; flat prompt_token_ids is null in that case.
+            prompt_chunks = getattr(choice, "prompt_chunks", None)
             token_ids = getattr(choice, "token_ids", None)
-            if prompt_token_ids is not None and token_ids is not None:
-                chat_state.continuation_token_ids = list(prompt_token_ids) + list(token_ids)
-                chat_state.continuation_message_count = len(messages)
+            has_prompt = prompt_token_ids is not None or prompt_chunks is not None
+            if token_ids is not None and has_prompt:
                 content_lp = choice.logprobs.content if choice.logprobs else None
                 sample = Sample(
-                    prompt_token_ids=list(prompt_token_ids),
+                    prompt_token_ids=list(prompt_token_ids) if prompt_token_ids is not None else [],
+                    prompt_chunks=list(prompt_chunks) if prompt_chunks is not None else None,
                     output_token_ids=list(token_ids),
                     output_logprobs=[tok.logprob for tok in content_lp] if content_lp else [],
                 )
+                # KV-cache continuation only applies to flat text prompts; clear any
+                # stale state when the gateway returns chunks-only (multimodal turn).
+                if prompt_token_ids is not None:
+                    chat_state.continuation_token_ids = list(prompt_token_ids) + list(token_ids)
+                    chat_state.continuation_message_count = len(messages)
+                else:
+                    chat_state.continuation_token_ids = None
+                    chat_state.continuation_message_count = None
         tool_calls: list[MCPToolCall] = []
         for tc in function_calls:

{hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_base.py RENAMED Viewed

@@ -108,7 +108,7 @@ def test_create_agent_resolves_gateway_model_metadata(
     model = GatewayModelInfo(
         id="ft:custom-123",
-        model_name="gpt-5.4",
+        model_name="gpt-5.5",
         sdk_agent_type="openai_compatible",
         provider=GatewayProviderInfo(name="openai"),
     )
@@ -122,4 +122,40 @@ def test_create_agent_resolves_gateway_model_metadata(
     agent = create_agent("ft:custom-123")
     assert isinstance(agent, OpenAIChatAgent)
-    assert agent.config.model == "gpt-5.4"  # resolved to the model's real name
+    assert agent.config.model == "gpt-5.5"  # resolved to the model's real name
+@pytest.mark.parametrize(
+    ("alias", "canonical"),
+    [
+        ("deepseek-v4", "deepseek/deepseek-v4-pro"),
+        ("deepseek-v4-flash", "deepseek/deepseek-v4-flash"),
+        ("glm-5.2", "z-ai/glm-5.2"),
+        ("kimi-k2.6", "moonshotai/kimi-k2.6"),
+        ("minimax-m3", "MiniMax-M3"),
+    ],
+)
+def test_create_agent_accepts_gateway_model_aliases(
+    alias: str,
+    canonical: str,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from hud.utils.gateway import GatewayModelInfo, GatewayProviderInfo
+    model = GatewayModelInfo(
+        id=canonical,
+        model_name=canonical,
+        sdk_agent_type="openai_compatible",
+        provider=GatewayProviderInfo(name="openai"),
+    )
+    monkeypatch.setattr("hud.agents.list_gateway_models", lambda: [model])
+    def _build_client(_provider: str) -> object:
+        return object()
+    monkeypatch.setattr("hud.agents.build_gateway_client", _build_client)
+    agent = create_agent(alias)
+    assert isinstance(agent, OpenAIChatAgent)
+    assert agent.config.model == canonical

{hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/tests/test_provider_native_tools.py RENAMED Viewed

@@ -102,7 +102,7 @@ def _commands(tool: Any) -> list[str]:
 async def test_openai_shell_wraps_command_with_timeout() -> None:
-    tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
+    tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
     result = await tool.execute({"commands": ["pwd"], "timeout_ms": 2500})
@@ -114,7 +114,7 @@ async def test_openai_shell_wraps_command_with_timeout() -> None:
 async def test_openai_shell_runs_each_command_without_timeout() -> None:
-    tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
+    tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
     await tool.execute({"commands": ["echo a", "echo b"]})
@@ -122,7 +122,7 @@ async def test_openai_shell_runs_each_command_without_timeout() -> None:
 async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
-    tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
+    tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
     result = await tool.execute({"commands": 123})
@@ -131,7 +131,7 @@ async def test_openai_shell_rejects_non_list_commands_without_running() -> None:
 def test_openai_shell_to_params_is_shell_type() -> None:
-    tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.4"), client=_ssh())
+    tool = OpenAIShellTool(spec=OpenAIShellTool.default_spec("gpt-5.5"), client=_ssh())
     assert tool.to_params()["type"] == "shell"

{hud_python-0.6.3 → hud_python-0.6.4}/hud/agents/types.py RENAMED Viewed

@@ -99,7 +99,7 @@ class OpenAIConfig(AgentConfig):
     """Configuration for OpenAIAgent."""
     model_name: str = "OpenAI"
-    model: str = Field(default="gpt-5.4", validation_alias=_model_alias)
+    model: str = Field(default="gpt-5.5", validation_alias=_model_alias)
     max_output_tokens: int | None = None
     temperature: float | None = None
     reasoning: Any = None  # openai Reasoning
@@ -113,7 +113,7 @@ class OpenAIChatConfig(AgentConfig):
     """Configuration for OpenAIChatAgent."""
     model_name: str = "OpenAI Chat"
-    model: str = Field(default="gpt-5-mini", validation_alias=_model_alias)
+    model: str = Field(default="gpt-5.4-mini", validation_alias=_model_alias)
     checkpoint: str | None = Field(
         default=None,
         description="Specific checkpoint name for inference routing. "
@@ -139,7 +139,7 @@ class ClaudeSDKConfig(AgentConfig):
     """
     model_name: str = "Claude Code"
-    model: str = Field(default="claude-sonnet-4-5", validation_alias=_model_alias)
+    model: str = Field(default="claude-sonnet-4-6", validation_alias=_model_alias)
     permission_mode: str = "bypassPermissions"
     max_steps: int = -1
     allowed_tools: list[str] = Field(
@@ -222,6 +222,10 @@ class Sample(BaseModel):
     """
     prompt_token_ids: list[int] = Field(default_factory=list[int])
+    # Multimodal prompt as serialized ``ModelInput`` chunks (text + image), set by
+    # vision rollouts where the prompt is not a flat token list. When present it is
+    # the authoritative prompt for training; ``prompt_token_ids`` stays empty.
+    prompt_chunks: list[dict[str, Any]] | None = None
     output_token_ids: list[int] = Field(default_factory=list[int])
     output_logprobs: list[float] = Field(default_factory=list[float])

{hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/__init__.py RENAMED Viewed

@@ -35,11 +35,13 @@ from .client import client_app  # noqa: E402
 from .deploy import deploy_command  # noqa: E402
 from .eval import eval_command  # noqa: E402
 from .init import init_command  # noqa: E402
+from .jobs import jobs_app  # noqa: E402
 from .login import login_command  # noqa: E402
 from .models import models_app  # noqa: E402
 from .serve import serve_command  # noqa: E402
 from .sync import sync_app  # noqa: E402
 from .task import task_app  # noqa: E402
+from .trace import trace_app  # noqa: E402
 app.command(name="serve")(serve_command)
 app.command(name="dev", deprecated=True, hidden=True)(serve_command)  # alias for now
@@ -49,6 +51,8 @@ app.command(name="eval")(eval_command)
 app.command(name="init")(init_command)
 app.command(name="cancel")(cancel_command)
 app.add_typer(models_app, name="models")
+app.add_typer(jobs_app, name="jobs")
+app.add_typer(trace_app, name="trace")
 @app.command(name="set")

{hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/eval.py RENAMED Viewed

@@ -43,8 +43,9 @@ def _resolve_model_from_catalog(model_id: str) -> tuple[AgentType, str] | None:
     Returns None if the model isn't found or the catalog is unreachable.
     """
     try:
-        from hud.utils.gateway import list_gateway_models
+        from hud.utils.gateway import list_gateway_models, normalize_gateway_model_id
+        model_id = normalize_gateway_model_id(model_id)
         models = list_gateway_models()
     except Exception:
         return None
@@ -117,8 +118,9 @@ class AgentPreset:
 _AGENT_PRESETS: list[AgentPreset] = [
     AgentPreset("Claude Sonnet 4.6", AgentType.CLAUDE, "claude-sonnet-4-6"),
-    AgentPreset("GPT-5.4", AgentType.OPENAI, "gpt-5.4"),
-    AgentPreset("Gemini 3.1 Pro (Preview)", AgentType.GEMINI, "gemini-3-1-pro"),
+    AgentPreset("Claude Opus 4.8", AgentType.CLAUDE, "claude-opus-4-8"),
+    AgentPreset("GPT-5.5", AgentType.OPENAI, "gpt-5.5"),
+    AgentPreset("Gemini 3.1 Pro (Preview)", AgentType.GEMINI, "gemini-3.1-pro-preview"),
     AgentPreset(
         "Grok 4-1 Fast (xAI)",
         AgentType.OPENAI_COMPATIBLE,
@@ -131,10 +133,22 @@ _AGENT_PRESETS: list[AgentPreset] = [
         },
     ),
     AgentPreset(
-        "GLM-4.6V (Z-AI)",
+        "GLM 5.2 (Z.ai)",
         AgentType.OPENAI_COMPATIBLE,
-        "z-ai/glm-4.6v",
-        {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM-4.6V"}},
+        "z-ai/glm-5.2",
+        {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM 5.2"}},
+    ),
+    AgentPreset(
+        "Kimi K2.6 (Moonshot)",
+        AgentType.OPENAI_COMPATIBLE,
+        "moonshotai/kimi-k2.6",
+        {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "Kimi K2.6"}},
+    ),
+    AgentPreset(
+        "MiniMax M3",
+        AgentType.OPENAI_COMPATIBLE,
+        "MiniMax-M3",
+        {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "MiniMax M3"}},
     ),
 ]
@@ -162,7 +176,7 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
 # use_computer_beta = true
 [openai]
-# model = "gpt-4o"
+# model = "gpt-5.5"
 # temperature = 0.7
 # max_output_tokens = 4096
@@ -402,6 +416,11 @@ class EvalConfig(BaseModel):
         if self.model:
             kwargs["model"] = self.model
+        if isinstance(kwargs.get("model"), str):
+            from hud.utils.gateway import normalize_gateway_model_id
+            kwargs["model"] = normalize_gateway_model_id(kwargs["model"])
         if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
             base_url = kwargs.get("base_url", "")
             if settings.hud_gateway_url in base_url and settings.api_key:

hud_python-0.6.4/hud/cli/jobs.py ADDED Viewed

@@ -0,0 +1,146 @@
+"""``hud jobs`` — list jobs and their traces."""
+from __future__ import annotations
+import json
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+console = Console()
+jobs_app = typer.Typer(
+    name="jobs",
+    help="List jobs and their traces",
+    add_completion=False,
+    rich_markup_mode="rich",
+    no_args_is_help=False,
+)
+@jobs_app.callback(invoke_without_command=True)
+def jobs_command(
+    ctx: typer.Context,
+    job_id: str | None = typer.Argument(None, help="Job ID — omit to list recent jobs"),
+    json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
+    limit: int = typer.Option(20, "--limit", "-n", help="Max rows to show"),
+) -> None:
+    """List recent jobs, or show traces for a specific job.
+    Without an argument, lists the most recent jobs.
+    With a job id, lists all traces for that job.
+    """
+    if ctx.invoked_subcommand is not None:
+        return
+    from hud.cli.utils.api import require_api_key
+    require_api_key("list jobs")
+    if job_id:
+        _show_job_traces(job_id, json_output=json_output, limit=limit)
+    else:
+        _list_jobs(json_output=json_output, limit=limit)
+# ── job listing ────────────────────────────────────────────────────────────────
+def _list_jobs(*, json_output: bool, limit: int) -> None:
+    from hud.utils.platform import PlatformClient
+    client = PlatformClient.from_settings()
+    try:
+        data = client.get("/jobs", params={"limit": limit})
+    except Exception as e:
+        console.print(f"[red]Failed to fetch jobs: {e}[/red]")
+        raise typer.Exit(1) from e
+    items = data if isinstance(data, list) else (data.get("items") or [])
+    if json_output:
+        console.print_json(json.dumps(items, indent=2, default=str))
+        return
+    if not items:
+        console.print("[yellow]No jobs found.[/yellow]")
+        return
+    console.print(Panel.fit("[bold cyan]Recent Jobs[/bold cyan]", border_style="cyan"))
+    table = Table()
+    table.add_column("ID", style="blue", no_wrap=True)
+    table.add_column("Name", style="cyan")
+    table.add_column("Taskset", style="dim")
+    table.add_column("Status", style="yellow")
+    table.add_column("Created", style="dim")
+    from hud.settings import settings
+    web = settings.hud_web_url.rstrip("/")
+    for job in items:
+        jid = str(job.get("id") or "")
+        table.add_row(
+            jid,
+            job.get("name") or "-",
+            job.get("taskset_name") or "-",
+            job.get("status") or "-",
+            (str(job.get("created_at") or ""))[:19],
+        )
+    console.print(table)
+    console.print(f"\n[dim]View: {web}/jobs[/dim]")
+    console.print("[dim]Tip: hud jobs <id> to see traces for a specific job[/dim]")
+# ── job traces ────────────────────────────────────────────────────────────────
+def _show_job_traces(job_id: str, *, json_output: bool, limit: int) -> None:
+    from hud.settings import settings
+    from hud.utils.platform import PlatformClient
+    client = PlatformClient.from_settings()
+    try:
+        data = client.get(f"/jobs/{job_id}/traces", params={"limit": limit})
+    except Exception as e:
+        console.print(f"[red]Failed to fetch traces: {e}[/red]")
+        raise typer.Exit(1) from e
+    items = data if isinstance(data, list) else (data.get("items") or [])
+    if json_output:
+        console.print_json(json.dumps(items, indent=2, default=str))
+        return
+    web = settings.hud_web_url.rstrip("/")
+    if not items:
+        console.print("[yellow]No traces found for this job.[/yellow]")
+        console.print(f"[dim]View: {web}/jobs/{job_id}[/dim]")
+        return
+    console.print(
+        Panel.fit(f"[bold cyan]Job Traces[/bold cyan] [dim]{job_id}[/dim]", border_style="cyan")
+    )
+    table = Table()
+    table.add_column("Trace ID", style="blue", no_wrap=True)
+    table.add_column("Status", style="yellow")
+    table.add_column("Reward", style="green", justify="right")
+    table.add_column("Started", style="dim")
+    table.add_column("Error", style="red")
+    for tr in items:
+        tid = str(tr.get("id") or "")
+        reward = tr.get("reward")
+        table.add_row(
+            tid,
+            tr.get("status") or "-",
+            f"{reward:.3f}" if reward is not None else "-",
+            (str(tr.get("start_time") or tr.get("created_at") or ""))[:19],
+            (tr.get("error") or "")[:40],
+        )
+    console.print(table)
+    console.print(f"\n[dim]View: {web}/jobs/{job_id}[/dim]")
+    console.print("[dim]Tip: hud trace <trace_id> to inspect a specific rollout[/dim]")

{hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/models.py RENAMED Viewed

@@ -71,6 +71,8 @@ def list_models(
         )
     console.print(table)
     console.print(f"\n[dim]Gateway: {settings.hud_gateway_url}[/dim]")
+    web = settings.hud_web_url.rstrip("/")
+    console.print(f"[dim]View a model in the browser: {web}/models/<id>[/dim]")
 @models_app.command("fork")
@@ -116,6 +118,7 @@ def fork_model(
         )
     )
     console.print(f"\n[dim]Train it: hud.TrainingClient({slug!r})[/dim]")
+    console.print(f"[dim]View: {_model_url(model['id'])}[/dim]")
 @models_app.command("checkpoints")
@@ -127,13 +130,15 @@ def list_checkpoints(
     from hud.cli.utils.api import require_api_key
     require_api_key("list checkpoints")
-    checkpoints = _get_checkpoints(model)
+    model_id = _resolve_model_id(model)
+    checkpoints = _get_checkpoints(model_id)
     if json_output:
         console.print_json(json.dumps(checkpoints, indent=2))
         return
     if not checkpoints:
         console.print("[yellow]No checkpoints yet — this model serves its base weights[/yellow]")
+        console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
         return
     checkpoints = sorted(checkpoints, key=lambda c: c.get("created_at") or "")
@@ -155,6 +160,7 @@ def list_checkpoints(
             (ckpt.get("created_at") or "")[:19],
         )
     console.print(table)
+    console.print(f"\n[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
 @models_app.command("head")
@@ -170,19 +176,22 @@ def show_head(
     from hud.cli.utils.api import require_api_key
     require_api_key("manage head")
+    model_id = _resolve_model_id(model)
     if set_to is not None:
-        _set_head(model, set_to)
+        _set_head(model_id, set_to)
         console.print(f"[green]Head set to[/green] [cyan]{set_to}[/cyan]")
+        console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
         return
-    head = next((c for c in _get_checkpoints(model) if c.get("is_active")), None)
+    head = next((c for c in _get_checkpoints(model_id) if c.get("is_active")), None)
     if json_output:
         console.print_json(json.dumps(head, indent=2))
         return
     if head is None:
         console.print("[yellow]No active checkpoint — this model serves its base weights[/yellow]")
+        console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
         return
     reward = head.get("mean_reward")
@@ -196,6 +205,15 @@ def show_head(
             border_style="green",
         )
     )
+    console.print(f"[dim]View: {_model_url(model_id, tab='checkpoints')}[/dim]")
+def _model_url(model_id: str, *, tab: str | None = None) -> str:
+    """Web app URL for a model (optionally a specific tab, e.g. ``checkpoints``)."""
+    from hud.settings import settings
+    url = f"{settings.hud_web_url.rstrip('/')}/models/{model_id}"
+    return f"{url}?tab={tab}" if tab else url
 def _resolve_model_id(model: str) -> str:

{hud_python-0.6.3 → hud_python-0.6.4}/hud/cli/tests/test_eval_config.py RENAMED Viewed

@@ -50,6 +50,21 @@ def test_get_agent_kwargs_model_precedence_and_flags() -> None:
     assert kwargs["verbose"] is True
+def test_get_agent_kwargs_normalizes_gateway_model_alias() -> None:
+    cfg = EvalConfig(agent_type="openai_compatible", model="glm-5.2")
+    assert cfg.get_agent_kwargs()["model"] == "z-ai/glm-5.2"
+def test_get_agent_kwargs_normalizes_config_model_alias() -> None:
+    cfg = EvalConfig(
+        agent_type="openai_compatible",
+        agent_config={"openai_compatible": {"model": "glm-5.2"}},
+    )
+    assert cfg.get_agent_kwargs()["model"] == "z-ai/glm-5.2"
 def test_get_agent_kwargs_requires_agent_type() -> None:
     with pytest.raises(ValueError, match="agent_type must be set"):
         EvalConfig().get_agent_kwargs()
@@ -186,6 +201,31 @@ def test_merge_cli_overrides_fields() -> None:
     assert merged.max_steps == 7
+def test_merge_cli_resolves_gateway_model_alias(monkeypatch: pytest.MonkeyPatch) -> None:
+    from hud.utils.gateway import GatewayModelInfo, GatewayProviderInfo
+    model = GatewayModelInfo(
+        id="z-ai/glm-5.2",
+        model_name="z-ai/glm-5.2",
+        sdk_agent_type="openai_compatible",
+        provider=GatewayProviderInfo(name="openai"),
+    )
+    monkeypatch.setattr("hud.utils.gateway.list_gateway_models", lambda: [model])
+    merged = EvalConfig().merge_cli(agent="glm-5.2")
+    assert merged.agent_type is not None and merged.agent_type.value == "openai_compatible"
+    assert merged.model == "z-ai/glm-5.2"
+def test_merge_cli_config_model_alias_is_normalized() -> None:
+    merged = EvalConfig(agent_type="openai_compatible").merge_cli(
+        config=["openai_compatible.model=glm-5.2"]
+    )
+    assert merged.get_agent_kwargs()["model"] == "z-ai/glm-5.2"
 def test_merge_cli_namespaced_config() -> None:
     merged = EvalConfig().merge_cli(config=["claude.max_tokens=100"])
     assert merged.agent_config["claude"]["max_tokens"] == 100

hud-python 0.6.3__tar.gz → 0.6.4__tar.gz

hud-python 0.6.3__tar.gz → 0.6.4__tar.gz