PyPI - hud-python - Versions diffs - 0.5.0__tar.gz → 0.5.1__tar.gz - Mend

hud-python 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (301) hide show

{hud_python-0.5.0 → hud_python-0.5.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.5.0
+Version: 0.5.1
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -166,14 +166,21 @@ from hud import Environment
 env = Environment("my-env")
 @env.tool()
-def search(query: str) -> str:
-    """Search the knowledge base."""
-    return db.search(query)
-@env.scenario("find-answer")
-async def find_answer(question: str, answer: str):
-    response = yield f"Find: {question}"       # Prompt
-    yield 1.0 if answer in response else 0.0  # Reward
+def add(a: int, b: int) -> int:
+    """Add two numbers."""
+    return a + b
+@env.scenario("solve-math")
+async def solve_math(problem: str, answer: int):
+    response = yield problem                    # Prompt
+    yield 1.0 if str(answer) in response else 0.0  # Reward
+async with env("solve-math", problem="What is 2+2?", answer=4) as ctx:
+    # Your agent logic here - call tools, get response
+    result = await ctx.call_tool("add", a=2, b=2)
+    await ctx.submit(f"The answer is {result}")
+print(ctx.reward)  # 1.0
 ```
 The agent runs between the yields. First yield sends the prompt, second yield scores the result. → [Docs](https://docs.hud.ai/quick-links/environments) · [Templates](https://hud.ai/environments)
@@ -183,14 +190,20 @@ The agent runs between the yields. First yield sends the prompt, second yield sc
 Test different models. Repeat runs to see the distribution:
 ```python
-import hud
+from openai import AsyncOpenAI
+import os
-task = env("find-answer", question="What is 2+2?", answer="4")
+client = AsyncOpenAI(
+    base_url="https://inference.hud.ai",
+    api_key=os.environ["HUD_API_KEY"]
+)
-async with hud.eval(task, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
+# Using the env from above
+async with env("solve-math", problem="What is 2+2?", answer=4, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
     response = await client.chat.completions.create(
         model=ctx.variants["model"],
-        messages=[{"role": "user", "content": ctx.prompt}]
+        messages=[{"role": "user", "content": ctx.prompt}],
+        tools=ctx.tools  # Environment tools available to the model
     )
     await ctx.submit(response.choices[0].message.content)
 ```
@@ -205,7 +218,7 @@ Push to GitHub, connect on hud.ai, run at scale:
 hud init                  # Scaffold environment
 git push                  # Push to GitHub
 # Connect on hud.ai → New → Environment
-hud eval my-org/my-eval --model gpt-4o --group-size 100
+hud eval my-eval --model gpt-4o --group-size 100
 # Or create and run tasks on the platform
 ```

{hud_python-0.5.0 → hud_python-0.5.1}/README.md RENAMED Viewed

@@ -68,14 +68,21 @@ from hud import Environment
 env = Environment("my-env")
 @env.tool()
-def search(query: str) -> str:
-    """Search the knowledge base."""
-    return db.search(query)
-@env.scenario("find-answer")
-async def find_answer(question: str, answer: str):
-    response = yield f"Find: {question}"       # Prompt
-    yield 1.0 if answer in response else 0.0  # Reward
+def add(a: int, b: int) -> int:
+    """Add two numbers."""
+    return a + b
+@env.scenario("solve-math")
+async def solve_math(problem: str, answer: int):
+    response = yield problem                    # Prompt
+    yield 1.0 if str(answer) in response else 0.0  # Reward
+async with env("solve-math", problem="What is 2+2?", answer=4) as ctx:
+    # Your agent logic here - call tools, get response
+    result = await ctx.call_tool("add", a=2, b=2)
+    await ctx.submit(f"The answer is {result}")
+print(ctx.reward)  # 1.0
 ```
 The agent runs between the yields. First yield sends the prompt, second yield scores the result. → [Docs](https://docs.hud.ai/quick-links/environments) · [Templates](https://hud.ai/environments)
@@ -85,14 +92,20 @@ The agent runs between the yields. First yield sends the prompt, second yield sc
 Test different models. Repeat runs to see the distribution:
 ```python
-import hud
+from openai import AsyncOpenAI
+import os
-task = env("find-answer", question="What is 2+2?", answer="4")
+client = AsyncOpenAI(
+    base_url="https://inference.hud.ai",
+    api_key=os.environ["HUD_API_KEY"]
+)
-async with hud.eval(task, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
+# Using the env from above
+async with env("solve-math", problem="What is 2+2?", answer=4, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
     response = await client.chat.completions.create(
         model=ctx.variants["model"],
-        messages=[{"role": "user", "content": ctx.prompt}]
+        messages=[{"role": "user", "content": ctx.prompt}],
+        tools=ctx.tools  # Environment tools available to the model
     )
     await ctx.submit(response.choices[0].message.content)
 ```
@@ -107,7 +120,7 @@ Push to GitHub, connect on hud.ai, run at scale:
 hud init                  # Scaffold environment
 git push                  # Push to GitHub
 # Connect on hud.ai → New → Environment
-hud eval my-org/my-eval --model gpt-4o --group-size 100
+hud eval my-eval --model gpt-4o --group-size 100
 # Or create and run tasks on the platform
 ```

{hud_python-0.5.0 → hud_python-0.5.1}/hud/__init__.py RENAMED Viewed

@@ -18,7 +18,7 @@ from .telemetry.instrument import instrument
 def trace(*args: object, **kwargs: object) -> EvalContext:
     """Deprecated: Use hud.eval() instead.
-    .. deprecated:: 0.5.0
+    .. deprecated:: 0.5.1
         hud.trace() is deprecated. Use hud.eval() or env.eval() instead.
     """
     warnings.warn(

{hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/base.py RENAMED Viewed

@@ -182,7 +182,23 @@ class MCPAgent(ABC):
             raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
         if not ctx.prompt:
-            raise ValueError("ctx.prompt is not set - did the scenario setup run?")
+            if ctx.has_scenario:
+                # Scenario was specified but prompt is still empty
+                # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                scenario = ctx._task.scenario if ctx._task else "unknown"
+                raise ValueError(
+                    f"ctx.prompt is not set.\n\n"
+                    f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                    f"Check that the scenario's setup function returns a non-empty string."
+                )
+            else:
+                # No scenario specified at all
+                raise ValueError(
+                    "ctx.prompt is not set.\n\n"
+                    "No scenario was specified in your task file.\n"
+                    "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                    "before running the agent."
+                )
         # Store context for tool calls
         self.ctx = ctx
@@ -194,6 +210,11 @@ class MCPAgent(ABC):
         try:
             result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)
+            # Propagate error state to context for platform visibility
+            if result.isError and hasattr(ctx, "error"):
+                error_msg = result.info.get("error") if result.info else result.content
+                ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")
             # Submit final answer to context (only if scenario is running)
             if result.content and ctx.has_scenario:
                 await ctx.submit(result.content)
@@ -202,6 +223,9 @@ class MCPAgent(ABC):
         except Exception as e:
             logger.exception("Error while running agent:")
+            # Propagate error to context for platform visibility
+            if hasattr(ctx, "error"):
+                ctx.error = e
             return Trace(
                 reward=0.0,
                 done=True,
@@ -537,7 +561,7 @@ def find_reward(result: MCPToolResult) -> float:
                 except json.JSONDecodeError:
                     pass
-    logger.error("Couldn't parse reward from result: %s", result)
+    logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
     return 0.0

{hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/misc/response_agent.py RENAMED Viewed

@@ -1,11 +1,14 @@
 from __future__ import annotations
+import logging
 from typing import Literal
 from openai import AsyncOpenAI
 from hud.settings import settings
+logger = logging.getLogger(__name__)
 ResponseType = Literal["STOP", "CONTINUE"]
 DEFAULT_SYSTEM_PROMPT = """\
@@ -97,5 +100,6 @@ class ResponseAgent:
             else:
                 return "CONTINUE"
-        except Exception:
+        except Exception as e:
+            logger.warning("Auto-respond failed: %s", e)
             return "CONTINUE"  # Default to continue on error

{hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/openai_chat.py RENAMED Viewed

@@ -70,6 +70,18 @@ class OpenAIChatAgent(MCPAgent):
         super().__init__(params, **kwargs)
         self.config: OpenAIChatConfig
+        if (
+            self.config.api_key
+            and self.config.base_url
+            and settings.hud_gateway_url in self.config.base_url
+            and settings.api_key
+            and self.config.api_key != settings.api_key
+        ):
+            raise ValueError(
+                "OpenAIChatAgent api_key is not allowed with HUD Gateway. "
+                "Use HUD_API_KEY for gateway auth and BYOK headers for provider keys."
+            )
         if self.config.openai_client is not None:
             self.oai = self.config.openai_client
         elif self.config.api_key is not None or self.config.base_url is not None:

{hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_base.py RENAMED Viewed

@@ -350,3 +350,67 @@ class TestMCPAgentToolSchemas:
         assert len(schemas) == 1
         assert schemas[0]["name"] == "my_tool"
         assert schemas[0]["description"] == "My tool description"
+class TestMCPAgentErrorPropagation:
+    """Tests for error propagation to EvalContext."""
+    @pytest.mark.asyncio
+    async def test_exception_propagates_to_ctx_error(self) -> None:
+        """Test that exceptions during run() set ctx.error for platform visibility."""
+        class FailingAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                raise RuntimeError("Agent crashed")
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailingAgent()
+        result = await agent.run(ctx)
+        # Should return error trace
+        assert result.isError is True
+        assert result.content is not None
+        assert "Agent crashed" in result.content
+        assert ctx.error is not None
+        assert isinstance(ctx.error, BaseException)
+        assert "Agent crashed" in str(ctx.error)
+    @pytest.mark.asyncio
+    async def test_step_error_propagates_to_ctx_error(self) -> None:
+        """Test that step-level errors (caught internally) set ctx.error."""
+        step_count = [0]
+        class FailOnSecondStepAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                step_count[0] += 1
+                if step_count[0] == 1:
+                    return AgentResponse(
+                        content="",
+                        tool_calls=[MCPToolCall(name="test_tool", arguments={})],
+                        done=False,
+                    )
+                else:
+                    raise ValueError("Step 2 failed")
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailOnSecondStepAgent()
+        result = await agent.run(ctx)
+        # Should return error trace
+        assert result.isError is True
+        assert ctx.error is not None
+        assert "Step 2 failed" in str(ctx.error)
+    @pytest.mark.asyncio
+    async def test_no_error_when_successful(self) -> None:
+        """Test that ctx.error remains None on successful run."""
+        ctx = MockEvalContext(prompt="Do something")
+        agent = MockMCPAgent()
+        result = await agent.run(ctx)
+        assert result.isError is False
+        assert ctx.error is None

{hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/eval.py RENAMED Viewed

@@ -91,10 +91,11 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
 [eval]
 # source = "hud-evals/SheetBench-50"
 # agent = "claude"
-# full = false
+# all = false  # Run all problems instead of just 1
 # max_concurrent = 30
 # max_steps = 10
 # group_size = 1
+# byok = false  # Remote only; use encrypted env vars on the platform.
 # task_ids = ["task_1", "task_2"]
 # verbose = true
 # very_verbose = true
@@ -152,12 +153,13 @@ class EvalConfig(BaseModel):
         "source",
         "agent_type",
         "task_ids",
-        "full",
+        "all",
         "max_concurrent",
         "max_steps",
         "verbose",
         "very_verbose",
         "group_size",
+        "byok",
         "remote",
         "auto_respond",
         "quiet",
@@ -171,13 +173,14 @@ class EvalConfig(BaseModel):
     agent_type: AgentType | None = None
     model: str | None = None
     task_ids: list[str] | None = None
-    full: bool = False
+    all: bool = False  # Run all problems instead of just 1
     max_concurrent: int = 30
-    max_steps: int | None = None
+    max_steps: int = 10
     verbose: bool = False
     very_verbose: bool = False
-    auto_respond: bool | None = None  # Continue without prompting (default: True for --full)
+    auto_respond: bool | None = None  # Continue without prompting
     group_size: int = 1
+    byok: bool = False
     remote: bool = False
     quiet: bool = False  # Suppress opening browser for eval links
     gateway: bool = False  # Use HUD Gateway for LLM API calls
@@ -208,6 +211,11 @@ class EvalConfig(BaseModel):
     def validate_api_keys(self) -> None:
         """Validate required API keys for the selected agent. Raises typer.Exit on failure."""
+        # BYOK requires remote execution (check before agent_type guard)
+        if self.byok and not self.remote:
+            hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
+            raise typer.Exit(1)
         if self.agent_type is None:
             return
@@ -284,14 +292,11 @@ class EvalConfig(BaseModel):
         if self.model:
             kwargs["model"] = self.model
-        if self.agent_type == AgentType.OPENAI_COMPATIBLE:
+        # For gateway base_url, inject HUD API key if not already set
+        if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
             base_url = kwargs.get("base_url", "")
-            if "api_key" not in kwargs:
-                # Use HUD API key for gateway, otherwise fall back to OpenAI API key
-                if settings.hud_gateway_url in base_url:
-                    kwargs["api_key"] = settings.api_key
-                elif settings.openai_api_key:
-                    kwargs["api_key"] = settings.openai_api_key
+            if settings.hud_gateway_url in base_url and settings.api_key:
+                kwargs["api_key"] = settings.api_key
         # Auto-detect Bedrock when Claude is selected with a Bedrock ARN
         # Check both model and checkpoint_name for ARN patterns
@@ -454,12 +459,20 @@ class EvalConfig(BaseModel):
         overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
-        for k in ("full", "verbose", "very_verbose", "remote", "quiet", "gateway"):
+        for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
             if cli_args.get(k) is True:
                 overrides[k] = True
             elif k in overrides and cli_args.get(k) is False:
                 del overrides[k]
+        # --full is a shortcut for --all --auto-respond --max-steps 100
+        if overrides.get("full"):
+            overrides["all"] = True
+            if "auto_respond" not in overrides:
+                overrides["auto_respond"] = True
+            if "max_steps" not in overrides:
+                overrides["max_steps"] = 100
         if config:
             merged_agent_config = dict(self.agent_config)
             for item in config:
@@ -541,15 +554,13 @@ class EvalConfig(BaseModel):
             table.add_row(
                 "task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
             )
-        table.add_row("full", str(self.full))
-        table.add_row("max_steps", str(self.max_steps or (100 if self.full else 10)))
+        table.add_row("all", str(self.all))
+        table.add_row("max_steps", str(self.max_steps))
         if not self.remote:
             table.add_row("max_concurrent", str(self.max_concurrent))
         if self.group_size > 1:
             table.add_row("group_size", str(self.group_size))
-        # Show auto_respond when it will be true (explicit or via --full)
-        effective_auto_respond = self.auto_respond if self.auto_respond is not None else self.full
-        if effective_auto_respond:
+        if self.auto_respond:
             table.add_row("auto_respond", "[bold green]True[/bold green]")
         if self.very_verbose:
             table.add_row("very_verbose", "[bold green]True[/bold green]")
@@ -559,6 +570,8 @@ class EvalConfig(BaseModel):
             table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
         if self.gateway:
             table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
+        if self.byok:
+            table.add_row("byok", "[bold green]True[/bold green] (remote only)")
         # Tool filters (only if set)
         if self.allowed_tools:
@@ -642,8 +655,8 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             raise typer.Exit(1)
         hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
         tasks = filtered
-    elif not cfg.full:
-        # Single task mode (no --full, no --task-ids)
+    elif not cfg.all:
+        # Single task mode (no --all, --full, or --task-ids)
         tasks = [tasks[0]]
         hud_console.info("Using first task (run with --full or --task-ids for more)…")
@@ -651,14 +664,17 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
     # Prepare agent kwargs
     agent_kwargs = cfg.get_agent_kwargs()
-    auto_respond = cfg.auto_respond if cfg.auto_respond is not None else cfg.full
+    auto_respond = cfg.auto_respond
     if auto_respond:
         agent_kwargs = {**agent_kwargs, "auto_respond": True}
-    max_steps = cfg.max_steps or (100 if cfg.full else 10)
+    max_steps = cfg.max_steps
     # Remote execution - submit to HUD platform
     if cfg.remote:
+        agent_kwargs = {
+            k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
+        }
         # Create a job ID for tracking
         import uuid
@@ -676,9 +692,10 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             agent_params=agent_kwargs,
             max_steps=max_steps,
             group_size=cfg.group_size,
+            use_byok=cfg.byok,
         )
-        hud_console.success(f"Tasks submitted. View at: https://hud.ai/job/{job_id}")
+        hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
         return [], tasks
     # Single task mode - show extra info
@@ -724,7 +741,12 @@ def eval_command(
         None,
         help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test",  # noqa: E501
     ),
-    full: bool = typer.Option(False, "--full", help="Run entire dataset"),
+    all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
+    full: bool = typer.Option(
+        False,
+        "--full",
+        help="Run the entire dataset. Shortcut for --all --auto-respond  --max-steps 100",
+    ),
     model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
     config: list[str] | None = typer.Option(  # noqa: B008
         None, "--config", "-c", help="Agent config: key=value"
@@ -743,10 +765,10 @@ def eval_command(
     max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
     very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
-    auto_respond: bool | None = typer.Option(
-        None,
+    auto_respond: bool = typer.Option(
+        False,
         "--auto-respond",
-        help="Continue without prompting after tool calls (default: True for --full)",
+        help="Automatically prompt the agent to continue if it does not respond with a tool call",
     ),
     group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
     task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
@@ -754,6 +776,11 @@ def eval_command(
     remote: bool = typer.Option(
         False, "--remote", help="Submit tasks to platform for remote execution"
     ),
+    byok: bool = typer.Option(
+        False,
+        "--byok",
+        help="Remote only: use BYOK keys from encrypted env vars for inference",
+    ),
     quiet: bool = typer.Option(
         False, "--quiet", "-q", help="Suppress opening browser for eval links"
     ),
@@ -778,6 +805,7 @@ def eval_command(
         source=source,
         agent=agent,
         model=model,
+        all=all,
         full=full,
         max_concurrent=max_concurrent,
         max_steps=max_steps,
@@ -790,6 +818,7 @@ def eval_command(
         group_size=group_size,
         config=config,
         remote=remote,
+        byok=byok,
         quiet=quiet,
         gateway=gateway,
     )

{hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/init.py RENAMED Viewed

@@ -23,6 +23,8 @@ PRESET_MAP: dict[str, str | None] = {
     "deep-research": "hud-deepresearch",
     "browser": "hud-browser",
     "rubrics": "hud-rubrics",
+    "verilog-coding-template": "verilog-coding-template",
+    "data-science-template": "data-science-template",
 }
 SKIP_DIR_NAMES = {"node_modules", "__pycache__", "dist", "build", ".next", ".git"}
@@ -92,6 +94,8 @@ def _prompt_for_preset() -> str:
             {"name": "browser", "message": "browser"},
             {"name": "deep-research", "message": "deep-research"},
             {"name": "rubrics", "message": "rubrics"},
+            {"name": "verilog-coding-template", "message": "verilog-coding-template"},
+            {"name": "data-science-template", "message": "data-science-template"},
         ]
         display_choices = [c["message"] for c in choices]
         selected = questionary.select(

{hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/runner.py RENAMED Viewed

@@ -99,8 +99,8 @@ async def run_dataset(
     ) as ctx:
         # Create agent fresh for each context (ensures correct tool initialization)
         agent = agent_cls.create(**(agent_params or {}))
-        result = await agent.run(ctx, max_steps=max_steps)
-        ctx.reward = result.reward
+        await agent.run(ctx, max_steps=max_steps)
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
     # For parallel execution, results are collected via ctx.results
     if hasattr(ctx, "results") and ctx.results:
@@ -207,6 +207,7 @@ async def run_single_task(
             ctx.metadata.update(metadata)
         result = await agent.run(ctx, max_steps=max_steps)
-        ctx.reward = result.reward
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
+    # Return the Trace (ctx.reward is set by EvalContext.__aexit__)
     return result

{hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/utils.py RENAMED Viewed

@@ -51,6 +51,10 @@ class SingleTaskRequest(BaseModel):
         description="Additional metadata to inject into the trace context.",
     )
     trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
+    use_byok: bool = Field(
+        default=False,
+        description="If True, use BYOK headers from encrypted env vars for inference.",
+    )
     @model_validator(mode="after")
     def _validate_task(self) -> SingleTaskRequest:
@@ -110,6 +114,7 @@ async def submit_rollouts(
     group_size: int = 1,
     batch_size: int = 50,
     metadata: dict[str, Any] | None = None,
+    use_byok: bool = False,
 ) -> None:
     """Submit rollouts to the HUD platform API for remote execution (fire-and-forget).
@@ -122,6 +127,7 @@ async def submit_rollouts(
         group_size: Number of rollouts per task (for variance estimation)
         batch_size: Number of rollouts per API batch request
         metadata: Additional metadata for each rollout
+        use_byok: If True, use BYOK keys from encrypted env vars (remote only)
     """
     from hud.eval.utils import is_v4_format
@@ -168,6 +174,7 @@ async def submit_rollouts(
                     trace_name=trace_name,
                     group_id=base_task_id if group_size > 1 else None,
                     metadata=metadata or {},
+                    use_byok=use_byok,
                 )
             )

{hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/remote.py RENAMED Viewed

@@ -61,13 +61,12 @@ class RemoteConnectorMixin(MCPConfigConnectorMixin):
             self._hub_config = hub_config
         # Create mcp_config with standard MCP URL and hub slug in headers
+        # Note: Authorization is injected at request time by httpx/aiohttp hooks
+        # in hud.eval.instrument (uses contextvar for api_key).
         mcp_config = {
             "hud": {
                 "url": settings.hud_mcp_url,
-                "headers": {
-                    "Authorization": f"Bearer {settings.api_key}",
-                    "Environment-Name": slug,
-                },
+                "headers": {"Environment-Name": slug},
             }
         }

{hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/environment.py RENAMED Viewed

@@ -323,7 +323,8 @@ class Environment(
                     if conn.is_connected:
                         await conn.disconnect()
                 name, err = errors[0]
-                raise ConnectionError(f"Failed to connect to {name}") from err
+                str_err = str(err).replace("Client failed to connect: ", "")  # Strip from FastMCP
+                raise ConnectionError(f"Failed to connect to {name}: {str_err}") from err
         await self._build_routing()
@@ -399,13 +400,20 @@ class Environment(
         if self._router.is_local(name):
             # Call tool manager directly to avoid FastMCP context requirement
             result = await self._tool_manager.call_tool(name, arguments)
-            return MCPToolResult(content=result.content, isError=False)
+            return MCPToolResult(
+                content=result.content,
+                structuredContent=result.structured_content,
+            )
         connection_name = self._router.get_connection(name)
         if connection_name:
             conn = self._connections[connection_name]
             result = await conn.call_tool(name, arguments)
-            return MCPToolResult(content=result.content, isError=result.isError)
+            return MCPToolResult(
+                content=result.content,
+                isError=result.isError,
+                structuredContent=result.structuredContent,
+            )
         raise ValueError(f"Tool not found: {name}")

hud-python 0.5.0__tar.gz → 0.5.1__tar.gz

hud-python 0.5.0__tar.gz → 0.5.1__tar.gz