hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- hud/__init__.py +1 -1
- hud/agents/__init__.py +65 -6
- hud/agents/base.py +33 -15
- hud/agents/claude.py +60 -31
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +15 -26
- hud/agents/gemini_cua.py +6 -17
- hud/agents/misc/response_agent.py +7 -0
- hud/agents/openai.py +16 -29
- hud/agents/openai_chat.py +3 -19
- hud/agents/operator.py +5 -17
- hud/agents/resolver.py +70 -0
- hud/agents/tests/test_claude.py +2 -4
- hud/agents/tests/test_openai.py +2 -1
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +34 -3
- hud/cli/build.py +37 -5
- hud/cli/dev.py +11 -2
- hud/cli/eval.py +51 -39
- hud/cli/flows/init.py +1 -1
- hud/cli/pull.py +1 -1
- hud/cli/push.py +9 -2
- hud/cli/tests/test_build.py +2 -2
- hud/cli/tests/test_push.py +1 -1
- hud/cli/utils/metadata.py +1 -1
- hud/cli/utils/tests/test_metadata.py +1 -1
- hud/clients/mcp_use.py +6 -1
- hud/datasets/loader.py +17 -18
- hud/datasets/runner.py +16 -10
- hud/datasets/tests/test_loader.py +15 -15
- hud/environment/__init__.py +5 -3
- hud/environment/connection.py +58 -6
- hud/environment/connectors/mcp_config.py +29 -1
- hud/environment/environment.py +218 -77
- hud/environment/router.py +175 -24
- hud/environment/scenarios.py +313 -186
- hud/environment/tests/test_connectors.py +10 -23
- hud/environment/tests/test_environment.py +432 -0
- hud/environment/tests/test_local_connectors.py +81 -40
- hud/environment/tests/test_scenarios.py +820 -14
- hud/eval/context.py +63 -10
- hud/eval/instrument.py +4 -2
- hud/eval/manager.py +79 -12
- hud/eval/task.py +36 -4
- hud/eval/tests/test_eval.py +1 -1
- hud/eval/tests/test_task.py +147 -1
- hud/eval/types.py +2 -0
- hud/eval/utils.py +14 -3
- hud/patches/mcp_patches.py +178 -21
- hud/telemetry/instrument.py +8 -1
- hud/telemetry/tests/test_eval_telemetry.py +8 -8
- hud/tools/__init__.py +2 -0
- hud/tools/agent.py +223 -0
- hud/tools/computer/__init__.py +34 -5
- hud/tools/shell.py +3 -3
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/types.py +62 -34
- hud/utils/hud_console.py +30 -17
- hud/utils/strict_schema.py +1 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/eval/context.py
CHANGED
@@ -155,6 +155,9 @@ class EvalContext(Environment):
         self.answer: str | None = None  # Agent's submitted answer
         self.system_prompt: str | None = None  # From task.agent_config, passed to agent
 
+        # Agent config overrides from task (applied by agent when running)
+        self.append_setup_output: bool = False  # Whether to append setup tool output to prompt
+
         # Error tracking
         self.error: BaseException | None = None
 
@@ -230,13 +233,13 @@ class EvalContext(Environment):
         # using the contextvar set in __aenter__ (supports api_key passed to hud.eval())
         ctx._setup_calls = env._setup_calls.copy()
         ctx._evaluate_calls = env._evaluate_calls.copy()
+        ctx._integration_test_calls = getattr(env, "_integration_test_calls", []).copy()
+        ctx._setup_results = getattr(env, "_setup_results", []).copy()
 
         # Copy scenarios (definitions) by reference - they don't change
         ctx._scenarios = getattr(env, "_scenarios", {})
         # Create fresh session state for this eval (parallel evals each need their own)
-        ctx.
-        ctx._scenario_latest = {}
-        ctx._scenario_answers = {}
+        ctx._active_session = None
 
         # Store source env name for remote scenario lookups
         ctx._source_env_name = env.name
@@ -302,10 +305,20 @@ class EvalContext(Environment):
             code_snippet: Code being evaluated
             trace: Whether to send traces to backend
            quiet: Whether to suppress output
+
+        Raises:
+            ValueError: If task.args is None (template tasks cannot be run directly)
         """
         from hud.environment import Environment
         from hud.eval.task import build_eval_name
 
+        # Validate that task has args (not a template)
+        if task.args is None:
+            raise ValueError(
+                f"Cannot run task with args=None (this is a template). "
+                f"Provide args when creating the task: env('{task.scenario}', **args)"
+            )
+
         eval_name = name or build_eval_name(task.scenario, task.args)
 
         # task.env is guaranteed to be Environment after Task.__post_init__
@@ -328,13 +341,26 @@ class EvalContext(Environment):
         # Store task info for scenario execution
         ctx._task = task
 
-        #
+        # Copy agent_config fields from task to ctx (these override agent defaults)
         if task.agent_config:
-
-
-
-
-
+            agent_config = task.agent_config
+            if isinstance(agent_config, dict):
+                if agent_config.get("system_prompt"):
+                    ctx.system_prompt = agent_config["system_prompt"]
+                if agent_config.get("append_setup_output"):
+                    ctx.append_setup_output = agent_config["append_setup_output"]
+                # Also check append_setup_tool alias
+                if agent_config.get("append_setup_tool"):
+                    ctx.append_setup_output = agent_config["append_setup_tool"]
+            else:
+                # It's a BaseAgentConfig or TaskAgentConfig object
+                if getattr(agent_config, "system_prompt", None):
+                    ctx.system_prompt = agent_config.system_prompt
+                if getattr(agent_config, "append_setup_output", False):
+                    ctx.append_setup_output = agent_config.append_setup_output
+                # Also check append_setup_tool alias
+                if getattr(agent_config, "append_setup_tool", False):
+                    ctx.append_setup_output = True
 
         return ctx
 
@@ -343,7 +369,7 @@ class EvalContext(Environment):
         if self._task is None or self._task.scenario is None:
            return
 
-        prompt = await self.run_scenario_setup(self._task.scenario, self._task.args)
+        prompt = await self.run_scenario_setup(self._task.scenario, self._task.args or {})
        if prompt:
             self.prompt = prompt
 
@@ -417,6 +443,33 @@ class EvalContext(Environment):
         """True if a scenario is running and can accept submissions."""
         return self._task is not None and self._task.scenario is not None
 
+    @property
+    def setup_output(self) -> str | None:
+        """Get setup tool output as formatted string for prepending to agent context.
+
+        Returns None if no setup tools were executed or all results were empty.
+        Used by agents when append_setup_output is enabled.
+        """
+        import mcp.types as mcp_types
+
+        setup_results = getattr(self, "_setup_results", [])
+        if not setup_results:
+            return None
+
+        output_parts: list[str] = []
+        for result in setup_results:
+            if result.content:
+                output_parts.extend(
+                    block.text
+                    for block in result.content
+                    if isinstance(block, mcp_types.TextContent)
+                )
+
+        if not output_parts:
+            return None
+
+        return "\n".join(output_parts)
+
     # =========================================================================
     # Backend Integration
     # =========================================================================
hud/eval/instrument.py
CHANGED
@@ -69,7 +69,8 @@ def _httpx_request_hook(request: Any) -> None:
     headers = _get_trace_headers()
     if headers is not None:
         for key, value in headers.items():
-            request.headers
+            if key.lower() not in {k.lower() for k in request.headers}:
+                request.headers[key] = value
         logger.debug("Added trace headers to request: %s", url_str)
 
     # Auto-inject API key if not present or invalid (prefer contextvar, fallback to settings)
@@ -149,7 +150,8 @@ def _patch_aiohttp() -> None:
     trace_headers = _get_trace_headers()
     if trace_headers is not None:
         for key, value in trace_headers.items():
-            params.headers
+            if key.lower() not in {k.lower() for k in params.headers}:
+                params.headers[key] = value
         logger.debug("Added trace headers to aiohttp request: %s", url_str)
 
     api_key = _get_api_key()
hud/eval/manager.py
CHANGED
@@ -56,14 +56,16 @@ def _get_eval_name(tasks: list[Task] | None = None) -> str:
     return "eval"
 
 
-def _send_job_enter(
+async def _send_job_enter(
     job_id: str,
     name: str,
     variants: dict[str, Any] | None,
     group: int,
     api_key: str | None,
-
-
+    taskset: str | None = None,
+    tasks: list[dict[str, Any]] | None = None,
+) -> list[str] | None:
+    """Send job enter payload (async request before traces start)."""
     import httpx
 
     from hud.eval.types import JobEnterPayload
@@ -71,23 +73,35 @@ def _send_job_enter(
 
     api_key = api_key or settings.api_key
     if not settings.telemetry_enabled or not api_key:
-        return
+        return None
 
     payload = JobEnterPayload(
         name=name,
         variants=variants,
         group=group,
+        taskset=taskset,
+        tasks=tasks if taskset else None,  # only send tasks if taskset specified
     )
 
     try:
-        httpx.
-
-
-
-
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(
+                f"{settings.hud_api_url}/trace/job/{job_id}/enter",
+                json=payload.model_dump(exclude_none=True),
+                headers={"Authorization": f"Bearer {api_key}"},
+            )
+        if resp.is_success:
+            try:
+                data = resp.json()
+            except Exception:
+                return None
+            if isinstance(data, dict):
+                ids = data.get("task_version_ids")
+                if isinstance(ids, list) and all(isinstance(x, str) for x in ids):
+                    return ids
     except Exception as e:
         logger.warning("Failed to send job enter: %s", e)
+    return None
 
 
 @asynccontextmanager
@@ -105,6 +119,7 @@ async def run_eval(
     max_concurrent: int | None = None,
     trace: bool = True,
     quiet: bool = False,
+    taskset: str | None = None,
 ) -> AsyncGenerator[EvalContext, None]:
     """Standalone eval context manager.
 
@@ -235,13 +250,37 @@ async def run_eval(
 
     if total_evals == 1:
         if tasks:
+            # Even for single-task evals, --taskset requires a job_enter call so the run
+            # and task are linked to the taskset (via job_id + task_version_id).
+            job_id_for_run = job_id
+            if taskset:
+                eval_name = _get_eval_name(tasks=tasks)
+                if job_id_for_run is None:
+                    job_id_for_run = str(uuid.uuid4())
+
+                task_data = None
+                if not tasks[0].id:
+                    task_data = [tasks[0].model_dump(mode="json", exclude_none=True)]
+
+                created_task_version_ids = await _send_job_enter(
+                    job_id=job_id_for_run,
+                    name=eval_name,
+                    variants=variants,
+                    group=group,
+                    api_key=api_key,
+                    taskset=taskset,
+                    tasks=task_data,
+                )
+                if created_task_version_ids and not tasks[0].id:
+                    tasks[0].id = created_task_version_ids[0]
+
             # Single task - use EvalContext.from_task()
             ctx = EvalContext.from_task(
                 tasks[0],
                 name=name,
                 trace_id=trace_id,
                 api_key=api_key,
-                job_id=
+                job_id=job_id_for_run,
                 group_id=group_id,
                 variants=variant_combos[0],
                 code_snippet=code_snippet,
@@ -273,13 +312,41 @@ async def run_eval(
         job_url = f"https://hud.ai/jobs/{implicit_job_id}"
 
         # Send job enter (sync request before traces start)
-
+        # Serialize tasks for auto-add to taskset (only tasks without existing backend id).
+        # For v5 scenario tasks, the backend task_version_id is carried in Task.id.
+        tasks_data = None
+        tasks_to_create: list[Task] = []
+        if taskset and tasks:
+            tasks_to_create = [t for t in tasks if not t.id]
+            tasks_data = (
+                [t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
+                if tasks_to_create
+                else None
+            )
+        created_task_version_ids = await _send_job_enter(
             job_id=implicit_job_id,
             name=eval_name,
             variants=variants,
             group=group,
             api_key=api_key,
+            taskset=taskset,
+            tasks=tasks_data,
         )
+        if created_task_version_ids and tasks_to_create:
+            # Assign backend IDs back onto the in-memory tasks so trace enter includes
+            # task_version_id.
+            # Platform guarantees ordered one-to-one mapping, but warn if counts differ.
+            if len(created_task_version_ids) != len(tasks_to_create):
+                logger.warning(
+                    "Task count mismatch: sent %d tasks, received %d IDs. "
+                    "Some tasks may not be linked to the taskset.",
+                    len(tasks_to_create),
+                    len(created_task_version_ids),
+                )
+            for task_obj, task_version_id in zip(
+                tasks_to_create, created_task_version_ids, strict=False
+            ):
+                task_obj.id = task_version_id
 
         # Print job URL (not individual trace URLs)
         if not quiet:
hud/eval/task.py
CHANGED
@@ -53,6 +53,9 @@ class TaskAgentConfig(BaseModel):
     """Agent configuration for a Task.
 
     Contains settings that should be passed to the agent when running this task.
+
+    Note: allowed_tools/disallowed_tools are handled at the Environment level
+    (via env.include()/env.exclude() for v5, or extracted by build_env_from_v4() for v4).
     """
 
     model_config = ConfigDict(extra="ignore")
@@ -62,12 +65,26 @@ class TaskAgentConfig(BaseModel):
         description="Custom system prompt to pass to the agent",
     )
 
+    # Agent behavior settings (from v4 agent_config, applied by EvalContext)
+    append_setup_output: bool = Field(
+        default=False,
+        description="Append setup tool output to the agent's initial prompt",
+    )
+    append_setup_tool: bool = Field(
+        default=False,
+        description="Alias for append_setup_output (backwards compat)",
+    )
+
     @model_validator(mode="before")
     @classmethod
     def warn_extra_fields(cls, data: Any) -> Any:
         """Warn about extra fields that will be ignored."""
         if isinstance(data, dict):
-            known_fields = {
+            known_fields = {
+                "system_prompt",
+                "append_setup_output",
+                "append_setup_tool",
+            }
             extra = set(data.keys()) - known_fields
             if extra:
                 logger.warning(
@@ -148,7 +165,10 @@ class Task(BaseModel):
     env: Any = Field(default=None)  # Typed as Any for input flexibility, validated below
     scenario: str | None = None
     id: str | None = None
-    args: dict[str, Any] = Field(
+    args: dict[str, Any] | None = Field(
+        default=None,
+        description="Scenario arguments. None indicates a template (args filled in later).",
+    )
     validation: list[MCPToolCall] | None = None
 
     # Agent config - settings passed to agent (system_prompt, etc.)
@@ -284,8 +304,20 @@ class Task(BaseModel):
         ]
 
         # Preserve agent_config
+        agent_config: dict[str, Any] = {}
         if data.get("agent_config"):
-
+            agent_config.update(data["agent_config"])
+        # Restore tool filters from Environment (they were extracted during v4 conversion)
+        if self.env is not None:
+            if getattr(self.env, "_agent_include", None) is not None:
+                agent_config["allowed_tools"] = self.env._agent_include
+            elif "allowed_tools" not in agent_config:
+                # ["*"] was converted to None, restore it for serialization
+                agent_config["allowed_tools"] = ["*"]
+            if getattr(self.env, "_agent_exclude", None) is not None:
+                agent_config["disallowed_tools"] = self.env._agent_exclude
+        if agent_config:
+            result["agent_config"] = agent_config
 
         # Preserve metadata
         if data.get("metadata"):
@@ -335,6 +367,6 @@ class Task(BaseModel):
             id=self.id,
             env=self.env,  # Share reference
             scenario=self.scenario,
-            args=self.args.copy() if self.args else
+            args=self.args.copy() if self.args is not None else None,
             validation=self.validation.copy() if self.validation else None,
         )
hud/eval/tests/test_eval.py
CHANGED
@@ -16,7 +16,7 @@ class TestTaskDataclass:
 
         assert task.env is None
         assert task.scenario is None
-        assert task.args
+        assert task.args is None  # None = template, {} = runnable with no args
 
     def test_init_with_env_dict(self) -> None:
         """Task auto-converts env dict to Environment via validator."""
hud/eval/tests/test_task.py
CHANGED
@@ -85,13 +85,24 @@ class TestTaskSerialization:
         task = Task.from_v4(v4_dict)
         data = task.model_dump(mode="json")
 
-
+        # agent_config should preserve system_prompt and restore tool filters
+        agent_config = data.get("agent_config")
+        assert agent_config is not None
+        assert agent_config["system_prompt"] == "Custom system prompt"
+        # allowed_tools defaults to ["*"] when not specified (restored during serialization)
+        assert agent_config["allowed_tools"] == ["*"]
+        # These have default False values from TaskAgentConfig
+        assert agent_config["append_setup_output"] is False
+        assert agent_config["append_setup_tool"] is False
 
         # Roundtrip
         task2 = Task(**data)
         assert task2.agent_config is not None
         assert isinstance(task2.agent_config, TaskAgentConfig)
         assert task2.agent_config.system_prompt == "Custom system prompt"
+        # Tool filters should be on Environment after roundtrip
+        assert task2.env is not None
+        assert task2.env._agent_include is None  # ["*"] → None
 
     def test_v4_preserves_metadata(self) -> None:
         """v4 Task preserves metadata through roundtrip."""
@@ -143,3 +154,138 @@ class TestTaskValidation:
 
         assert isinstance(task.agent_config, TaskAgentConfig)
         assert task.agent_config.system_prompt == "Hello"
+
+
+class TestV4AgentConfigToolFilters:
+    """Tests for v4 agent_config.allowed_tools and disallowed_tools processing."""
+
+    def test_v4_extracts_allowed_tools(self) -> None:
+        """v4 allowed_tools is extracted and stored on Environment."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["browser_*", "file_read"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        assert task.env._agent_include == ["browser_*", "file_read"]
+
+    def test_v4_extracts_disallowed_tools(self) -> None:
+        """v4 disallowed_tools is extracted and stored on Environment."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "disallowed_tools": ["*setup*", "*evaluate*", "checkout_branch"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        assert task.env._agent_exclude == ["*setup*", "*evaluate*", "checkout_branch"]
+
+    def test_v4_wildcard_star_allowed_converts_to_none(self) -> None:
+        """v4 allowed_tools=['*'] converts to None (meaning include all)."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        # ["*"] should be converted to None
+        assert task.env._agent_include is None
+
+    def test_v4_both_allowed_and_disallowed(self) -> None:
+        """v4 supports both allowed_tools and disallowed_tools together."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+                "disallowed_tools": ["*setup*", "*evaluate*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        assert task.env._agent_include is None  # ["*"] → None
+        assert task.env._agent_exclude == ["*setup*", "*evaluate*"]
+
+    @pytest.mark.asyncio
+    async def test_v4_tool_filters_applied_in_as_tools(self) -> None:
+        """v4 tool filters are applied when calling env.as_tools()."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+                "disallowed_tools": ["*setup*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+        env = task.env
+        assert env is not None
+
+        # Add local tools to test filtering
+        @env.tool()
+        def my_setup_tool() -> str:
+            """Should be filtered out."""
+            return "setup"
+
+        @env.tool()
+        def run_query() -> str:
+            """Should be visible."""
+            return "query"
+
+        await env._build_routing()
+
+        tools = env.as_tools()
+        tool_names = [t.name for t in tools]
+
+        assert "my_setup_tool" not in tool_names
+        assert "run_query" in tool_names
+
+    def test_v4_tool_filters_preserved_in_serialization(self) -> None:
+        """v4 tool filters are preserved when serializing for remote execution."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+                "disallowed_tools": ["*setup*", "*evaluate*", "*grade*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        # Serialize (this is what gets sent to remote execution)
+        data = task.model_dump(mode="json")
+
+        # agent_config must include the tool filters for remote execution
+        assert "agent_config" in data
+        assert data["agent_config"]["allowed_tools"] == ["*"]
+        assert data["agent_config"]["disallowed_tools"] == ["*setup*", "*evaluate*", "*grade*"]
+
+        # Verify roundtrip works (remote worker will deserialize this)
+        task2 = Task(**data)
+        assert task2.env is not None
+        assert task2.env._agent_include is None  # ["*"] → None
+        assert task2.env._agent_exclude == ["*setup*", "*evaluate*", "*grade*"]
hud/eval/types.py
CHANGED
@@ -53,6 +53,8 @@ class JobEnterPayload(BaseModel):
     name: str | None = None
     variants: dict[str, Any] | None = None  # Full variant config
     group: int | None = None
+    taskset: str | None = None  # taskset slug to associate job with
+    tasks: list[dict[str, Any]] | None = None  # task definitions to add to taskset
 
 
 __all__ = [
hud/eval/utils.py
CHANGED
@@ -138,6 +138,7 @@ def build_env_from_v4(source: dict[str, Any] | Any) -> dict[str, Any]:
     }
 
     # Map integration_test_tool → validation (same concept: tool calls to verify)
+    # Also populate _integration_test_calls for IntegrationTestRunner compatibility
     if legacy.integration_test_tool:
         int_test = legacy.integration_test_tool
         if not isinstance(int_test, list):
@@ -147,10 +148,20 @@ def build_env_from_v4(source: dict[str, Any] | Any) -> dict[str, Any]:
             call if isinstance(call, MCPToolCall) else MCPToolCall(**call.model_dump())
             for call in int_test
         ]
+        # Populate _integration_test_calls on env for IntegrationTestRunner
+        env._integration_test_calls = [(call.name, call.arguments or {}) for call in int_test]
 
-    # Extract agent_config
-    if legacy.agent_config
-
+    # Extract agent_config fields that need to be passed through
+    if legacy.agent_config:
+        agent_config_dict: dict[str, Any] = {}
+        if legacy.agent_config.system_prompt:
+            agent_config_dict["system_prompt"] = legacy.agent_config.system_prompt
+        if legacy.agent_config.append_setup_output:
+            agent_config_dict["append_setup_output"] = legacy.agent_config.append_setup_output
+        if legacy.agent_config.append_setup_tool:
+            agent_config_dict["append_setup_tool"] = legacy.agent_config.append_setup_tool
+        if agent_config_dict:
+            result["agent_config"] = agent_config_dict
 
     # Preserve metadata
     if legacy.metadata: