PyPI - hud-python - Versions diffs - 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl - Mend

hud-python 0.4.45 (py3-none-any.whl) → 0.5.13 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (282) hide show

hud/__init__.py +27 -7
hud/agents/__init__.py +70 -5
hud/agents/base.py +238 -500
hud/agents/claude.py +236 -247
hud/agents/gateway.py +42 -0
hud/agents/gemini.py +264 -0
hud/agents/gemini_cua.py +324 -0
hud/agents/grounded_openai.py +98 -100
hud/agents/misc/integration_test_agent.py +51 -20
hud/agents/misc/response_agent.py +48 -36
hud/agents/openai.py +282 -296
hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
hud/agents/operator.py +199 -0
hud/agents/resolver.py +70 -0
hud/agents/tests/conftest.py +133 -0
hud/agents/tests/test_base.py +300 -622
hud/agents/tests/test_base_runtime.py +233 -0
hud/agents/tests/test_claude.py +381 -214
hud/agents/tests/test_client.py +9 -10
hud/agents/tests/test_gemini.py +369 -0
hud/agents/tests/test_grounded_openai_agent.py +65 -50
hud/agents/tests/test_openai.py +377 -140
hud/agents/tests/test_operator.py +362 -0
hud/agents/tests/test_resolver.py +192 -0
hud/agents/tests/test_run_eval.py +179 -0
hud/agents/types.py +148 -0
hud/cli/__init__.py +493 -546
hud/cli/analyze.py +43 -5
hud/cli/build.py +699 -113
hud/cli/debug.py +8 -5
hud/cli/dev.py +889 -732
hud/cli/eval.py +793 -667
hud/cli/flows/dev.py +167 -0
hud/cli/flows/init.py +191 -0
hud/cli/flows/tasks.py +153 -56
hud/cli/flows/templates.py +151 -0
hud/cli/flows/tests/__init__.py +1 -0
hud/cli/flows/tests/test_dev.py +126 -0
hud/cli/init.py +60 -58
hud/cli/pull.py +1 -1
hud/cli/push.py +38 -13
hud/cli/rft.py +311 -0
hud/cli/rft_status.py +145 -0
hud/cli/tests/test_analyze.py +5 -5
hud/cli/tests/test_analyze_metadata.py +3 -2
hud/cli/tests/test_analyze_module.py +120 -0
hud/cli/tests/test_build.py +110 -8
hud/cli/tests/test_build_failure.py +41 -0
hud/cli/tests/test_build_module.py +50 -0
hud/cli/tests/test_cli_init.py +6 -1
hud/cli/tests/test_cli_more_wrappers.py +30 -0
hud/cli/tests/test_cli_root.py +140 -0
hud/cli/tests/test_convert.py +361 -0
hud/cli/tests/test_debug.py +12 -10
hud/cli/tests/test_dev.py +197 -0
hud/cli/tests/test_eval.py +251 -0
hud/cli/tests/test_eval_bedrock.py +51 -0
hud/cli/tests/test_init.py +124 -0
hud/cli/tests/test_main_module.py +11 -5
hud/cli/tests/test_mcp_server.py +12 -100
hud/cli/tests/test_push.py +1 -1
hud/cli/tests/test_push_happy.py +74 -0
hud/cli/tests/test_push_wrapper.py +23 -0
hud/cli/tests/test_registry.py +1 -1
hud/cli/tests/test_utils.py +1 -1
hud/cli/{rl → utils}/celebrate.py +14 -12
hud/cli/utils/config.py +18 -1
hud/cli/utils/docker.py +130 -4
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/git.py +136 -0
hud/cli/utils/interactive.py +39 -5
hud/cli/utils/metadata.py +70 -1
hud/cli/utils/runner.py +1 -1
hud/cli/utils/server.py +2 -2
hud/cli/utils/source_hash.py +3 -3
hud/cli/utils/tasks.py +4 -1
hud/cli/utils/tests/__init__.py +0 -0
hud/cli/utils/tests/test_config.py +58 -0
hud/cli/utils/tests/test_docker.py +93 -0
hud/cli/utils/tests/test_docker_hints.py +71 -0
hud/cli/utils/tests/test_env_check.py +74 -0
hud/cli/utils/tests/test_environment.py +42 -0
hud/cli/utils/tests/test_git.py +142 -0
hud/cli/utils/tests/test_interactive_module.py +60 -0
hud/cli/utils/tests/test_local_runner.py +50 -0
hud/cli/utils/tests/test_logging_utils.py +23 -0
hud/cli/utils/tests/test_metadata.py +49 -0
hud/cli/utils/tests/test_package_runner.py +35 -0
hud/cli/utils/tests/test_registry_utils.py +49 -0
hud/cli/utils/tests/test_remote_runner.py +25 -0
hud/cli/utils/tests/test_runner_modules.py +52 -0
hud/cli/utils/tests/test_source_hash.py +36 -0
hud/cli/utils/tests/test_tasks.py +80 -0
hud/cli/utils/version_check.py +258 -0
hud/cli/{rl → utils}/viewer.py +2 -2
hud/clients/README.md +12 -11
hud/clients/__init__.py +4 -3
hud/clients/base.py +166 -26
hud/clients/environment.py +51 -0
hud/clients/fastmcp.py +13 -6
hud/clients/mcp_use.py +45 -15
hud/clients/tests/test_analyze_scenarios.py +206 -0
hud/clients/tests/test_protocol.py +9 -3
hud/datasets/__init__.py +23 -20
hud/datasets/loader.py +326 -0
hud/datasets/runner.py +198 -105
hud/datasets/tests/__init__.py +0 -0
hud/datasets/tests/test_loader.py +221 -0
hud/datasets/tests/test_utils.py +315 -0
hud/datasets/utils.py +270 -90
hud/environment/__init__.py +52 -0
hud/environment/connection.py +258 -0
hud/environment/connectors/__init__.py +33 -0
hud/environment/connectors/base.py +68 -0
hud/environment/connectors/local.py +177 -0
hud/environment/connectors/mcp_config.py +137 -0
hud/environment/connectors/openai.py +101 -0
hud/environment/connectors/remote.py +172 -0
hud/environment/environment.py +835 -0
hud/environment/integrations/__init__.py +45 -0
hud/environment/integrations/adk.py +67 -0
hud/environment/integrations/anthropic.py +196 -0
hud/environment/integrations/gemini.py +92 -0
hud/environment/integrations/langchain.py +82 -0
hud/environment/integrations/llamaindex.py +68 -0
hud/environment/integrations/openai.py +238 -0
hud/environment/mock.py +306 -0
hud/environment/router.py +263 -0
hud/environment/scenarios.py +620 -0
hud/environment/tests/__init__.py +1 -0
hud/environment/tests/test_connection.py +317 -0
hud/environment/tests/test_connectors.py +205 -0
hud/environment/tests/test_environment.py +593 -0
hud/environment/tests/test_integrations.py +257 -0
hud/environment/tests/test_local_connectors.py +242 -0
hud/environment/tests/test_scenarios.py +1086 -0
hud/environment/tests/test_tools.py +208 -0
hud/environment/types.py +23 -0
hud/environment/utils/__init__.py +35 -0
hud/environment/utils/formats.py +215 -0
hud/environment/utils/schema.py +171 -0
hud/environment/utils/tool_wrappers.py +113 -0
hud/eval/__init__.py +67 -0
hud/eval/context.py +727 -0
hud/eval/display.py +299 -0
hud/eval/instrument.py +187 -0
hud/eval/manager.py +533 -0
hud/eval/parallel.py +268 -0
hud/eval/task.py +372 -0
hud/eval/tests/__init__.py +1 -0
hud/eval/tests/test_context.py +178 -0
hud/eval/tests/test_eval.py +210 -0
hud/eval/tests/test_manager.py +152 -0
hud/eval/tests/test_parallel.py +168 -0
hud/eval/tests/test_task.py +291 -0
hud/eval/types.py +65 -0
hud/eval/utils.py +194 -0
hud/patches/__init__.py +19 -0
hud/patches/mcp_patches.py +308 -0
hud/patches/warnings.py +54 -0
hud/samples/browser.py +4 -4
hud/server/__init__.py +2 -1
hud/server/low_level.py +2 -1
hud/server/router.py +164 -0
hud/server/server.py +567 -80
hud/server/tests/test_mcp_server_integration.py +11 -11
hud/server/tests/test_mcp_server_more.py +1 -1
hud/server/tests/test_server_extra.py +2 -0
hud/settings.py +45 -3
hud/shared/exceptions.py +36 -10
hud/shared/hints.py +26 -1
hud/shared/requests.py +15 -3
hud/shared/tests/test_exceptions.py +40 -31
hud/shared/tests/test_hints.py +167 -0
hud/telemetry/__init__.py +20 -19
hud/telemetry/exporter.py +201 -0
hud/telemetry/instrument.py +165 -253
hud/telemetry/tests/test_eval_telemetry.py +356 -0
hud/telemetry/tests/test_exporter.py +258 -0
hud/telemetry/tests/test_instrument.py +401 -0
hud/tools/__init__.py +18 -2
hud/tools/agent.py +223 -0
hud/tools/apply_patch.py +639 -0
hud/tools/base.py +54 -4
hud/tools/bash.py +2 -2
hud/tools/computer/__init__.py +36 -3
hud/tools/computer/anthropic.py +2 -2
hud/tools/computer/gemini.py +385 -0
hud/tools/computer/hud.py +23 -6
hud/tools/computer/openai.py +20 -21
hud/tools/computer/qwen.py +434 -0
hud/tools/computer/settings.py +37 -0
hud/tools/edit.py +3 -7
hud/tools/executors/base.py +4 -2
hud/tools/executors/pyautogui.py +1 -1
hud/tools/grounding/grounded_tool.py +13 -18
hud/tools/grounding/grounder.py +10 -31
hud/tools/grounding/tests/test_grounded_tool.py +26 -44
hud/tools/jupyter.py +330 -0
hud/tools/playwright.py +18 -3
hud/tools/shell.py +308 -0
hud/tools/tests/test_agent_tool.py +355 -0
hud/tools/tests/test_apply_patch.py +718 -0
hud/tools/tests/test_computer.py +4 -9
hud/tools/tests/test_computer_actions.py +24 -2
hud/tools/tests/test_jupyter_tool.py +181 -0
hud/tools/tests/test_shell.py +596 -0
hud/tools/tests/test_submit.py +85 -0
hud/tools/tests/test_types.py +193 -0
hud/tools/types.py +21 -1
hud/types.py +194 -56
hud/utils/__init__.py +2 -0
hud/utils/env.py +67 -0
hud/utils/hud_console.py +89 -18
hud/utils/mcp.py +15 -58
hud/utils/strict_schema.py +162 -0
hud/utils/tests/test_init.py +1 -2
hud/utils/tests/test_mcp.py +1 -28
hud/utils/tests/test_pretty_errors.py +186 -0
hud/utils/tests/test_tool_shorthand.py +154 -0
hud/utils/tests/test_version.py +1 -1
hud/utils/types.py +20 -0
hud/version.py +1 -1
hud_python-0.5.13.dist-info/METADATA +264 -0
hud_python-0.5.13.dist-info/RECORD +305 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
hud/agents/langchain.py +0 -261
hud/agents/lite_llm.py +0 -72
hud/cli/rl/__init__.py +0 -180
hud/cli/rl/config.py +0 -101
hud/cli/rl/display.py +0 -133
hud/cli/rl/gpu.py +0 -63
hud/cli/rl/gpu_utils.py +0 -321
hud/cli/rl/local_runner.py +0 -595
hud/cli/rl/presets.py +0 -96
hud/cli/rl/remote_runner.py +0 -463
hud/cli/rl/rl_api.py +0 -150
hud/cli/rl/vllm.py +0 -177
hud/cli/rl/wait_utils.py +0 -89
hud/datasets/parallel.py +0 -687
hud/misc/__init__.py +0 -1
hud/misc/claude_plays_pokemon.py +0 -292
hud/otel/__init__.py +0 -35
hud/otel/collector.py +0 -142
hud/otel/config.py +0 -181
hud/otel/context.py +0 -570
hud/otel/exporters.py +0 -369
hud/otel/instrumentation.py +0 -135
hud/otel/processors.py +0 -121
hud/otel/tests/__init__.py +0 -1
hud/otel/tests/test_processors.py +0 -197
hud/rl/README.md +0 -30
hud/rl/__init__.py +0 -1
hud/rl/actor.py +0 -176
hud/rl/buffer.py +0 -405
hud/rl/chat_template.jinja +0 -101
hud/rl/config.py +0 -192
hud/rl/distributed.py +0 -132
hud/rl/learner.py +0 -637
hud/rl/tests/__init__.py +0 -1
hud/rl/tests/test_learner.py +0 -186
hud/rl/train.py +0 -382
hud/rl/types.py +0 -101
hud/rl/utils/start_vllm_server.sh +0 -30
hud/rl/utils.py +0 -524
hud/rl/vllm_adapter.py +0 -143
hud/telemetry/job.py +0 -352
hud/telemetry/replay.py +0 -74
hud/telemetry/tests/test_replay.py +0 -40
hud/telemetry/tests/test_trace.py +0 -63
hud/telemetry/trace.py +0 -158
hud/utils/agent_factories.py +0 -86
hud/utils/async_utils.py +0 -65
hud/utils/group_eval.py +0 -223
hud/utils/progress.py +0 -149
hud/utils/tasks.py +0 -127
hud/utils/tests/test_async_utils.py +0 -173
hud/utils/tests/test_progress.py +0 -261
hud_python-0.4.45.dist-info/METADATA +0 -552
hud_python-0.4.45.dist-info/RECORD +0 -228
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0

hud/environment/environment.py ADDED Viewed

@@ -0,0 +1,835 @@
+"""Environment class - unified MCP server and client."""
+from __future__ import annotations
+import asyncio
+import logging
+from collections.abc import Awaitable, Callable
+from typing import TYPE_CHECKING, Any, Literal, Self
+import mcp.types as mcp_types
+from hud.environment.connectors import ConnectorsMixin
+from hud.environment.integrations import IntegrationsMixin
+from hud.environment.mock import MockMixin
+from hud.environment.router import ConflictResolution, ToolRouter
+from hud.environment.scenarios import ScenarioMixin
+from hud.server.server import MCPServer
+from hud.types import MCPToolResult
+if TYPE_CHECKING:
+    import types
+    from hud.environment.connection import Connector
+    from hud.eval.task import Task
+# Public API of this module
+__all__ = ["Environment"]
+# Module-level logger, named after this module
+logger = logging.getLogger(__name__)
+# Suppress verbose fastmcp logging (only WARNING and above from these loggers)
+logging.getLogger("fastmcp.server.server").setLevel(logging.WARNING)
+logging.getLogger("fastmcp.server.openapi").setLevel(logging.WARNING)
+# Type alias for async callables (no-arg functions that return awaitable)
+AsyncCallable = Callable[[], Awaitable[Any]]
+class Environment(
+    ConnectorsMixin,
+    IntegrationsMixin,
+    MockMixin,
+    ScenarioMixin,
+    MCPServer,
+):
+    """Unified MCP environment that acts as both server and client.
+    Features:
+        - Define local tools with @env.tool decorator
+        - Connect to HUD Hub, URLs, or mcp_config dicts
+        - Automatic tool routing (local vs remote)
+        - Format tools for any LLM provider
+        - Integrate with popular agent frameworks
+        - Mock mode for testing without real connections
+    Connector methods (connect to sources):
+        connect_hub(name) - HUD Hub environment
+        connect_url(url) - MCP server via URL
+        connect_mcp(config) - Single mcp_config server
+        connect_mcp_config(mcp_config) - Multiple mcp_config servers
+        connect_image(image) - Docker image via stdio
+        connect_fastapi(app) - Mount FastAPI app as MCP server
+        connect_openapi(spec) - Mount OpenAPI spec as MCP server
+        connect_server(server) - Mount MCPServer/FastMCP directly
+    Mock methods (for testing):
+        mock() - Enable mock mode, all tools return mock values
+        unmock() - Disable mock mode
+        mock_tool(name, output) - Set specific mock output for a tool
+        is_mock - Check if mock mode is enabled
+    OpenAI integrations:
+        as_openai_chat_tools() - Chat Completions format
+        as_openai_responses_tools() - Responses API format
+        as_openai_agent_tools() - Agents SDK (requires openai-agents)
+    Anthropic/Claude integrations:
+        as_claude_tools() - Claude API format
+        as_claude_programmatic_tools() - Programmatic tool use
+        as_anthropic_runner() - Tool runner (requires anthropic)
+    Google/Gemini integrations:
+        as_gemini_tools() - Gemini format
+        as_gemini_tool_config() - Tool execution config
+    LangChain integrations:
+        as_langchain_tools() - StructuredTools (requires langchain-core)
+    Example:
+        ```python
+        env = Environment("my-env")
+        @env.tool
+        def greet(name: str) -> str:
+            return f"Hello, {name}!"
+        env.connect_hub("browser", prefix="browser")
+        async with env:
+            # Get tools in any format
+            openai_tools = env.as_openai_chat_tools()
+            claude_tools = env.as_claude_tools()
+            # Call tools - automatically routed
+            result = await env.call_tool("greet", name="World")
+            # Or pass provider-specific format - auto-detected
+            result = await env.call_tool(response.choices[0].message.tool_calls[0])
+        # Mock mode for testing
+        env.mock()
+        env.mock_tool("browser_navigate", "Navigation successful")
+        async with env:
+            result = await env.call_tool("browser_navigate", url="https://example.com")
+            # Returns mock value instead of actually navigating
+        ```
+    """
+    MAX_CONCURRENT_CONNECTIONS = 10
+    @staticmethod
+    def _normalize_name(name: str) -> str:
+        """Return a canonical, hyphenated, lowercase form of an environment name.
+        The name is trimmed and lowercased, spaces and underscores become
+        hyphens, every character outside ``[a-z0-9-]`` is dropped, runs of
+        hyphens collapse into one, and hyphens at either end are removed.
+        Falls back to ``"environment"`` when nothing is left.
+        """
+        import re
+        slug = name.strip().lower().replace(" ", "-").replace("_", "-")
+        # Drop disallowed characters first, then squeeze repeated hyphens
+        slug = re.sub(r"[^a-z0-9-]", "", slug)
+        slug = re.sub(r"-+", "-", slug)
+        slug = slug.strip("-")
+        return slug if slug else "environment"
+    def __init__(
+        self,
+        name: str = "environment",
+        instructions: str | None = None,
+        conflict_resolution: ConflictResolution = ConflictResolution.PREFIX,
+        **fastmcp_kwargs: Any,
+    ) -> None:
+        """Create an environment with no connections and empty tool-call queues.
+        Args:
+            name: Environment name; normalized with ``_normalize_name`` before
+                it is passed to the parent constructor.
+            instructions: Optional instructions forwarded to the parent constructor.
+            conflict_resolution: Strategy handed to the ``ToolRouter``.
+            **fastmcp_kwargs: Extra keyword arguments forwarded to the parent constructor.
+        """
+        # Canonicalize the name up front to avoid casing/spacing mismatches
+        super().__init__(
+            name=self._normalize_name(name),
+            instructions=instructions,
+            **fastmcp_kwargs,
+        )
+        # Connectors keyed by connection name, and the router built over them
+        self._connections: dict[str, Connector] = {}
+        self._router = ToolRouter(conflict_resolution=conflict_resolution)
+        # One "built" flag per MCP primitive so only invalidated routing is rebuilt
+        self._tool_routing_built = False
+        self._prompt_routing_built = False
+        self._resource_routing_built = False
+        self._in_context = False
+        # Queued (tool name, arguments) pairs, executed once connections exist
+        self._setup_calls: list[tuple[str, dict[str, Any]]] = []
+        self._evaluate_calls: list[tuple[str, dict[str, Any]]] = []
+        self._integration_test_calls: list[tuple[str, dict[str, Any]]] = []
+        # Results of setup tools, kept for the append_setup_output feature
+        self._setup_results: list[MCPToolResult] = []
+        # Environment-wide default prompt (EvalContext carries a per-run prompt)
+        self.prompt: str | None = None
+        # Serialization support:
+        #   _hub_config - set by connect_hub(), v5 format {"name": "hub", "include": [...]}
+        #   _mcp_config - set by connect_mcp_config(), v4 format {"server_name": {...}}
+        self._hub_config: dict[str, Any] | None = None
+        self._mcp_config: dict[str, dict[str, Any]] | None = None
+        # Agent-level filters, applied in as_tools() rather than per connection,
+        # so the Environment itself can still call every tool
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+        # Mixin state: mock first, then scenarios
+        self._init_mock()
+        self._init_scenarios()
+    # =========================================================================
+    # Core Methods
+    # =========================================================================
+    def as_tools(self) -> list[mcp_types.Tool]:
+        """Return the routed tools in MCP format, filtered for agent visibility.
+        When neither ``_agent_include`` nor ``_agent_exclude`` is set, the
+        router's tool list is returned as-is. Otherwise a tool is kept only if
+        it matches at least one include pattern (when an include list is set)
+        and matches no exclude pattern (when an exclude list is set). Patterns
+        are fnmatch-style wildcards (e.g., "*setup*", "browser_*").
+        """
+        import fnmatch
+        all_tools = self._router.tools
+        include, exclude = self._agent_include, self._agent_exclude
+        # No agent-level filters configured: nothing to do
+        if include is None and exclude is None:
+            return all_tools
+        def matches_any(tool_name: str, patterns: list[str]) -> bool:
+            return any(fnmatch.fnmatch(tool_name, pattern) for pattern in patterns)
+        visible = []
+        for tool in all_tools:
+            allowed = include is None or matches_any(tool.name, include)
+            blocked = exclude is not None and matches_any(tool.name, exclude)
+            if allowed and not blocked:
+                visible.append(tool)
+        return visible
+    def add_tool(self, obj: Any, **kwargs: Any) -> None:
+        """Register a tool via the parent class and mark tool routing as stale."""
+        super().add_tool(obj, **kwargs)
+        # Prompt and resource routing are unaffected by a new tool
+        self._tool_routing_built = False
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> Any:
+        """Execute a single tool call supplied in any supported format.
+        Supported inputs:
+            - String with kwargs: call_tool("navigate", url="...")
+            - Tuple: call_tool(("navigate", {"url": "..."}))
+            - MCPToolCall: call_tool(MCPToolCall(name="navigate", ...))
+            - OpenAI: call_tool(response.choices[0].message.tool_calls[0])
+            - Claude: call_tool(response.content[0])  # tool_use block
+            - Gemini: call_tool(response.candidates[0].content.parts[0])
+        Returns:
+            The result, formatted to mirror the input format
+            (OpenAI -> OpenAI tool message, etc.)
+        """
+        from hud.environment.utils import format_result, parse_tool_call
+        # kwargs are merged into the parsed call when ``call`` is a string
+        tool_call, detected_format = parse_tool_call(call, **kwargs)
+        tool_name = tool_call.name
+        tool_args = tool_call.arguments or {}
+        raw_result = await self._execute_tool(tool_name, tool_args)
+        return format_result(raw_result, tool_call, detected_format)
+    def _connections_with_tool(self, tool_name: str) -> set[str]:
+        """Return the names of connections whose cached tools include ``tool_name``.
+        Availability is read from each Connector's ``cached_tools``.
+        """
+        return {
+            conn_name
+            for conn_name, conn in self._connections.items()
+            if tool_name in {tool.name for tool in conn.cached_tools}
+        }
+    async def _broadcast_tool(
+        self,
+        tool_name: str,
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """Broadcast a tool call to all connections that have the tool.
+        Automatically filters to only connections where the tool exists
+        (based on cached_tools from initial discovery).
+        For internal tools (starting with _), tries ALL connections since
+        internal tools are hidden from list_tools() and won't be in cached_tools.
+        Connections without a client are skipped and do not appear in the result.
+        Args:
+            tool_name: Name of the tool to call
+            **kwargs: Arguments to pass to the tool
+        Returns:
+            Dict mapping connection name to result (or the exception it raised)
+        """
+        # For internal tools (underscore prefix), try ALL connections since
+        # they're hidden from list_tools() and won't appear in cached_tools.
+        # For regular tools, only try connections that advertise the tool.
+        if tool_name.startswith("_"):
+            targets = set(self._connections)
+        else:
+            targets = self._connections_with_tool(tool_name)
+        results: dict[str, Any] = {}
+        async def call_one(name: str) -> None:
+            connector = self._connections.get(name)
+            if not connector or not connector.client:
+                return
+            try:
+                # connector.call_tool expects arguments as a dict. Each call gets
+                # its own copy so one connector mutating its arguments cannot
+                # affect the calls running concurrently against the others.
+                results[name] = await connector.call_tool(tool_name, dict(kwargs))
+                logger.debug("Broadcast '%s' to '%s' succeeded", tool_name, name)
+            except Exception as e:
+                results[name] = e
+                logger.debug("Broadcast '%s' to '%s' failed: %s", tool_name, name, e)
+        await asyncio.gather(*(call_one(n) for n in targets), return_exceptions=True)
+        return results
+    async def call_tools(self, calls: Any) -> list[Any]:
+        """Execute several tool calls and return their results in matching formats.
+        ``None`` yields an empty list and a non-list value is treated as a
+        single call. For a list, entries whose ``type`` is set to something
+        other than ``"tool_use"`` or ``"function"`` (text blocks, etc.) are
+        skipped; the remaining calls are awaited together.
+        """
+        if calls is None:
+            return []
+        if not isinstance(calls, list):
+            single_result = await self.call_tool(calls)
+            return [single_result]
+        def entry_type(entry: Any) -> Any:
+            # Dict-style blocks carry the type as a key, objects as an attribute
+            if isinstance(entry, dict):
+                return entry.get("type")
+            return getattr(entry, "type", None)
+        runnable = []
+        for entry in calls:
+            kind = entry_type(entry)
+            if kind is None or kind in ("tool_use", "function"):
+                runnable.append(entry)
+        return await asyncio.gather(*(self.call_tool(entry) for entry in runnable))
+    # =========================================================================
+    # Lifecycle Configuration
+    # =========================================================================
+    def setup_tool(self, call: Any, /, **kwargs: Any) -> Environment:
+        """Queue a tool call to run after connections are established.
+        A string name given together with keyword arguments is queued directly;
+        anything else is passed through ``parse_tool_call`` to obtain the name
+        and arguments. Returns ``self`` so calls can be chained.
+        """
+        from hud.environment.utils import parse_tool_call
+        if kwargs and isinstance(call, str):
+            queued = (call, kwargs)
+        else:
+            parsed_call, _fmt = parse_tool_call(call)
+            queued = (parsed_call.name, parsed_call.arguments or {})
+        self._setup_calls.append(queued)
+        return self
+    def evaluate_tool(self, call: Any, /, **kwargs: Any) -> Environment:
+        """Queue a tool call to run before disconnecting.
+        A string name given together with keyword arguments is queued directly;
+        anything else is passed through ``parse_tool_call`` to obtain the name
+        and arguments. Returns ``self`` so calls can be chained.
+        """
+        from hud.environment.utils import parse_tool_call
+        if kwargs and isinstance(call, str):
+            queued = (call, kwargs)
+        else:
+            parsed_call, _fmt = parse_tool_call(call)
+            queued = (parsed_call.name, parsed_call.arguments or {})
+        self._evaluate_calls.append(queued)
+        return self
+    # =========================================================================
+    # Context Manager
+    # =========================================================================
+    async def __aenter__(self) -> Self:
+        """Connect all connectors, build routing, run setup tools.
+        Connections are opened concurrently (bounded by
+        ``MAX_CONCURRENT_CONNECTIONS``), routing is built, and the queued setup
+        tool calls are executed in order; their results are kept in
+        ``_setup_results``.
+        If anything fails, every connection that is still open is disconnected
+        and ``_in_context`` is reset before the error propagates, because
+        ``__aexit__`` is not invoked when ``__aenter__`` raises.
+        Raises:
+            ConnectionError: If a connector fails to connect or to list its
+                tools/prompts/resources.
+            RuntimeError: If a setup tool returns an error result.
+        """
+        self._in_context = True
+        try:
+            # Connect to all servers and fetch tools/prompts/resources in parallel
+            sem = asyncio.Semaphore(self.MAX_CONCURRENT_CONNECTIONS)
+            errors: list[tuple[str, Exception]] = []
+            async def connect_one(name: str, conn: Connector) -> None:
+                async with sem:
+                    try:
+                        await conn.connect()
+                        # Batch fetch all MCP primitives in parallel for performance
+                        await asyncio.gather(
+                            conn.list_tools(),
+                            conn.list_prompts(),
+                            conn.list_resources(),
+                        )
+                    except Exception as e:
+                        errors.append((name, e))
+            if self._connections:
+                await asyncio.gather(*[connect_one(n, c) for n, c in self._connections.items()])
+                if errors:
+                    name, err = errors[0]
+                    # Strip the prefix FastMCP adds to connection failures
+                    str_err = str(err).replace("Client failed to connect: ", "")
+                    raise ConnectionError(f"Failed to connect to {name}: {str_err}") from err
+            await self._build_routing()
+            # Setup tool calls (after connections) - abort if any setup tool fails
+            # Store results for append_setup_output feature
+            self._setup_results = []
+            for name, args in self._setup_calls:
+                result = await self._execute_tool(name, args)
+                self._setup_results.append(result)
+                if result.isError:
+                    # Use the first text block of the result as the error message
+                    error_msg = next(
+                        (
+                            block.text
+                            for block in result.content or []
+                            if isinstance(block, mcp_types.TextContent)
+                        ),
+                        "Setup tool failed",
+                    )
+                    raise RuntimeError(f"Setup tool '{name}' failed: {error_msg}")
+        except BaseException:
+            # Single cleanup path for every failure (connect, routing, setup,
+            # cancellation). Disconnect errors are collected rather than raised
+            # so they cannot mask the original exception or skip a connection.
+            self._in_context = False
+            still_open = [c for c in self._connections.values() if c.is_connected]
+            if still_open:
+                await asyncio.gather(
+                    *[c.disconnect() for c in still_open], return_exceptions=True
+                )
+            raise
+        return self
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: types.TracebackType | None,
+    ) -> None:
+        """Run evaluate tools, then disconnect and reset routing state.
+        Each queued evaluate call is executed and its reward extracted with
+        ``find_reward``; a call that raises is logged and counted as 0.0. The
+        average is stored in ``_evaluate_reward`` (``None`` when there were no
+        evaluate calls).
+        Disconnecting and resetting the router, the routing flags and the
+        active scenario session happen in ``finally`` blocks, so the
+        environment is left in a clean state even if the evaluate phase is
+        interrupted or a disconnect raises. Such an exception still propagates.
+        """
+        from hud.agents.base import find_reward
+        try:
+            # Evaluate tool calls and collect rewards
+            rewards: list[float] = []
+            for name, args in self._evaluate_calls:
+                try:
+                    result = await self._execute_tool(name, args)
+                    rewards.append(find_reward(result))
+                except Exception as e:
+                    logger.warning("Evaluate tool %s failed: %s", name, e)
+                    # Record 0.0 for failed evaluate tools so they affect the average
+                    rewards.append(0.0)
+            # Store average reward from evaluate tools
+            self._evaluate_reward: float | None = None
+            if rewards:
+                self._evaluate_reward = sum(rewards) / len(rewards)
+        finally:
+            self._in_context = False
+            try:
+                if self._connections:
+                    await asyncio.gather(*[c.disconnect() for c in self._connections.values()])
+            finally:
+                # Always drop routing state, even when a disconnect failed
+                self._router.clear()
+                self._tool_routing_built = False
+                self._prompt_routing_built = False
+                self._resource_routing_built = False
+                self._active_session = None  # Clear stale scenario state
+    async def run_async(
+        self,
+        transport: Literal["stdio", "http", "sse"] | None = None,
+        show_banner: bool = True,
+        **transport_kwargs: Any,
+    ) -> None:
+        """Serve over MCP with every connector connected for the server's lifetime.
+        The environment is entered as an async context manager first, so tools
+        from external MCP servers (via connect_mcp_config) are already
+        discovered when the parent ``run_async`` starts serving.
+        """
+        # Entering ``self`` runs __aenter__, which connects all connectors
+        async with self:
+            await super().run_async(
+                transport=transport,
+                show_banner=show_banner,
+                **transport_kwargs,
+            )
+    async def _build_routing(self) -> None:
+        """Rebuild whichever of tool/prompt/resource routing is marked stale.
+        The needed builders are awaited together; routing that is already built
+        is left untouched.
+        """
+        candidates = (
+            (self._tool_routing_built, self._build_tool_routing),
+            (self._prompt_routing_built, self._build_prompt_routing),
+            (self._resource_routing_built, self._build_resource_routing),
+        )
+        pending = [builder() for already_built, builder in candidates if not already_built]
+        if not pending:
+            return
+        await asyncio.gather(*pending)
+    async def _build_tool_routing(self) -> None:
+        """Rebuild the router's tool table from local tools and the connections.
+        Afterwards the mock schemas are refreshed and tool routing is marked
+        as built.
+        """
+        registered = await self._tool_manager.get_tools()
+        local_mcp_tools = [tool.to_mcp_tool() for tool in registered.values()]
+        self._router.build(
+            local_tools=local_mcp_tools,
+            connections=self._connections,
+            connection_order=list(self._connections),
+        )
+        # Mock schemas are used for auto-generated mock values
+        self._populate_mock_schemas()
+        self._tool_routing_built = True
+    async def _build_prompt_routing(self) -> None:
+        """Rebuild the router's prompt table from local prompts and the connections."""
+        registered = await self._prompt_manager.get_prompts()
+        local_mcp_prompts = []
+        for prompt in registered.values():
+            local_mcp_prompts.append(prompt.to_mcp_prompt())
+        self._router.build_prompts(local_mcp_prompts, self._connections)
+        self._prompt_routing_built = True
+    async def _build_resource_routing(self) -> None:
+        """Rebuild the router's resource table from local resources and the connections."""
+        registered = await self._resource_manager.get_resources()
+        local_mcp_resources = []
+        for resource in registered.values():
+            local_mcp_resources.append(resource.to_mcp_resource())
+        self._router.build_resources(local_mcp_resources, self._connections)
+        self._resource_routing_built = True
+    # =========================================================================
+    # MCP Protocol Overrides - Include connector tools in MCP responses
+    # =========================================================================
+    def _setup_handlers(self) -> None:
+        """Install the environment's own list-tools and call-tool handlers.
+        The parent sets up all standard handlers first; the two tool handlers
+        are then registered again so that they go through this class.
+        """
+        super()._setup_handlers()
+        # Registering again overwrites the parent's registrations
+        register_list_tools = self._mcp_server.list_tools()
+        register_list_tools(self._env_list_tools)
+        register_call_tool = self._mcp_server.call_tool()
+        register_call_tool(self._env_call_tool)
+    async def _env_list_tools(self) -> list[mcp_types.Tool]:
+        """Return every routed tool, building tool routing first if it is stale."""
+        routing_is_stale = not self._tool_routing_built
+        if routing_is_stale:
+            await self._build_tool_routing()
+        return self._router.tools
+    async def _env_call_tool(self, name: str, arguments: dict[str, Any] | None = None) -> list[Any]:
+        """MCP handler: run a tool through the router and return its content.
+        Args:
+            name: Tool name (local or provided by a connection).
+            arguments: Tool arguments; None is treated as an empty dict.
+        Returns:
+            The content list of the tool result, or an empty list when the
+            result has no content.
+        """
+        # NOTE(review): only ``content`` is forwarded here; the result's
+        # ``isError`` and ``structuredContent`` fields are not - confirm that
+        # this is intended for connector tools that report errors.
+        outcome = await self._execute_tool(name, arguments or {})
+        content = outcome.content
+        return content if content else []
+    # =========================================================================
+    # Tool Operations
+    # =========================================================================
+    async def list_tools(self) -> list[mcp_types.Tool]:
+        """Refresh tools from every connection, rebuild routing, return all tools.
+        The per-connection refreshes are awaited concurrently. Routing is rebuilt
+        even when there are no connections, so local tools are always included.
+        """
+        connectors = self._connections
+        if connectors:
+            refreshes = [connector.list_tools() for connector in connectors.values()]
+            await asyncio.gather(*refreshes)
+        await self._build_tool_routing()
+        return self._router.tools
+    async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
+        """Execute a tool by name, dispatching to the local or remote handler.
+        If mock mode is enabled, returns a mock result instead of executing.
+        Args:
+            name: Tool name to execute.
+            arguments: Arguments passed to the tool.
+        Returns:
+            The tool result wrapped as an MCPToolResult.
+        Raises:
+            ValueError: If the router knows the tool neither as a local tool nor
+                as belonging to a connection.
+        """
+        # Mock mode short-circuits everything, including routing
+        if self._mock_mode:
+            logger.debug("Mock mode: returning mock result for tool %s", name)
+            return self._get_mock_result(name, arguments)
+        # Routing may have been invalidated (e.g., after add_tool)
+        if not self._tool_routing_built:
+            await self._build_tool_routing()
+        if self._router.is_local(name):
+            # Go through the tool manager directly to avoid FastMCP context requirement
+            local_result = await self._tool_manager.call_tool(name, arguments)
+            return MCPToolResult(
+                content=local_result.content,
+                structuredContent=local_result.structured_content,
+            )
+        connection_name = self._router.get_connection(name)
+        if not connection_name:
+            raise ValueError(f"Tool not found: {name}")
+        connector = self._connections[connection_name]
+        remote_result = await connector.call_tool(name, arguments)
+        return MCPToolResult(
+            content=remote_result.content,
+            isError=remote_result.isError,
+            structuredContent=remote_result.structuredContent,
+        )
+    # =========================================================================
+    # Resource Operations
+    # =========================================================================
+    async def list_resources(self) -> list[mcp_types.Resource]:
+        """Refresh resources from every connection, rebuild routing, return all.
+        The per-connection refreshes are awaited concurrently. Routing is rebuilt
+        even when there are no connections, so local resources are included.
+        """
+        connectors = self._connections
+        if connectors:
+            refreshes = [connector.list_resources() for connector in connectors.values()]
+            await asyncio.gather(*refreshes)
+        await self._build_resource_routing()
+        return self._router.resources
+    async def read_resource(
+        self, uri: str
+    ) -> list[mcp_types.TextResourceContents | mcp_types.BlobResourceContents]:
+        """Read a resource by URI, locally or through the owning connection.
+        Resource routing is built on first use. When the router names a
+        connection for the URI, the read is forwarded to it; otherwise the local
+        resource manager is used and the result is wrapped as text contents
+        (for ``str``) or base64-encoded blob contents (for anything else).
+        Raises:
+            ValueError: If the local read fails for any reason, or if the router
+                names a connection that is not registered.
+        """
+        from pydantic import AnyUrl
+        # Build routing lazily
+        if not self._resource_routing_built:
+            await self._build_resource_routing()
+        owner = self._router.get_resource_connection(uri)
+        if owner is not None:
+            # Remote resource: forward to the connection that advertises it
+            connector = self._connections.get(owner)
+            if connector is None:
+                raise ValueError(f"Connection '{owner}' not found for resource '{uri}'")
+            return await connector.read_resource(uri)
+        # Local resource
+        try:
+            payload = await self._resource_manager.read_resource(uri)
+            parsed_uri = AnyUrl(uri)
+            if isinstance(payload, str):
+                return [mcp_types.TextResourceContents(uri=parsed_uri, text=payload)]
+            import base64
+            encoded = base64.b64encode(payload).decode()
+            return [mcp_types.BlobResourceContents(uri=parsed_uri, blob=encoded)]
+        except Exception as exc:
+            logger.debug("Local resource read failed for %s: %s", uri, exc)
+            raise ValueError(f"Resource not found: {uri}") from exc
+    # =========================================================================
+    # Prompt Operations
+    # =========================================================================
+    async def list_prompts(self) -> list[mcp_types.Prompt]:
+        """Refresh prompts from every connection, rebuild routing, return all.
+        The per-connection refreshes are awaited concurrently. Routing is rebuilt
+        even when there are no connections, so local prompts are included.
+        """
+        connectors = self._connections
+        if connectors:
+            refreshes = [connector.list_prompts() for connector in connectors.values()]
+            await asyncio.gather(*refreshes)
+        await self._build_prompt_routing()
+        return self._router.prompts
+    async def get_prompt(
+        self, name: str, arguments: dict[str, Any] | None = None
+    ) -> mcp_types.GetPromptResult:
+        """Get a prompt by name, rendering it locally or via its connection.
+        Prompt routing is built on first use. When the router reports no owning
+        connection, the prompt is rendered by the local prompt manager;
+        otherwise the request is forwarded to that connection.
+        Args:
+            name: Prompt name to look up.
+            arguments: Optional prompt arguments. The local path substitutes an
+                empty dict for None; the remote path forwards the value as given.
+        Returns:
+            The prompt result from the local manager or the connection.
+        Raises:
+            ValueError: If local rendering fails for any reason, or if the
+                router names a connection that is not registered.
+        """
+        # Ensure prompt routing is built
+        if not self._prompt_routing_built:
+            await self._build_prompt_routing()
+        # Use router to find which connection has this prompt
+        conn_name = self._router.get_prompt_connection(name)
+        if conn_name is None:
+            # Local prompt
+            try:
+                return await self._prompt_manager.render_prompt(name, arguments or {})
+            except Exception as e:
+                # Any failure is reported as "not found"; log the real cause at
+                # debug level (as read_resource does) so rendering errors such
+                # as bad arguments remain diagnosable.
+                logger.debug("Local prompt render failed for %s: %s", name, e)
+                raise ValueError(f"Prompt not found: {name}") from e
+        else:
+            # Remote prompt
+            conn = self._connections.get(conn_name)
+            if conn is None:
+                raise ValueError(f"Connection '{conn_name}' not found for prompt '{name}'")
+            return await conn.get_prompt(name, arguments)
+    # =========================================================================
+    # Server Methods
+    # =========================================================================
+    def serve(
+        self,
+        transport: Literal["stdio", "sse", "streamable-http"] = "streamable-http",
+        host: str = "0.0.0.0",  # noqa: S104
+        port: int = 8000,
+        **kwargs: Any,
+    ) -> None:
+        """Start serving as an MCP server.
+        Args:
+            transport: Transport to serve over.
+            host: Interface to bind for the network transports. Ignored for
+                "stdio".
+            port: Port to bind for the network transports. Ignored for "stdio".
+            **kwargs: Extra options forwarded unchanged to ``run``.
+        """
+        if transport == "stdio":
+            # stdio has no network endpoint. Forwarding host/port would hand the
+            # stdio runner keyword arguments it does not accept, so they are
+            # only passed for the network transports.
+            self.run(transport=transport, **kwargs)
+        else:
+            self.run(transport=transport, host=host, port=port, **kwargs)
+    # =========================================================================
+    # Properties
+    # =========================================================================
+    @property
+    def connections(self) -> dict[str, Connector]:
+        """Connectors keyed by connection name.
+        This is the internal mapping itself, not a copy.
+        """
+        return self._connections
+    @property
+    def is_connected(self) -> bool:
+        """Value of the ``_in_context`` flag.
+        NOTE(review): presumably set while the environment's context manager is
+        active - confirm where ``_in_context`` is toggled.
+        """
+        return self._in_context
+    @property
+    def is_parallelizable(self) -> bool:
+        """True if every connection is remote (can spawn multiple instances).
+        An environment without connections (local tools only) also counts as
+        parallelizable.
+        """
+        for connector in self._connections.values():
+            if not connector.is_remote:
+                return False
+        return True
+    @property
+    def local_connections(self) -> list[str]:
+        """Names of the connections flagged as local (non-parallelizable)."""
+        local_names = []
+        for conn_name, connector in self._connections.items():
+            if connector.is_local:
+                local_names.append(conn_name)
+        return local_names
+    # =========================================================================
+    # Serialization
+    # =========================================================================
+    @property
+    def is_serializable(self) -> bool:
+        """True if environment can be serialized (no local tools/scenarios).
+        For v5 format: requires hub config from connect_hub()
+        For v4 format: requires mcp_config, prompt, AND evaluate_tool
+        """
+        # Check for local tools (registered via @env.tool)
+        if self._router._local_tool_names:
+            return False
+        # Check for local scenarios (registered via @env.scenario)
+        if getattr(self, "_scenarios", {}):
+            return False
+        # v5 hub format
+        if self._hub_config is not None:
+            return True
+        # v4 format requires mcp_config + prompt + evaluate_tool
+        if self._mcp_config is not None:
+            return bool(self.prompt and self._evaluate_calls)
+        return False
+    def to_config(self) -> dict[str, Any]:
+        """Serialize environment config for remote submission.
+        Returns the config in either v5 format (hub-based) or v4 format (legacy).
+        For v4 format, automatically includes prompt, setup_tool, and evaluate_tool
+        from the Environment's state. The returned dict is a deep copy, so the
+        caller may modify it without altering the Environment.
+        Returns:
+            dict: Serializable config
+        Raises:
+            ValueError: If the environment has local tools/scenarios that can't
+                be serialized, if a v4 config lacks a prompt or evaluate_tool,
+                or if neither a hub config nor an mcp_config is set.
+        Example:
+            ```python
+            # v5 hub-based
+            env = Environment("my").connect_hub("browser", include=["navigate"])
+            env.to_config()  # {"name": "browser", "include": ["navigate"]}
+            # v4 legacy (from Task.from_v4())
+            task = Task.from_v4(legacy_task)
+            task.env.to_config()  # {"prompt": "...", "mcp_config": {...}, ...}
+            ```
+        """
+        from copy import deepcopy
+        if self._router._local_tool_names:
+            raise ValueError(
+                f"Cannot serialize Environment with local tools: "
+                f"{list(self._router._local_tool_names)}. "
+                "Local tools require local execution. For remote submission, "
+                "use dict config or connect to a remote hub."
+            )
+        if getattr(self, "_scenarios", {}):
+            raise ValueError(
+                f"Cannot serialize Environment with local scenarios: "
+                f"{list(self._scenarios.keys())}. "
+                "Local scenarios require local execution. For remote submission, "
+                "define scenarios on the remote environment."
+            )
+        # v5 hub-based format
+        if self._hub_config is not None:
+            # Deep copy: a shallow copy would still share nested values
+            # (e.g. an "include" list) with the Environment.
+            return deepcopy(self._hub_config)
+        # v4 legacy format - requires mcp_config, prompt, AND evaluate_tool
+        if self._mcp_config is not None:
+            # Validate required fields for v4 format
+            if not self.prompt:
+                raise ValueError(
+                    "Cannot serialize v4 Environment without prompt. "
+                    "Set env.prompt before serializing."
+                )
+            if not self._evaluate_calls:
+                raise ValueError(
+                    "Cannot serialize v4 Environment without evaluate_tool. "
+                    "Use env.evaluate_tool() to define evaluation criteria."
+                )
+            # Copy mcp_config and call arguments so the result does not alias
+            # the Environment's own state.
+            config: dict[str, Any] = {
+                "prompt": self.prompt,
+                "mcp_config": deepcopy(self._mcp_config),
+                "evaluate_tool": [
+                    {"name": name, "arguments": deepcopy(args)}
+                    for name, args in self._evaluate_calls
+                ],
+            }
+            if self._setup_calls:
+                config["setup_tool"] = [
+                    {"name": name, "arguments": deepcopy(args)}
+                    for name, args in self._setup_calls
+                ]
+            return config
+        raise ValueError(
+            "Cannot serialize Environment without config. "
+            "Use connect_hub() for v5 tasks or connect_mcp_config() for legacy tasks."
+        )
+    def __repr__(self) -> str:
+        """Show the environment name and the names of its connections."""
+        connection_names = list(self._connections)
+        return f"Environment({self.name!r}, connections={connection_names})"
+    # =========================================================================
+    # Task Creation
+    # =========================================================================
+    def __call__(
+        self,
+        scenario: str | None = None,
+        **args: Any,
+    ) -> Task:
+        """Build a Task bound to this environment.
+        The resulting Task can be handed to hud.eval() for orchestration.
+        Args:
+            scenario: Scenario name to run (from @env.scenario). Optional for v4 legacy.
+            **args: Keyword arguments stored on the Task as the scenario arguments
+        Returns:
+            Task: A runnable evaluation unit
+        Example:
+            ```python
+            env = Environment("my-env").connect_hub("browser")
+            @env.scenario()
+            async def checkout(user_id: str):
+                yield "Complete checkout"
+                yield 1.0
+            # Single task via hud.eval
+            async with hud.eval(env("checkout", user_id="alice")) as ctx:
+                await agent.run(ctx.prompt)
+            # Multiple tasks with variants
+            tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
+            async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
+                ...
+            ```
+        """
+        # Imported at call time rather than at module level
+        from hud.eval.task import Task
+        task_fields = {"env": self, "scenario": scenario, "args": args}
+        return Task(**task_fields)

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

hud-python 0.4.45py3-none-any.whl → 0.5.13py3-none-any.whl