hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/environment/environment.py (new file)

@@ -0,0 +1,694 @@
+"""Environment class - unified MCP server and client."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from collections.abc import Awaitable, Callable
+from typing import TYPE_CHECKING, Any, Literal, Self
+
+import mcp.types as mcp_types
+
+from hud.environment.connectors import ConnectorsMixin
+from hud.environment.integrations import IntegrationsMixin
+from hud.environment.mock import MockMixin
+from hud.environment.router import ConflictResolution, ToolRouter
+from hud.environment.scenarios import ScenarioMixin
+from hud.server.server import MCPServer
+from hud.types import MCPToolResult
+
+if TYPE_CHECKING:
+    import types
+
+    from hud.environment.connection import Connector
+    from hud.eval.task import Task
+
+__all__ = ["Environment"]
+
+logger = logging.getLogger(__name__)
+
+# Suppress verbose fastmcp logging
+logging.getLogger("fastmcp.server.server").setLevel(logging.WARNING)
+logging.getLogger("fastmcp.server.openapi").setLevel(logging.WARNING)
+
+# Type alias for async callables (no-arg functions that return awaitable)
+AsyncCallable = Callable[[], Awaitable[Any]]
+
+
+class Environment(
+    ConnectorsMixin,
+    IntegrationsMixin,
+    MockMixin,
+    ScenarioMixin,
+    MCPServer,
+):
+    """Unified MCP environment that acts as both server and client.
+
+    Features:
+    - Define local tools with @env.tool decorator
+    - Connect to HUD Hub, URLs, or mcp_config dicts
+    - Automatic tool routing (local vs remote)
+    - Format tools for any LLM provider
+    - Integrate with popular agent frameworks
+    - Mock mode for testing without real connections
+
+    Connector methods (connect to sources):
+        connect_hub(name) - HUD Hub environment
+        connect_url(url) - MCP server via URL
+        connect_mcp(config) - Single mcp_config server
+        connect_mcp_config(mcp_config) - Multiple mcp_config servers
+        connect_image(image) - Docker image via stdio
+        connect_fastapi(app) - Mount FastAPI app as MCP server
+        connect_openapi(spec) - Mount OpenAPI spec as MCP server
+        connect_server(server) - Mount MCPServer/FastMCP directly
+
+    Mock methods (for testing):
+        mock() - Enable mock mode, all tools return mock values
+        unmock() - Disable mock mode
+        mock_tool(name, output) - Set specific mock output for a tool
+        is_mock - Check if mock mode is enabled
+
+    OpenAI integrations:
+        as_openai_chat_tools() - Chat Completions format
+        as_openai_responses_tools() - Responses API format
+        as_openai_agent_tools() - Agents SDK (requires openai-agents)
+
+    Anthropic/Claude integrations:
+        as_claude_tools() - Claude API format
+        as_claude_programmatic_tools() - Programmatic tool use
+        as_anthropic_runner() - Tool runner (requires anthropic)
+
+    Google/Gemini integrations:
+        as_gemini_tools() - Gemini format
+        as_gemini_tool_config() - Tool execution config
+
+    LangChain integrations:
+        as_langchain_tools() - StructuredTools (requires langchain-core)
+
+    Example:
+        ```python
+        env = Environment("my-env")
+
+
+        @env.tool
+        def greet(name: str) -> str:
+            return f"Hello, {name}!"
+
+
+        env.connect_hub("browser", prefix="browser")
+
+        async with env:
+            # Get tools in any format
+            openai_tools = env.as_openai_chat_tools()
+            claude_tools = env.as_claude_tools()
+
+            # Call tools - automatically routed
+            result = await env.call_tool("greet", name="World")
+
+            # Or pass provider-specific format - auto-detected
+            result = await env.call_tool(response.choices[0].message.tool_calls[0])
+
+        # Mock mode for testing
+        env.mock()
+        env.mock_tool("browser_navigate", "Navigation successful")
+        async with env:
+            result = await env.call_tool("browser_navigate", url="https://example.com")
+            # Returns mock value instead of actually navigating
+        ```
+    """
+
+    MAX_CONCURRENT_CONNECTIONS = 10
+
+    def __init__(
+        self,
+        name: str = "environment",
+        instructions: str | None = None,
+        conflict_resolution: ConflictResolution = ConflictResolution.PREFIX,
+        **fastmcp_kwargs: Any,
+    ) -> None:
+        super().__init__(name=name, instructions=instructions, **fastmcp_kwargs)
+        self._connections: dict[str, Connector] = {}
+        self._router = ToolRouter(conflict_resolution=conflict_resolution)
+        self._in_context = False
+
+        # Tool call queues - run after connections established
+        self._setup_calls: list[tuple[str, dict[str, Any]]] = []
+        self._evaluate_calls: list[tuple[str, dict[str, Any]]] = []
+
+        # Default prompt (EvalContext has per-run prompt)
+        self.prompt: str | None = None
+
+        # Serialization support
+        # _hub_config: set by connect_hub() for v5 format {"name": "hub", "include": [...]}
+        # _mcp_config: set by connect_mcp_config() for v4 format {"server_name": {...}}
+        self._hub_config: dict[str, Any] | None = None
+        self._mcp_config: dict[str, dict[str, Any]] | None = None
+
+        # Agent-level tool filtering (applied in as_tools(), not at connection level)
+        # This allows Environment to call all tools while limiting agent visibility
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # Initialize mock state
+        self._init_mock()
+
+        # Initialize scenario state
+        self._init_scenarios()
+
+    # =========================================================================
+    # Core Methods
+    # =========================================================================
+
+    def as_tools(self) -> list[mcp_types.Tool]:
+        """Return tools in MCP format (base format).
+
+        Applies agent-level include/exclude filtering if set.
+        """
+        tools = self._router.tools
+
+        # Apply agent-level filtering (from v4 allowed_tools/disallowed_tools)
+        if self._agent_include is not None or self._agent_exclude is not None:
+            filtered = []
+            for tool in tools:
+                # Include filter: None means include all
+                if self._agent_include is not None and tool.name not in self._agent_include:
+                    continue
+                # Exclude filter
+                if self._agent_exclude is not None and tool.name in self._agent_exclude:
+                    continue
+                filtered.append(tool)
+            return filtered
+
+        return tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> Any:
+        """Call a tool, auto-detecting format and returning matching result format.
+
+        Accepts any format:
+        - String with kwargs: call_tool("navigate", url="...")
+        - Tuple: call_tool(("navigate", {"url": "..."}))
+        - MCPToolCall: call_tool(MCPToolCall(name="navigate", ...))
+        - OpenAI: call_tool(response.choices[0].message.tool_calls[0])
+        - Claude: call_tool(response.content[0])  # tool_use block
+        - Gemini: call_tool(response.candidates[0].content.parts[0])
+
+        Returns:
+            Result formatted to match input format (OpenAI -> OpenAI tool message, etc.)
+        """
+        from hud.environment.utils import format_result, parse_tool_call
+
+        # Parse the tool call (kwargs merged when call is string)
+        parsed, fmt = parse_tool_call(call, **kwargs)
+        result = await self._execute_tool(parsed.name, parsed.arguments or {})
+        return format_result(result, parsed, fmt)
+
+    def _connections_with_tool(self, tool_name: str) -> set[str]:
+        """Get connection names that have a specific tool.
+
+        Uses cached_tools from each Connector to check availability.
+        """
+        result = set()
+        for name, connector in self._connections.items():
+            tool_names = {t.name for t in connector.cached_tools}
+            if tool_name in tool_names:
+                result.add(name)
+        return result
+
+    async def _broadcast_tool(
+        self,
+        tool_name: str,
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """Broadcast a tool call to all connections that have the tool.
+
+        Automatically filters to only connections where the tool exists
+        (based on cached_tools from initial discovery).
+
+        Args:
+            tool_name: Name of the tool to call
+            **kwargs: Arguments to pass to the tool
+
+        Returns:
+            Dict mapping connection name to result (or exception)
+        """
+        import asyncio
+
+        # Only call connections that have this tool
+        targets = self._connections_with_tool(tool_name)
+        if not targets:
+            return {}
+
+        results: dict[str, Any] = {}
+
+        async def call_one(name: str) -> None:
+            connector = self._connections.get(name)
+            if not connector or not connector.client:
+                return
+            try:
+                results[name] = await connector.client.call_tool(tool_name, **kwargs)
+                logger.debug("Broadcast '%s' to '%s' succeeded", tool_name, name)
+            except Exception as e:
+                results[name] = e
+                logger.debug("Broadcast '%s' to '%s' failed: %s", tool_name, name, e)
+
+        await asyncio.gather(*[call_one(n) for n in targets], return_exceptions=True)
+        return results
+
+    async def call_tools(self, calls: Any) -> list[Any]:
+        """Call multiple tools, returning results in matching formats."""
+        if calls is None:
+            return []
+        if not isinstance(calls, list):
+            return [await self.call_tool(calls)]
+
+        # Filter to tool calls only (skip text blocks, etc.)
+        tool_calls = []
+        for call in calls:
+            t = call.get("type") if isinstance(call, dict) else getattr(call, "type", None)
+            if t is None or t in ("tool_use", "function"):
+                tool_calls.append(call)
+
+        return await asyncio.gather(*[self.call_tool(c) for c in tool_calls])
+
+    # =========================================================================
+    # Lifecycle Configuration
+    # =========================================================================
+
+    def setup_tool(self, call: Any, /, **kwargs: Any) -> Environment:
+        """Add a tool call to execute after connections are established."""
+        from hud.environment.utils import parse_tool_call
+
+        if isinstance(call, str) and kwargs:
+            self._setup_calls.append((call, kwargs))
+        else:
+            parsed, _ = parse_tool_call(call)
+            self._setup_calls.append((parsed.name, parsed.arguments or {}))
+        return self
+
+    def evaluate_tool(self, call: Any, /, **kwargs: Any) -> Environment:
+        """Add a tool call to execute before disconnecting."""
+        from hud.environment.utils import parse_tool_call
+
+        if isinstance(call, str) and kwargs:
+            self._evaluate_calls.append((call, kwargs))
+        else:
+            parsed, _ = parse_tool_call(call)
+            self._evaluate_calls.append((parsed.name, parsed.arguments or {}))
+        return self
+
+    # =========================================================================
+    # Context Manager
+    # =========================================================================
+
+    async def __aenter__(self) -> Self:
+        """Connect all connectors, build routing, run setup tools."""
+        self._in_context = True
+
+        # Connect to all servers (on_connect callbacks run first within connect())
+        sem = asyncio.Semaphore(self.MAX_CONCURRENT_CONNECTIONS)
+        errors: list[tuple[str, Exception]] = []
+
+        async def connect_one(name: str, conn: Connector) -> None:
+            async with sem:
+                try:
+                    await conn.connect()
+                    await conn.list_tools()
+                except Exception as e:
+                    errors.append((name, e))
+
+        if self._connections:
+            await asyncio.gather(*[connect_one(n, c) for n, c in self._connections.items()])
+            if errors:
+                for conn in self._connections.values():
+                    if conn.is_connected:
+                        await conn.disconnect()
+                name, err = errors[0]
+                str_err = str(err).replace("Client failed to connect: ", "")  # Strip from FastMCP
+                raise ConnectionError(f"Failed to connect to {name}: {str_err}") from err
+
+        await self._build_routing()
+
+        # Setup tool calls (after connections)
+        for name, args in self._setup_calls:
+            await self._execute_tool(name, args)
+
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: types.TracebackType | None,
+    ) -> None:
+        """Run evaluate tools, exit queue, then disconnect."""
+        from hud.agents.base import find_reward
+
+        # Evaluate tool calls and collect rewards
+        rewards: list[float] = []
+        for name, args in self._evaluate_calls:
+            try:
+                result = await self._execute_tool(name, args)
+                rewards.append(find_reward(result))
+            except Exception as e:
+                logger.warning("Evaluate tool %s failed: %s", name, e)
+
+        # Store average reward from evaluate tools
+        self._evaluate_reward: float | None = None
+        if rewards:
+            self._evaluate_reward = sum(rewards) / len(rewards)
+
+        self._in_context = False
+        if self._connections:
+            await asyncio.gather(*[c.disconnect() for c in self._connections.values()])
+        self._router.clear()
+
+    async def _build_routing(self) -> None:
+        """Build tool routing from local tools and connection caches."""
+        # Use get_tools() not list_tools() - it includes mounted servers without
+        # requiring MCP server communication (via_server=False)
+        local_tools_dict = await self._tool_manager.get_tools()
+        local_tools = list(local_tools_dict.values())
+        self._router.build(
+            local_tools=[t.to_mcp_tool() for t in local_tools],
+            connections=self._connections,
+            connection_order=list(self._connections.keys()),
+        )
+        # Populate mock schemas for auto-generated mock values
+        self._populate_mock_schemas()
+
+    # =========================================================================
+    # Tool Operations
+    # =========================================================================
+
+    async def list_tools(self) -> list[mcp_types.Tool]:
+        """Refresh tools from all connections and rebuild routing."""
+        if self._connections:
+            await asyncio.gather(*[c.list_tools() for c in self._connections.values()])
+        await self._build_routing()
+        return self._router.tools
+
+    async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
+        """Execute a tool by name. Routes to local or remote handler.
+
+        If mock mode is enabled, returns a mock result instead of executing.
+        """
+        # Check mock mode first
+        if self._mock_mode:
+            logger.debug("Mock mode: returning mock result for tool %s", name)
+            return self._get_mock_result(name, arguments)
+
+        if self._router.is_local(name):
+            # Call tool manager directly to avoid FastMCP context requirement
+            result = await self._tool_manager.call_tool(name, arguments)
+            return MCPToolResult(
+                content=result.content,
+                structuredContent=result.structured_content,
+            )
+
+        connection_name = self._router.get_connection(name)
+        if connection_name:
+            conn = self._connections[connection_name]
+            result = await conn.call_tool(name, arguments)
+            return MCPToolResult(
+                content=result.content,
+                isError=result.isError,
+                structuredContent=result.structuredContent,
+            )
+
+        raise ValueError(f"Tool not found: {name}")
+
+    # =========================================================================
+    # Resource Operations
+    # =========================================================================
+
+    async def list_resources(self) -> list[mcp_types.Resource]:
+        """List all resources (local + remote)."""
+        local = list((await self._resource_manager.get_resources()).values())
+        resources: list[mcp_types.Resource] = [r.to_mcp_resource() for r in local]
+
+        if self._connections:
+            results = await asyncio.gather(
+                *[c.list_resources() for c in self._connections.values()], return_exceptions=True
+            )
+            for r in results:
+                if isinstance(r, list):
+                    resources.extend(r)
+
+        return resources
+
+    async def read_resource(
+        self, uri: str
+    ) -> list[mcp_types.TextResourceContents | mcp_types.BlobResourceContents]:
+        """Read a resource by URI (tries local first, then remote)."""
+        from pydantic import AnyUrl
+
+        try:
+            result = await self._resource_manager.read_resource(uri)
+            resource_uri = AnyUrl(uri)
+            if isinstance(result, str):
+                return [mcp_types.TextResourceContents(uri=resource_uri, text=result)]
+            import base64
+
+            return [
+                mcp_types.BlobResourceContents(
+                    uri=resource_uri, blob=base64.b64encode(result).decode()
+                )
+            ]
+        except Exception as e:
+            logger.debug("Local resource read failed for %s: %s", uri, e)
+
+        for conn in self._connections.values():
+            try:
+                return await conn.read_resource(uri)
+            except Exception as e:
+                logger.debug("Remote resource read failed for %s: %s", uri, e)
+                continue
+
+        raise ValueError(f"Resource not found: {uri}")
+
+    # =========================================================================
+    # Prompt Operations
+    # =========================================================================
+
+    async def list_prompts(self) -> list[mcp_types.Prompt]:
+        """List all prompts (local + remote)."""
+        local = list((await self._prompt_manager.get_prompts()).values())
+        prompts: list[mcp_types.Prompt] = [p.to_mcp_prompt() for p in local]
+
+        if self._connections:
+            results = await asyncio.gather(
+                *[c.list_prompts() for c in self._connections.values()], return_exceptions=True
+            )
+            for r in results:
+                if isinstance(r, list):
+                    prompts.extend(r)
+
+        return prompts
+
+    async def get_prompt(
+        self, name: str, arguments: dict[str, Any] | None = None
+    ) -> mcp_types.GetPromptResult:
+        """Get a prompt by name (tries local first, then remote)."""
+        try:
+            return await self._prompt_manager.render_prompt(name, arguments or {})
+        except Exception as e:
+            logger.debug("Local prompt render failed for %s: %s", name, e)
+
+        for conn in self._connections.values():
+            try:
+                return await conn.get_prompt(name, arguments)
+            except Exception as e:
+                logger.debug("Remote prompt get failed for %s: %s", name, e)
+                continue
+
+        raise ValueError(f"Prompt not found: {name}")
+
+    # =========================================================================
+    # Server Methods
+    # =========================================================================
+
+    def serve(
+        self,
+        transport: Literal["stdio", "sse", "streamable-http"] = "streamable-http",
+        host: str = "0.0.0.0",  # noqa: S104
+        port: int = 8000,
+        **kwargs: Any,
+    ) -> None:
+        """Start serving as an MCP server."""
+        self.run(transport=transport, host=host, port=port, **kwargs)
+
+    # =========================================================================
+    # Properties
+    # =========================================================================
+
+    @property
+    def connections(self) -> dict[str, Connector]:
+        return self._connections
+
+    @property
+    def is_connected(self) -> bool:
+        return self._in_context
+
+    @property
+    def is_parallelizable(self) -> bool:
+        """True if all connections are remote (can spawn multiple instances)."""
+        if not self._connections:
+            return True  # No connections = can parallelize (local tools only)
+        return all(conn.is_remote for conn in self._connections.values())
+
+    @property
+    def local_connections(self) -> list[str]:
+        """Names of local (non-parallelizable) connections."""
+        return [name for name, conn in self._connections.items() if conn.is_local]
+
+    # =========================================================================
+    # Serialization
+    # =========================================================================
+
+    @property
+    def is_serializable(self) -> bool:
+        """True if environment can be serialized (no local tools/scenarios).
+
+        For v5 format: requires hub config from connect_hub()
+        For v4 format: requires mcp_config, prompt, AND evaluate_tool
+        """
+        # Check for local tools (registered via @env.tool)
+        if self._router._local_names:
+            return False
+        # Check for local scenarios (registered via @env.scenario)
+        if getattr(self, "_scenarios", {}):
+            return False
+        # v5 hub format
+        if self._hub_config is not None:
+            return True
+        # v4 format requires mcp_config + prompt + evaluate_tool
+        if self._mcp_config is not None:
+            return bool(self.prompt and self._evaluate_calls)
+        return False
+
+    def to_config(self) -> dict[str, Any]:
+        """Serialize environment config for remote submission.
+
+        Returns the config in either v5 format (hub-based) or v4 format (legacy).
+        For v4 format, automatically includes prompt, setup_tool, and evaluate_tool
+        from the Environment's state.
+
+        Returns:
+            dict: Serializable config
+
+        Raises:
+            ValueError: If environment has local tools/scenarios that can't be serialized
+
+        Example:
+            ```python
+            # v5 hub-based
+            env = Environment("my").connect_hub("browser", include=["navigate"])
+            env.to_config()  # {"name": "browser", "include": ["navigate"]}
+
+            # v4 legacy (from Task.from_v4())
+            task = Task.from_v4(legacy_task)
+            task.env.to_config()  # {"prompt": "...", "mcp_config": {...}, ...}
+            ```
+        """
+        if self._router._local_names:
+            raise ValueError(
+                f"Cannot serialize Environment with local tools: "
+                f"{list(self._router._local_names)}. "
+                "Local tools require local execution. For remote submission, "
+                "use dict config or connect to a remote hub."
+            )
+        if getattr(self, "_scenarios", {}):
+            raise ValueError(
+                f"Cannot serialize Environment with local scenarios: "
+                f"{list(self._scenarios.keys())}. "
+                "Local scenarios require local execution. For remote submission, "
+                "define scenarios on the remote environment."
+            )
+
+        # v5 hub-based format
+        if self._hub_config is not None:
+            return self._hub_config.copy()
+
+        # v4 legacy format - requires mcp_config, prompt, AND evaluate_tool
+        if self._mcp_config is not None:
+            # Validate required fields for v4 format
+            if not self.prompt:
+                raise ValueError(
+                    "Cannot serialize v4 Environment without prompt. "
+                    "Set env.prompt before serializing."
+                )
+            if not self._evaluate_calls:
+                raise ValueError(
+                    "Cannot serialize v4 Environment without evaluate_tool. "
+                    "Use env.evaluate_tool() to define evaluation criteria."
+                )
+
+            config: dict[str, Any] = {
+                "prompt": self.prompt,
+                "mcp_config": self._mcp_config,
+                "evaluate_tool": [
+                    {"name": name, "arguments": args} for name, args in self._evaluate_calls
+                ],
+            }
+            if self._setup_calls:
+                config["setup_tool"] = [
+                    {"name": name, "arguments": args} for name, args in self._setup_calls
+                ]
+            return config
+
+        raise ValueError(
+            "Cannot serialize Environment without config. "
+            "Use connect_hub() for v5 tasks or connect_mcp_config() for legacy tasks."
+        )
+
+    def __repr__(self) -> str:
+        return f"Environment({self.name!r}, connections={list(self._connections.keys())})"
+
+    # =========================================================================
+    # Task Creation
+    # =========================================================================
+
+    def __call__(
+        self,
+        scenario: str | None = None,
+        **args: Any,
+    ) -> Task:
+        """Create a Task from this environment.
+
+        Returns a Task that can be passed to hud.eval() for orchestration.
+
+        Args:
+            scenario: Scenario name to run (from @env.scenario). Optional for v4 legacy.
+            **args: Arguments for the scenario
+
+        Returns:
+            Task: A runnable evaluation unit
+
+        Example:
+            ```python
+            env = Environment("my-env").connect_hub("browser")
+
+
+            @env.scenario()
+            async def checkout(user_id: str):
+                yield "Complete checkout"
+                yield 1.0
+
+
+            # Single task via hud.eval
+            async with hud.eval(env("checkout", user_id="alice")) as ctx:
+                await agent.run(ctx.prompt)
+
+            # Multiple tasks with variants
+            tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
+            async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
+                ...
+            ```
+        """
+        from hud.eval.task import Task
+
+        return Task(
+            env=self,
+            scenario=scenario,
+            args=args,
+        )
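For orientation, here is a minimal usage sketch of the new Environment class, assembled from the docstring examples in the file above. It imports from hud.environment.environment (the module added in this diff); the "browser" hub name and the browser_navigate tool are assumptions carried over from those docstrings, and mock mode is used following the same examples so tool calls return canned values.

```python
import asyncio

from hud.environment.environment import Environment

env = Environment("my-env")


@env.tool
def greet(name: str) -> str:
    """A local tool registered directly on the environment."""
    return f"Hello, {name}!"


# Remote tools from a HUD Hub environment are routed alongside local ones.
# The "browser" hub and its browser_navigate tool are taken from the docstring example.
env.connect_hub("browser", prefix="browser")

# Mock mode (as in the docstring example): tool calls return mock values.
env.mock()
env.mock_tool("browser_navigate", "Navigation successful")


async def main() -> None:
    async with env:
        print(await env.call_tool("greet", name="World"))
        print(await env.call_tool("browser_navigate", url="https://example.com"))


asyncio.run(main())
```

As described in its docstring, call_tool also accepts provider-native tool-call objects (OpenAI, Claude, Gemini) and formats the result to match the input format.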