hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/environment/tests/test_scenarios.py
ADDED
@@ -0,0 +1,280 @@
```python
"""Tests for Environment scenario decorator."""

from __future__ import annotations

import pytest

from hud.environment import Environment


class TestScenarioDecorator:
    """Tests for @env.scenario decorator."""

    def test_scenario_registers_function(self) -> None:
        """@env.scenario registers the function."""
        env = Environment("test-env")

        @env.scenario("greet")
        async def greet_scenario(name: str):
            yield f"Hello, {name}!"
            yield 1.0

        assert "greet" in env._scenarios

    def test_scenario_creates_mcp_prompt(self) -> None:
        """@env.scenario creates an MCP prompt."""
        env = Environment("test-env")

        @env.scenario("greet", description="Greeting scenario")
        async def greet_scenario(name: str):
            yield f"Hello, {name}!"
            yield 1.0

        # Check that prompt was registered via prompt manager
        prompt_names = list(env._prompt_manager._prompts.keys())
        assert "test-env:greet" in prompt_names

    def test_scenario_creates_mcp_resource(self) -> None:
        """@env.scenario creates an MCP resource."""
        env = Environment("test-env")

        @env.scenario("greet")
        async def greet_scenario(name: str):
            yield f"Hello, {name}!"
            yield 1.0

        # Check that resource was registered via resource manager
        resource_uris = list(env._resource_manager._resources.keys())
        assert "test-env:greet" in resource_uris

    def test_scenario_extracts_arguments(self) -> None:
        """@env.scenario extracts function arguments for prompt."""
        env = Environment("test-env")

        @env.scenario("checkout")
        async def checkout_scenario(user_id: str, amount: int = 100):
            yield f"Checkout for {user_id}: ${amount}"
            yield 1.0

        # Find the prompt
        prompt = env._prompt_manager._prompts.get("test-env:checkout")
        assert prompt is not None
        assert prompt.arguments is not None

        # Check arguments
        arg_names = [arg.name for arg in prompt.arguments]
        assert "user_id" in arg_names
        assert "amount" in arg_names


class TestScenarioExecution:
    """Tests for scenario execution flow."""

    @pytest.mark.asyncio
    async def test_scenario_setup_phase(self) -> None:
        """Scenario setup phase yields prompt."""
        env = Environment("test-env")
        setup_ran = False

        @env.scenario("test")
        async def test_scenario():
            nonlocal setup_ran
            setup_ran = True
            yield "Test prompt"
            yield 1.0

        # Get the prompt handler
        prompt = env._prompt_manager._prompts.get("test-env:test")
        assert prompt is not None

        # Run setup via prompt render (which calls fn) - no need for context
        result = await prompt.render({})

        assert setup_ran
        # Result is list of PromptMessage
        assert len(result) > 0
        assert "Test prompt" in str(result[0].content)

    @pytest.mark.asyncio
    async def test_scenario_stores_session(self) -> None:
        """Scenario stores generator in session for evaluate phase."""
        env = Environment("test-env")

        @env.scenario("test")
        async def test_scenario():
            yield "Test prompt"
            yield 1.0

        # Run setup via prompt - no need for context
        prompt = env._prompt_manager._prompts.get("test-env:test")
        assert prompt is not None
        await prompt.render({})

        # Check session was stored
        assert "test" in env._scenario_latest

    @pytest.mark.asyncio
    async def test_scenario_full_flow(self) -> None:
        """Scenario runs setup and evaluate phases correctly."""
        env = Environment("test-env")
        phases = []

        @env.scenario("test")
        async def test_scenario():
            phases.append("setup")
            yield "Test prompt"
            phases.append("evaluate")
            yield 0.95

        # Setup phase - no context needed for prompt/resource
        prompt = env._prompt_manager._prompts.get("test-env:test")
        assert prompt is not None
        await prompt.render({})
        assert "setup" in phases
        assert "evaluate" not in phases

        # Evaluate phase
        resource = env._resource_manager._resources.get("test-env:test")
        assert resource is not None
        await resource.read()
        assert "evaluate" in phases


class TestScenarioWithArgs:
    """Tests for scenarios with arguments."""

    @pytest.mark.asyncio
    async def test_scenario_receives_args(self) -> None:
        """Scenario receives arguments from prompt call."""
        env = Environment("test-env")
        received_args = {}

        @env.scenario("checkout")
        async def checkout_scenario(user_id: str, amount: int = 100):
            received_args["user_id"] = user_id
            received_args["amount"] = amount
            yield f"Checkout {user_id}: ${amount}"
            yield 1.0

        prompt = env._prompt_manager._prompts.get("test-env:checkout")
        assert prompt is not None
        # No context needed for prompt render
        await prompt.render({"user_id": "alice", "amount": 50})

        assert received_args["user_id"] == "alice"
        assert received_args["amount"] == 50


class TestScenarioSubmit:
    """Tests for scenario submit and answer flow."""

    @pytest.mark.asyncio
    async def test_submit_stores_answer(self) -> None:
        """submit() stores answer for scenario."""
        env = Environment("test-env")

        @env.scenario("test")
        async def test_scenario():
            yield "What is 2+2?"
            yield 1.0

        # Run setup
        prompt = env._prompt_manager._prompts.get("test-env:test")
        assert prompt is not None
        await prompt.render({})

        # Submit answer
        await env.submit("test", "4")

        assert env._scenario_answers.get("test") == "4"

    @pytest.mark.asyncio
    async def test_scenario_receives_answer(self) -> None:
        """Scenario receives submitted answer via yield."""
        env = Environment("test-env")
        received_answer = None

        @env.scenario("qa")
        async def qa_scenario():
            nonlocal received_answer
            answer = yield "What is 2+2?"
            received_answer = answer
            yield 1.0 if answer == "4" else 0.0

        # Run setup
        prompt = env._prompt_manager._prompts.get("test-env:qa")
        assert prompt is not None
        await prompt.render({})

        # Submit answer
        env._scenario_answers["qa"] = "4"

        # Run evaluate
        resource = env._resource_manager._resources.get("test-env:qa")
        assert resource is not None
        await resource.read()

        assert received_answer == "4"

    @pytest.mark.asyncio
    async def test_scenario_evaluates_answer(self) -> None:
        """Scenario evaluates answer and returns reward."""
        env = Environment("test-env")

        @env.scenario("grading")
        async def grading_scenario():
            answer = yield "What is the capital of France?"
            yield 1.0 if "paris" in answer.lower() else 0.0

        # Run setup
        prompt = env._prompt_manager._prompts.get("test-env:grading")
        assert prompt is not None
        await prompt.render({})

        # Submit correct answer
        env._scenario_answers["grading"] = "Paris"

        # Run evaluate
        resource = env._resource_manager._resources.get("test-env:grading")
        assert resource is not None
        result = await resource.read()

        import json

        data = json.loads(result)
        assert data["reward"] == 1.0


class TestScenarioMeta:
    """Tests for scenario _meta containing code."""

    def test_scenario_captures_source_code(self) -> None:
        """@env.scenario captures function source in meta."""
        env = Environment("test-env")

        @env.scenario("example")
        async def example_scenario(x: int):
            yield f"Process {x}"
            yield 1.0

        prompt = env._prompt_manager._prompts.get("test-env:example")
        assert prompt is not None
        assert prompt.meta is not None
        assert "code" in prompt.meta
        assert "async def example_scenario" in prompt.meta["code"]
        assert "yield" in prompt.meta["code"]

    def test_scenario_meta_on_resource(self) -> None:
        """Resource also has source code in meta."""
        env = Environment("test-env")

        @env.scenario("example")
        async def example_scenario():
            yield "Test"
            yield 1.0

        resource = env._resource_manager._resources.get("test-env:example")
        assert resource is not None
        assert resource.meta is not None
        assert "code" in resource.meta
        assert "async def example_scenario" in resource.meta["code"]
```
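Read together, these tests pin down a two-phase async-generator protocol: rendering the scenario's MCP prompt drives the generator to its first `yield` (setup, producing the agent prompt), `submit()` stores the agent's answer, and reading the matching MCP resource resumes the generator with that answer and collects the reward from the second `yield`. A minimal scenario written against that protocol might look like the following sketch (the environment and scenario names are illustrative, not from the diff):

```python
from hud.environment import Environment

env = Environment("math-env")  # illustrative environment name

@env.scenario("addition")
async def addition_scenario():
    # Setup phase: the first yield publishes the task prompt.
    answer = yield "What is 2+2?"
    # Evaluate phase: the second yield scores the submitted answer.
    yield 1.0 if answer.strip() == "4" else 0.0
```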
hud/environment/tests/test_tools.py
ADDED
@@ -0,0 +1,208 @@
```python
"""Tests for @env.tool() decorator and tool operations."""

from __future__ import annotations

import pytest

from hud.environment import Environment


class TestToolDecorator:
    """Tests for @env.tool() decorator."""

    def test_tool_registers_function(self) -> None:
        """@env.tool registers the function in tool manager."""
        env = Environment("test-env")

        @env.tool()
        def add(a: int, b: int) -> int:
            """Add two numbers."""
            return a + b

        # Check tool was registered
        tool_names = list(env._tool_manager._tools.keys())
        assert "add" in tool_names

    def test_tool_with_custom_name(self) -> None:
        """@env.tool(name=...) uses custom name."""
        env = Environment("test-env")

        @env.tool(name="custom_add")
        def add(a: int, b: int) -> int:
            return a + b

        tool_names = list(env._tool_manager._tools.keys())
        assert "custom_add" in tool_names
        assert "add" not in tool_names

    def test_tool_preserves_docstring(self) -> None:
        """@env.tool preserves function docstring as description."""
        env = Environment("test-env")

        @env.tool()
        def greet(name: str) -> str:
            """Greet someone by name."""
            return f"Hello, {name}!"

        tool = env._tool_manager._tools.get("greet")
        assert tool is not None
        assert "Greet someone by name" in (tool.description or "")

    def test_tool_async_function(self) -> None:
        """@env.tool works with async functions."""
        env = Environment("test-env")

        @env.tool()
        async def fetch_data(url: str) -> str:
            """Fetch data from URL."""
            return f"Data from {url}"

        tool_names = list(env._tool_manager._tools.keys())
        assert "fetch_data" in tool_names

    def test_tool_returns_function(self) -> None:
        """@env.tool returns the original function."""
        env = Environment("test-env")

        @env.tool()
        def add(a: int, b: int) -> int:
            return a + b

        # Should be able to call it directly
        assert add(2, 3) == 5


class TestListTools:
    """Tests for list_tools and as_tools."""

    @pytest.mark.asyncio
    async def test_as_tools_returns_registered_tools(self) -> None:
        """as_tools returns list of registered MCP tools."""
        env = Environment("test-env")

        @env.tool()
        def tool1() -> str:
            return "1"

        @env.tool()
        def tool2() -> str:
            return "2"

        async with env:
            tools = env.as_tools()
            tool_names = [t.name for t in tools]
            assert "tool1" in tool_names
            assert "tool2" in tool_names

    @pytest.mark.asyncio
    async def test_as_tools_empty_when_no_tools(self) -> None:
        """as_tools returns empty list when no tools registered."""
        env = Environment("test-env")
        async with env:
            tools = env.as_tools()
            # May have built-in _hud_submit tool
            user_tools = [t for t in tools if not t.name.startswith("_")]
            assert len(user_tools) == 0


class TestCallTool:
    """Tests for call_tool method."""

    @pytest.mark.asyncio
    async def test_call_tool_executes_function(self) -> None:
        """call_tool executes registered tool function."""
        env = Environment("test-env")
        executed = []

        @env.tool()
        def greet(name: str) -> str:
            executed.append(name)
            return f"Hello, {name}!"

        async with env:
            result = await env.call_tool("greet", name="Alice")

        assert executed == ["Alice"]
        assert result is not None

    @pytest.mark.asyncio
    async def test_call_tool_async_function(self) -> None:
        """call_tool works with async tool functions."""
        env = Environment("test-env")

        @env.tool()
        async def async_greet(name: str) -> str:
            return f"Hello, {name}!"

        async with env:
            result = await env.call_tool("async_greet", name="Bob")

        assert result is not None

    @pytest.mark.asyncio
    async def test_call_tool_not_found(self) -> None:
        """call_tool raises for unknown tool."""
        env = Environment("test-env")

        async with env:
            with pytest.raises(ValueError, match="Tool not found"):
                await env.call_tool("nonexistent")


class TestMockMode:
    """Tests for mock mode."""

    def test_mock_mode_default_false(self) -> None:
        """Mock mode is False by default."""
        env = Environment("test-env")
        assert env._mock_mode is False
        assert env.is_mock is False

    def test_mock_enables_mock_mode(self) -> None:
        """mock() enables mock mode."""
        env = Environment("test-env")
        env.mock()
        assert env._mock_mode is True
        assert env.is_mock is True

    def test_unmock_disables_mock_mode(self) -> None:
        """unmock() disables mock mode."""
        env = Environment("test-env")
        env.mock()
        env.unmock()
        assert env._mock_mode is False

    def test_mock_returns_self_for_chaining(self) -> None:
        """mock() returns self for chaining."""
        env = Environment("test-env")
        result = env.mock()
        assert result is env

    def test_mock_tool_sets_custom_output(self) -> None:
        """mock_tool() sets custom output for a tool."""
        env = Environment("test-env")
        env.mock_tool("navigate", "Custom result")
        assert env._mock_outputs["navigate"] == "Custom result"

    @pytest.mark.asyncio
    async def test_mock_mode_returns_mock_response(self) -> None:
        """Mock mode returns mock response instead of executing tool."""
        env = Environment("test-env")
        call_count = 0

        @env.tool()
        def real_tool() -> str:
            nonlocal call_count
            call_count += 1
            return "real result"

        env.mock()
        env.mock_tool("real_tool", "mocked result")

        async with env:
            result = await env.call_tool("real_tool")

        # Tool should not be called in mock mode
        assert call_count == 0
        # Should get the mock result
        assert result is not None
```
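The mock-mode tests above suggest a compact pattern for stubbing tools out during development: enable mocking, register a canned output per tool, and call tools as usual. A sketch under those assumptions (the environment name, tool body, and URL are illustrative):

```python
import asyncio

from hud.environment import Environment

env = Environment("demo-env")  # illustrative environment name

@env.tool()
def navigate(url: str) -> str:
    """Pretend to navigate a browser to a URL."""
    return f"Visited {url}"

# mock() returns the environment, so the two calls chain.
env.mock().mock_tool("navigate", "mocked navigation result")

async def main() -> None:
    async with env:
        # In mock mode the real tool body is skipped and a mock response is returned.
        result = await env.call_tool("navigate", url="https://example.com")
        print(result)

asyncio.run(main())
```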
hud/environment/types.py
ADDED
@@ -0,0 +1,23 @@
```python
"""Environment types for configuration and tracing."""

from __future__ import annotations

from pydantic import BaseModel, Field

__all__ = ["EnvConfig"]


class EnvConfig(BaseModel):
    """Environment configuration for Tasks.

    Specifies which hub to connect to and optional tool filtering.

    Attributes:
        name: Hub name to connect via connect_hub() (e.g., "browser", "sheets")
        include: Optional whitelist of tool names to include
        exclude: Optional blacklist of tool names to exclude
    """

    name: str = Field(description="Hub name to connect to")
    include: list[str] | None = Field(default=None, description="Whitelist of tool names")
    exclude: list[str] | None = Field(default=None, description="Blacklist of tool names")
```
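Since `EnvConfig` is a plain Pydantic model, constructing one is straightforward; a minimal sketch (the hub name comes from the docstring's examples, the tool names are illustrative):

```python
from hud.environment.types import EnvConfig

# "browser" is one of the hub names suggested in the docstring.
config = EnvConfig(name="browser", include=["navigate", "click"])

assert config.exclude is None  # both filters default to None
print(config.model_dump())     # {'name': 'browser', 'include': ['navigate', 'click'], 'exclude': None}
```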
hud/environment/utils/__init__.py
ADDED
@@ -0,0 +1,35 @@
```python
"""Environment utilities."""

from hud.environment.utils.formats import (
    ToolFormat,
    format_result,
    parse_tool_call,
    parse_tool_calls,
    result_to_string,
)
from hud.environment.utils.schema import (
    ensure_strict_schema,
    json_type_to_python,
    schema_to_pydantic,
)
from hud.environment.utils.tool_wrappers import (
    create_async_tool_fn,
    create_sync_tool_fn,
    create_tool_fns,
    stringify_result,
)

__all__ = [
    "ToolFormat",
    "create_async_tool_fn",
    "create_sync_tool_fn",
    "create_tool_fns",
    "ensure_strict_schema",
    "format_result",
    "json_type_to_python",
    "parse_tool_call",
    "parse_tool_calls",
    "result_to_string",
    "schema_to_pydantic",
    "stringify_result",
]
```