hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Tests for hud.eval.context module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from unittest.mock import AsyncMock, MagicMock, patch
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from hud.eval.context import (
|
|
10
|
+
EvalContext,
|
|
11
|
+
get_current_trace_headers,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TestEvalContext:
    """Tests for EvalContext."""

    def test_init_generates_trace_id(self) -> None:
        """EvalContext generates trace_id if not provided."""
        ctx = EvalContext(name="test-task", quiet=True)

        assert ctx.trace_id is not None
        assert len(ctx.trace_id) == 36  # UUID format

    def test_init_uses_provided_trace_id(self) -> None:
        """EvalContext uses provided trace_id."""
        ctx = EvalContext(name="test-task", trace_id="custom-id", quiet=True)

        assert ctx.trace_id == "custom-id"

    def test_headers_contains_trace_id(self) -> None:
        """headers property returns dict with trace ID."""
        ctx = EvalContext(name="test-task", trace_id="test-123", quiet=True)

        assert ctx.headers == {"Trace-Id": "test-123"}

    def test_success_true_when_no_error(self) -> None:
        """success property returns True when no error."""
        ctx = EvalContext(name="test-task", quiet=True)

        assert ctx.success is True

    def test_success_false_when_error(self) -> None:
        """success property returns False when error is set."""
        ctx = EvalContext(name="test-task", quiet=True)
        # Assigning any exception object to .error should flip success.
        ctx.error = ValueError("test error")

        assert ctx.success is False

    def test_variants_empty_by_default(self) -> None:
        """variants is empty dict by default."""
        ctx = EvalContext(name="test-task", quiet=True)

        assert ctx.variants == {}

    def test_variants_set_from_init(self) -> None:
        """variants set from parameter."""
        ctx = EvalContext(
            name="test-task",
            variants={"model": "gpt-4o", "temp": 0.7},
            quiet=True,
        )

        assert ctx.variants == {"model": "gpt-4o", "temp": 0.7}

    @pytest.mark.asyncio
    async def test_context_manager_sets_headers(self) -> None:
        """Context manager sets trace headers in contextvar."""
        ctx = EvalContext(name="test-task", trace_id="test-123", quiet=True)

        # Mock telemetry calls
        # NOTE(review): the __aenter__/__aexit__ patches below are installed but
        # the context manager is never actually entered in this test — the
        # contextvar is driven by hand instead. Confirm whether the patches are
        # still needed or are leftovers from an earlier version of the test.
        with (
            patch.object(ctx, "_eval_enter", new_callable=AsyncMock),
            patch.object(ctx, "_eval_exit", new_callable=AsyncMock),
            patch.object(EvalContext, "__aenter__", return_value=ctx),
            patch.object(EvalContext, "__aexit__", return_value=None),
        ):
            assert get_current_trace_headers() is None

            # Manually set token for test
            from hud.eval.context import _current_trace_headers

            # Drive the ContextVar directly: set, read back, then reset so the
            # surrounding test context is left untouched even on failure.
            token = _current_trace_headers.set(ctx.headers)
            try:
                headers = get_current_trace_headers()
                assert headers is not None
                assert headers["Trace-Id"] == "test-123"
            finally:
                _current_trace_headers.reset(token)

            assert get_current_trace_headers() is None

    def test_repr(self) -> None:
        """__repr__ shows useful info."""
        ctx = EvalContext(
            name="test-task",
            trace_id="abc12345-6789-0000-0000-000000000000",
            quiet=True,
        )
        ctx.reward = 0.95

        # repr should surface (at least a prefix of) the trace id, the name,
        # and the reward value.
        repr_str = repr(ctx)
        assert "abc12345" in repr_str
        assert "test-task" in repr_str
        assert "0.95" in repr_str
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class TestEvalContextPrompt:
    """Tests covering the prompt attribute on EvalContext."""

    def test_prompt_can_be_set(self) -> None:
        """The prompt attribute is writable and reads back unchanged."""
        context = EvalContext(name="test-task", quiet=True)
        context.prompt = "Test prompt"
        assert context.prompt == "Test prompt"

    def test_prompt_included_in_payload(self) -> None:
        """A set prompt flows through into the base eval payload."""
        context = EvalContext(name="test-task", quiet=True)
        context.prompt = "Test prompt"

        base_payload = context._build_base_payload()
        assert base_payload.prompt == "Test prompt"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class TestEvalContextFromEnvironment:
    """Tests for the EvalContext.from_environment factory."""

    def test_copies_connections(self) -> None:
        """from_environment duplicates each parent connection via .copy()."""
        from hud.environment import Environment

        source_env = Environment("parent-env")
        # Install a fake connection whose copy() hands back a distinct object.
        conn_mock = MagicMock()
        conn_clone = MagicMock()
        conn_mock.copy.return_value = conn_clone
        source_env._connections["test-conn"] = conn_mock

        eval_ctx = EvalContext.from_environment(source_env, name="test-task")

        # The context must hold the clone, not the parent's original object.
        assert "test-conn" in eval_ctx._connections
        conn_mock.copy.assert_called_once()
        assert eval_ctx._connections["test-conn"] is conn_clone

    def test_copies_prompt(self) -> None:
        """from_environment carries the parent's prompt over."""
        from hud.environment import Environment

        source_env = Environment("parent-env")
        source_env.prompt = "Parent prompt"

        eval_ctx = EvalContext.from_environment(source_env, name="test-task")

        assert eval_ctx.prompt == "Parent prompt"

    def test_sets_eval_properties(self) -> None:
        """from_environment applies all eval-specific keyword settings."""
        from hud.environment import Environment

        source_env = Environment("parent-env")

        eval_ctx = EvalContext.from_environment(
            source_env,
            name="test-task",
            trace_id="custom-trace",
            variants={"model": "gpt-4o"},
            group_id="group-123",
            index=5,
        )

        assert eval_ctx.eval_name == "test-task"
        assert eval_ctx.trace_id == "custom-trace"
        assert eval_ctx.variants == {"model": "gpt-4o"}
        assert eval_ctx.group_id == "group-123"
        assert eval_ctx.index == 5
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""Tests for hud.eval.task module (Task class)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from hud.eval.task import Task
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestTaskDataclass:
    """Tests exercising Task as a Pydantic model."""

    def test_init_defaults(self) -> None:
        """A bare Task() carries None/empty defaults."""
        blank = Task()

        assert blank.env is None
        assert blank.scenario is None
        assert blank.args == {}

    def test_init_with_env_dict(self) -> None:
        """A plain dict passed as env is validated into an Environment."""
        from hud.environment import Environment

        built = Task(
            env={"name": "browser", "include": ["navigate"]},
            scenario="checkout",
            args={"user_id": "alice"},
        )

        # The validator coerces the raw dict into a real Environment.
        assert isinstance(built.env, Environment)
        assert built.scenario == "checkout"
        assert built.args == {"user_id": "alice"}

    def test_copy_creates_new_instance(self) -> None:
        """copy() yields a distinct Task whose args dict is duplicated."""
        source = Task(
            env={"name": "test"},
            scenario="checkout",
            args={"user_id": "alice"},
        )
        clone = source.copy()

        assert clone is not source
        # The environment reference is deliberately shared between copies.
        assert clone.env is source.env
        assert clone.scenario == source.scenario
        # args compare equal but live in a separate dict object.
        assert clone.args == source.args
        assert clone.args is not source.args
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class TestEnvironmentCall:
    """Tests for calling an Environment to produce a Task."""

    def test_call_returns_task(self) -> None:
        """Invoking an Environment yields a Task instance."""
        from hud.environment import Environment

        environment = Environment("test-env")
        produced = environment()

        assert isinstance(produced, Task)

    def test_call_with_scenario_sets_scenario(self) -> None:
        """A positional scenario name lands on the resulting Task."""
        from hud.environment import Environment

        environment = Environment("test-env")
        produced = environment("checkout")

        assert produced.scenario == "checkout"

    def test_call_with_args_sets_args(self) -> None:
        """Keyword arguments become the Task's args mapping."""
        from hud.environment import Environment

        environment = Environment("test-env")
        produced = environment("checkout", user_id="alice", amount=100)

        assert produced.args == {"user_id": "alice", "amount": 100}

    def test_call_returns_task_with_env(self) -> None:
        """The produced Task keeps a reference to its Environment."""
        from hud.environment import Environment

        environment = Environment("test-env")
        produced = environment()
        assert produced.env is environment

        # Legacy v4 path: setup_tool chains on the environment itself.
        legacy_env = Environment("test-env").setup_tool("navigate", url="https://example.com")
        legacy_task = legacy_env()
        assert legacy_task.env is legacy_env
        assert len(legacy_task.env._setup_calls) == 1
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class TestTaskFromV4:
    """Tests for Task.from_v4() migration helper."""

    def test_from_v4_with_legacy_task(self) -> None:
        """Task.from_v4() accepts LegacyTask object."""
        import warnings

        # Suppress the deprecation warning from LegacyTask
        # NOTE(review): both the import and the construction are kept inside
        # the catch_warnings scope since either may emit the warning — confirm
        # which one actually raises it.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            from hud.types import LegacyTask

            legacy = LegacyTask(
                prompt="Navigate to google.com",
                mcp_config={"hud": {"url": "https://mcp.hud.ai"}},
                evaluate_tool={"name": "check", "arguments": {}},
            )

        task = Task.from_v4(legacy)

        assert isinstance(task, Task)
        assert task.env is not None
        assert task.env.prompt == "Navigate to google.com"
        assert task.scenario is None  # Uses setup/evaluate_tool, not scenarios

    def test_from_v4_with_dict(self) -> None:
        """Task.from_v4() accepts dict with LegacyTask fields."""
        task = Task.from_v4(
            {
                "prompt": "Navigate to google.com",
                "mcp_config": {"hud": {"url": "https://mcp.hud.ai"}},
                "evaluate_tool": {"name": "check", "arguments": {}},
            }
        )

        assert isinstance(task, Task)
        assert task.env is not None
        assert task.env.prompt == "Navigate to google.com"

    def test_from_v4_with_json_string(self) -> None:
        """Task.from_v4() accepts JSON string."""
        import json

        data = {
            "prompt": "Navigate to google.com",
            "mcp_config": {"hud": {"url": "https://mcp.hud.ai"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
        }
        # Serialized form should round-trip the same as the dict form above.
        task = Task.from_v4(json.dumps(data))

        assert isinstance(task, Task)
        assert task.env is not None
        assert task.env.prompt == "Navigate to google.com"

    def test_from_v4_with_setup_tool(self) -> None:
        """Task.from_v4() preserves setup_tool via env._setup_calls."""
        task = Task.from_v4(
            {
                "prompt": "Check URL",
                "mcp_config": {"hud": {"url": "https://mcp.hud.ai"}},
                "setup_tool": {"name": "navigate", "arguments": {"url": "https://google.com"}},
                "evaluate_tool": {"name": "check", "arguments": {}},
            }
        )

        # setup_tool is converted to env._setup_calls
        assert len(task.env._setup_calls) == 1
        assert task.env._setup_calls[0] == ("navigate", {"url": "https://google.com"})

    def test_from_v4_with_evaluate_tool(self) -> None:
        """Task.from_v4() preserves evaluate_tool via env._evaluate_calls."""
        task = Task.from_v4(
            {
                "prompt": "Check URL",
                "mcp_config": {"hud": {"url": "https://mcp.hud.ai"}},
                "evaluate_tool": {"name": "check_url", "arguments": {"expected": "google"}},
            }
        )

        # evaluate_tool is converted to env._evaluate_calls
        assert len(task.env._evaluate_calls) == 1
        assert task.env._evaluate_calls[0] == ("check_url", {"expected": "google"})

    def test_from_v4_with_invalid_type_raises(self) -> None:
        """Task.from_v4() raises TypeError for invalid input."""
        with pytest.raises(TypeError):
            Task.from_v4(12345)  # type: ignore[arg-type]

    def test_from_v4_with_invalid_json_raises(self) -> None:
        """Task.from_v4() raises JSONDecodeError for invalid JSON."""
        import json

        with pytest.raises(json.JSONDecodeError):
            Task.from_v4("not valid json")

    def test_from_v4_does_not_warn_on_use(self) -> None:
        """Task.from_v4() suppresses LegacyTask deprecation warning."""
        import warnings

        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            Task.from_v4(
                {
                    "prompt": "test",
                    "mcp_config": {"hud": {}},
                    "evaluate_tool": {"name": "check", "arguments": {}},
                }
            )

        # Should not trigger deprecation warning since we're migrating
        legacy_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
        assert len(legacy_warnings) == 0
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Tests for hud.eval.manager module (hud.eval() function)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from unittest.mock import AsyncMock, patch
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from hud.eval.context import EvalContext, get_current_trace_headers
|
|
10
|
+
from hud.eval.manager import run_eval
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TestRunEvalNoArgs:
    """Tests for hud.eval() with no arguments (blank eval)."""

    # NOTE(review): every test below patches EvalContext._eval_enter/_eval_exit
    # with AsyncMock — presumably to stop the context manager from reaching a
    # telemetry backend during tests. Confirm against EvalContext's
    # implementation.

    @pytest.mark.asyncio
    async def test_blank_eval_creates_context(self) -> None:
        """hud.eval() with no args creates an EvalContext."""
        with (
            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock),
        ):
            async with run_eval(quiet=True) as ctx:
                assert isinstance(ctx, EvalContext)
                assert ctx.eval_name == "eval"

    @pytest.mark.asyncio
    async def test_blank_eval_generates_trace_id(self) -> None:
        """hud.eval() with no args generates a trace_id."""
        with (
            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock),
        ):
            async with run_eval(quiet=True) as ctx:
                assert ctx.trace_id is not None
                assert len(ctx.trace_id) == 36  # UUID format

    @pytest.mark.asyncio
    async def test_blank_eval_sets_trace_headers(self) -> None:
        """hud.eval() sets trace headers in contextvar during context."""
        with (
            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock),
        ):
            # Before context, no headers
            assert get_current_trace_headers() is None

            async with run_eval(quiet=True) as ctx:
                # Inside context, headers are set
                headers = get_current_trace_headers()
                assert headers is not None
                assert headers["Trace-Id"] == ctx.trace_id

            # After context, headers are cleared
            assert get_current_trace_headers() is None

    @pytest.mark.asyncio
    async def test_blank_eval_reward_can_be_set(self) -> None:
        """hud.eval() allows setting reward on context."""
        with (
            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock),
        ):
            async with run_eval(quiet=True) as ctx:
                assert ctx.reward is None
                ctx.reward = 0.95

            # Reward set inside the context remains readable afterwards.
            assert ctx.reward == 0.95

    @pytest.mark.asyncio
    async def test_blank_eval_reports_reward_on_exit(self) -> None:
        """hud.eval() reports reward to backend on exit."""
        with (
            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock) as mock_exit,
        ):
            async with run_eval(quiet=True) as ctx:
                ctx.reward = 0.85

            # _eval_exit should have been called (with no error)
            mock_exit.assert_called_once_with(None)

    @pytest.mark.asyncio
    async def test_blank_eval_empty_variants(self) -> None:
        """hud.eval() with no args has empty variants dict."""
        with (
            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock),
        ):
            async with run_eval(quiet=True) as ctx:
                assert ctx.variants == {}

    @pytest.mark.asyncio
    async def test_blank_eval_has_headers_property(self) -> None:
        """hud.eval() context has headers property for gateway integration."""
        with (
            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock),
        ):
            async with run_eval(quiet=True) as ctx:
                headers = ctx.headers
                assert "Trace-Id" in headers
                assert headers["Trace-Id"] == ctx.trace_id
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class TestRunEvalWithApiKey:
    """Tests for hud.eval() with api_key parameter."""

    @pytest.mark.asyncio
    async def test_api_key_passed_to_context(self) -> None:
        """An explicit api_key ends up stored on the eval context."""
        enter_patch = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_patch = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        # Stub out backend enter/exit hooks, then open a keyed eval.
        with enter_patch, exit_patch:
            async with run_eval(api_key="test-key", quiet=True) as eval_ctx:
                assert eval_ctx._eval_api_key == "test-key"
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class TestRunEvalWithJobId:
    """Tests for hud.eval() with job_id parameter."""

    @pytest.mark.asyncio
    async def test_job_id_passed_to_context(self) -> None:
        """An explicit job_id ends up stored on the eval context."""
        enter_patch = patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock)
        exit_patch = patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock)

        # Stub out backend enter/exit hooks, then open an eval bound to a job.
        with enter_patch, exit_patch:
            async with run_eval(job_id="job-123", quiet=True) as eval_ctx:
                assert eval_ctx.job_id == "job-123"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class TestRunEvalErrorHandling:
    """Tests for hud.eval() error handling."""

    @pytest.mark.asyncio
    async def test_error_tracked_on_exception(self) -> None:
        """An exception raised inside hud.eval() is reported via _eval_exit."""
        with (
            patch.object(EvalContext, "_eval_enter", new_callable=AsyncMock),
            patch.object(EvalContext, "_eval_exit", new_callable=AsyncMock) as exit_mock,
        ):
            # The exception must propagate out of the context manager...
            with pytest.raises(ValueError):
                async with run_eval(quiet=True):
                    raise ValueError("test error")

            # ...and _eval_exit must receive a non-None message describing it.
            exit_mock.assert_called_once()
            reported = exit_mock.call_args[0][0]
            assert reported is not None
            assert "test error" in reported
|