hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""Tests for hud.datasets.loader module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from unittest.mock import MagicMock, patch
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from hud.datasets.loader import load_tasks
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TestLoadTasks:
    """Exercise load_tasks() against a patched httpx.Client and patched settings."""

    @staticmethod
    def _stub_settings(mock_settings: MagicMock, api_key: str | None) -> None:
        """Give the patched settings object the API URL and key used by these tests."""
        mock_settings.hud_api_url = "https://api.hud.ai"
        mock_settings.api_key = api_key

    @staticmethod
    def _json_response(payload: dict) -> MagicMock:
        """Build a response mock whose .json() returns *payload*."""
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()
        return response

    @staticmethod
    def _context_client(mock_client_class: MagicMock) -> MagicMock:
        """Make the patched client class return a mock usable as a context manager."""
        client = MagicMock()
        client.__enter__.return_value = client
        client.__exit__.return_value = None
        mock_client_class.return_value = client
        return client

    @patch("httpx.Client")
    @patch("hud.settings.settings")
    def test_load_tasks_success(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """A two-task response yields two tasks and one authenticated GET."""
        self._stub_settings(mock_settings, "test_key")
        # EvalsetTasksResponse shape: the "tasks" mapping is keyed by task ID.
        payload = {
            "evalset_id": "evalset-123",
            "evalset_name": "test-dataset",
            "tasks": {
                "task-1": {
                    "env": {"name": "test"},
                    "scenario": "checkout",
                    "args": {"user": "alice"},
                },
                "task-2": {
                    "env": {"name": "test"},
                    "scenario": "login",
                    "args": {"user": "bob"},
                },
            },
        }
        client = self._context_client(mock_client_class)
        client.get.return_value = self._json_response(payload)

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 2
        # Compare as sets so the checks do not depend on result ordering.
        assert {task.scenario for task in loaded} == {"checkout", "login"}
        # Each task's id is expected to equal its key in the "tasks" mapping.
        assert {task.id for task in loaded} == {"task-1", "task-2"}
        client.get.assert_called_once_with(
            "https://api.hud.ai/tasks/evalset/test-org/test-dataset",
            headers={"Authorization": "Bearer test_key"},
            params={"all": "true"},
        )

    @patch("httpx.Client")
    @patch("hud.settings.settings")
    def test_load_tasks_single_task(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """A response holding exactly one task yields exactly one task."""
        self._stub_settings(mock_settings, "test_key")
        payload = {
            "evalset_id": "evalset-123",
            "evalset_name": "test-dataset",
            "tasks": {
                "task-1": {
                    "env": {"name": "test"},
                    "scenario": "checkout",
                    "args": {"user": "alice"},
                },
            },
        }
        client = self._context_client(mock_client_class)
        client.get.return_value = self._json_response(payload)

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 1
        only_task = loaded[0]
        assert only_task.scenario == "checkout"
        assert only_task.id == "task-1"

    @patch("httpx.Client")
    @patch("hud.settings.settings")
    def test_load_tasks_no_api_key(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """With api_key set to None the GET is issued with empty headers."""
        self._stub_settings(mock_settings, None)
        payload = {
            "evalset_id": "evalset-123",
            "evalset_name": "test-dataset",
            "tasks": {},
        }
        client = self._context_client(mock_client_class)
        client.get.return_value = self._json_response(payload)

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 0
        client.get.assert_called_once_with(
            "https://api.hud.ai/tasks/evalset/test-org/test-dataset",
            headers={},
            params={"all": "true"},
        )

    @patch("httpx.Client")
    @patch("hud.settings.settings")
    def test_load_tasks_http_error(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """An httpx.HTTPError raised by GET surfaces as ValueError."""
        import httpx

        self._stub_settings(mock_settings, "test_key")
        client = self._context_client(mock_client_class)
        client.get.side_effect = httpx.HTTPError("Network error")

        with pytest.raises(ValueError, match="Failed to load tasks"):
            load_tasks("test-org/test-dataset")

    @patch("httpx.Client")
    @patch("hud.settings.settings")
    def test_load_tasks_json_error(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """An exception raised by response.json() surfaces as ValueError."""
        self._stub_settings(mock_settings, "test_key")

        # Built by hand: this response must raise from .json() instead of returning.
        broken_response = MagicMock()
        broken_response.json.side_effect = Exception("Invalid JSON")
        broken_response.raise_for_status = MagicMock()

        client = self._context_client(mock_client_class)
        client.get.return_value = broken_response

        with pytest.raises(ValueError, match="Failed to load tasks"):
            load_tasks("test-org/test-dataset")

    @patch("httpx.Client")
    @patch("hud.settings.settings")
    def test_load_tasks_empty(self, mock_settings: MagicMock, mock_client_class: MagicMock) -> None:
        """An empty "tasks" mapping yields no tasks."""
        self._stub_settings(mock_settings, "test_key")
        client = self._context_client(mock_client_class)
        client.get.return_value = self._json_response({"tasks": {}})

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 0

    @patch("httpx.Client")
    @patch("hud.settings.settings")
    def test_load_tasks_missing_fields(
        self, mock_settings: MagicMock, mock_client_class: MagicMock
    ) -> None:
        """A task given only env and scenario still loads, with args equal to {}."""
        self._stub_settings(mock_settings, "test_key")
        payload = {
            "tasks": {"task-1": {"env": {"name": "test-env"}, "scenario": "test"}},
        }
        client = self._context_client(mock_client_class)
        client.get.return_value = self._json_response(payload)

        loaded = load_tasks("test-org/test-dataset")

        assert len(loaded) == 1
        only_task = loaded[0]
        assert only_task.scenario == "test"
        assert only_task.id == "task-1"
        assert only_task.args == {}
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""Tests for hud.datasets.utils module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from unittest.mock import AsyncMock, MagicMock, patch
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from hud.datasets.utils import (
|
|
10
|
+
BatchRequest,
|
|
11
|
+
SingleTaskRequest,
|
|
12
|
+
cancel_all_jobs,
|
|
13
|
+
cancel_job,
|
|
14
|
+
cancel_task,
|
|
15
|
+
submit_rollouts,
|
|
16
|
+
)
|
|
17
|
+
from hud.eval.display import display_results
|
|
18
|
+
from hud.types import AgentType, LegacyTask, Trace
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TestSingleTaskRequest:
    """Validation checks for the SingleTaskRequest schema."""

    def test_valid_request(self):
        """A fully populated request with a v5 (env + scenario) task is accepted."""
        v5_task = {"env": {"name": "browser"}, "scenario": "checkout"}

        built = SingleTaskRequest(
            task=v5_task,
            agent_type=AgentType.CLAUDE,
            agent_params={"checkpoint_name": "claude-sonnet-4-5"},
            max_steps=10,
            job_id="job-123",
            task_id="task-1",
            trace_name="Test trace",
        )

        assert built.task_id == "task-1"
        assert built.agent_type == AgentType.CLAUDE

    def test_empty_job_id_rejected(self):
        """An empty job_id raises ValueError."""
        task_payload = {"prompt": "test", "mcp_config": {}}

        with pytest.raises(ValueError, match="job_id must be a non-empty string"):
            SingleTaskRequest(
                task=task_payload,
                agent_type=AgentType.CLAUDE,
                job_id="",
                task_id="task-1",
                trace_name="Test",
            )

    def test_invalid_task_rejected(self):
        """A payload matching neither the v4 nor the v5 shape raises ValueError."""
        # Carries none of the fields either task format requires.
        unrecognised_task = {"invalid_field": "test"}

        with pytest.raises(ValueError, match="Task must have 'env'"):
            SingleTaskRequest(
                task=unrecognised_task,
                agent_type=AgentType.CLAUDE,
                job_id="job-123",
                task_id="task-1",
                trace_name="Test",
            )

    def test_incomplete_v4_task_rejected(self):
        """A v4-looking task without evaluate_tool raises ValueError."""
        # prompt + mcp_config are present, evaluate_tool is deliberately omitted;
        # the expected error message shows the payload is treated as v4.
        partial_v4_task = {
            "prompt": "test",
            "mcp_config": {"server": {"url": "http://localhost"}},
        }

        with pytest.raises(ValueError, match="v4 task missing required fields"):
            SingleTaskRequest(
                task=partial_v4_task,
                agent_type=AgentType.CLAUDE,
                job_id="job-123",
                task_id="task-1",
                trace_name="Test",
            )

    def test_valid_v4_task_accepted(self):
        """A v4 task carrying prompt, mcp_config and evaluate_tool is accepted."""
        complete_v4_task = {
            "prompt": "test",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
        }

        built = SingleTaskRequest(
            task=complete_v4_task,
            agent_type=AgentType.CLAUDE,
            job_id="job-123",
            task_id="task-1",
            trace_name="Test",
        )

        assert built.task_id == "task-1"

    def test_valid_v5_task_accepted(self):
        """A v5 task carrying env and scenario is accepted."""
        v5_task = {"env": {"name": "browser"}, "scenario": "login"}

        built = SingleTaskRequest(
            task=v5_task,
            agent_type=AgentType.CLAUDE,
            job_id="job-123",
            task_id="task-1",
            trace_name="Test",
        )

        assert built.task_id == "task-1"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class TestBatchRequest:
    """Checks for the BatchRequest schema."""

    def test_valid_batch(self):
        """A batch built from three valid single-task requests keeps all three."""
        members = []
        for index in range(3):
            members.append(
                SingleTaskRequest(
                    task={"env": {"name": "browser"}, "scenario": "test"},
                    agent_type=AgentType.CLAUDE,
                    job_id="job-123",
                    task_id=f"task-{index}",
                    trace_name=f"Trace {index}",
                )
            )

        built = BatchRequest(requests=members)

        assert len(built.requests) == 3
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class TestCancellationFunctions:
    """Checks for cancel_task, cancel_job and cancel_all_jobs with a mocked HTTP client."""

    @staticmethod
    def _wire_async_client(mock_client_cls: MagicMock, payload: dict) -> AsyncMock:
        """Make the patched AsyncClient class yield a client whose POST returns *payload*."""
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()

        client = AsyncMock()
        client.post.return_value = response
        # The client is entered with `async with`, so it must hand itself back.
        client.__aenter__.return_value = client
        client.__aexit__.return_value = None
        mock_client_cls.return_value = client
        return client

    @pytest.mark.asyncio
    async def test_cancel_task(self):
        """cancel_task POSTs once to a cancel URL with the job and task IDs in the JSON body."""
        client_patch = patch("hud.datasets.utils.httpx.AsyncClient")
        settings_patch = patch("hud.datasets.utils.settings")

        with client_patch as mock_client_cls, settings_patch as mock_settings:
            client = self._wire_async_client(
                mock_client_cls, {"cancelled": True, "task_id": "task-1"}
            )
            mock_settings.hud_api_url = "https://api.hud.ai"
            mock_settings.api_key = "test-key"

            outcome = await cancel_task("job-123", "task-1")

        assert outcome["cancelled"] is True
        client.post.assert_called_once()
        post_call = client.post.call_args
        assert "cancel" in post_call.args[0]
        sent_json = post_call.kwargs["json"]
        assert sent_json["job_id"] == "job-123"
        assert sent_json["task_id"] == "task-1"

    @pytest.mark.asyncio
    async def test_cancel_job(self):
        """cancel_job POSTs once and returns the decoded response body."""
        client_patch = patch("hud.datasets.utils.httpx.AsyncClient")
        settings_patch = patch("hud.datasets.utils.settings")

        with client_patch as mock_client_cls, settings_patch as mock_settings:
            client = self._wire_async_client(
                mock_client_cls, {"cancelled": 5, "job_id": "job-123"}
            )
            mock_settings.hud_api_url = "https://api.hud.ai"
            mock_settings.api_key = "test-key"

            outcome = await cancel_job("job-123")

        assert outcome["cancelled"] == 5
        client.post.assert_called_once()

    @pytest.mark.asyncio
    async def test_cancel_all_jobs(self):
        """cancel_all_jobs returns the job and task counts from the response body."""
        client_patch = patch("hud.datasets.utils.httpx.AsyncClient")
        settings_patch = patch("hud.datasets.utils.settings")

        with client_patch as mock_client_cls, settings_patch as mock_settings:
            self._wire_async_client(
                mock_client_cls, {"jobs_cancelled": 3, "total_tasks_cancelled": 10}
            )
            mock_settings.hud_api_url = "https://api.hud.ai"
            mock_settings.api_key = "test-key"

            outcome = await cancel_all_jobs()

        assert outcome["jobs_cancelled"] == 3
        assert outcome["total_tasks_cancelled"] == 10
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class TestDisplayResults:
    """Smoke tests: display_results must not raise for each supported input shape."""

    def test_display_with_traces(self):
        """Two Trace results paired with two legacy tasks render without error."""
        legacy_tasks = [
            LegacyTask(id=f"t{number}", prompt=f"Test task {number}", mcp_config={})
            for number in (1, 2)
        ]
        traces = [Trace(reward=reward, done=True) for reward in (0.9, 0.5)]

        # Passing means no exception; the output itself is not inspected.
        display_results(traces, tasks=legacy_tasks)

    def test_display_with_group_stats(self):
        """A per-task statistics dict renders without error."""
        legacy_tasks = [LegacyTask(id="t1", prompt="Test task 1", mcp_config={})]
        group_stats = {
            "task_id": "t1",
            "prompt": "Test task 1",
            "mean_reward": 0.85,
            "std_reward": 0.1,
            "min_reward": 0.7,
            "max_reward": 1.0,
            "success_rate": 0.9,
            "group_size": 3,
            "rewards": [0.8, 0.85, 0.9],
        }

        # Passing means no exception; the output itself is not inspected.
        display_results([group_stats], tasks=legacy_tasks)

    def test_display_empty_results(self):
        """A results list holding only None renders without error."""
        legacy_tasks = [LegacyTask(prompt="Test", mcp_config={})]
        no_results: list[Trace | None] = [None]

        # Passing means no exception; the output itself is not inspected.
        display_results(no_results, tasks=legacy_tasks)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class TestSubmitRollouts:
    """Checks for submit_rollouts with a mocked HTTP client."""

    @staticmethod
    def _wire_async_client(mock_client_cls: MagicMock, payload: dict) -> AsyncMock:
        """Make the patched AsyncClient class yield a client whose POST returns *payload*."""
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()

        client = AsyncMock()
        client.post.return_value = response
        # The client is entered with `async with`, so it must hand itself back.
        client.__aenter__.return_value = client
        client.__aexit__.return_value = None
        mock_client_cls.return_value = client
        return client

    @pytest.mark.asyncio
    async def test_submit_single_task(self):
        """Submitting one v5 task results in exactly one POST."""
        from hud.eval.task import Task

        v5_tasks = [Task(env={"name": "browser"}, scenario="test", id="task-1")]
        client_patch = patch("hud.datasets.utils.httpx.AsyncClient")
        settings_patch = patch("hud.datasets.utils.settings")

        with client_patch as mock_client_cls, settings_patch as mock_settings:
            client = self._wire_async_client(mock_client_cls, {"accepted": 1, "rejected": 0})
            mock_settings.hud_api_url = "https://api.hud.ai"
            mock_settings.api_key = "test-key"

            # The return value is not used; only the outgoing call is checked.
            await submit_rollouts(
                tasks=v5_tasks,
                agent_type=AgentType.CLAUDE,
                job_id="job-123",
            )

        client.post.assert_called_once()

    @pytest.mark.asyncio
    async def test_submit_with_group_size(self):
        """With group_size=3, one task is sent as three requests in the batch."""
        from hud.eval.task import Task

        v5_tasks = [Task(env={"name": "browser"}, scenario="test", id="task-1")]
        client_patch = patch("hud.datasets.utils.httpx.AsyncClient")
        settings_patch = patch("hud.datasets.utils.settings")

        with client_patch as mock_client_cls, settings_patch as mock_settings:
            client = self._wire_async_client(mock_client_cls, {"accepted": 3, "rejected": 0})
            mock_settings.hud_api_url = "https://api.hud.ai"
            mock_settings.api_key = "test-key"

            await submit_rollouts(
                tasks=v5_tasks,
                agent_type=AgentType.CLAUDE,
                job_id="job-123",
                group_size=3,
            )

        # 1 task x group_size 3 -> 3 entries under "requests" in the posted JSON.
        post_call = client.post.call_args
        assert post_call is not None
        posted_batch = post_call.kwargs["json"]
        assert len(posted_batch["requests"]) == 3
|