hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Tests for hud.eval.parallel module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from hud.eval.parallel import (
|
|
10
|
+
ASTExtractionError,
|
|
11
|
+
_extract_body,
|
|
12
|
+
_find_async_with,
|
|
13
|
+
_get_end_line,
|
|
14
|
+
expand_variants,
|
|
15
|
+
resolve_group_ids,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TestExpandVariants:
    """Tests for the ``expand_variants`` helper."""

    def test_none_returns_empty_dict(self) -> None:
        """``None`` variants yield a list holding one empty dict."""
        assert expand_variants(None) == [{}]

    def test_empty_dict_returns_empty_dict(self) -> None:
        """An empty variants dict yields a list holding one empty dict."""
        assert expand_variants({}) == [{}]

    def test_single_value_stays_single(self) -> None:
        """A single non-list value stays a single variant."""
        expanded = expand_variants({"model": "gpt-4o"})
        assert expanded == [{"model": "gpt-4o"}]

    def test_list_expands_to_variants(self) -> None:
        """A list value expands to one variant per element, in order."""
        expanded = expand_variants({"model": ["gpt-4o", "claude"]})
        assert expanded == [{"model": "gpt-4o"}, {"model": "claude"}]

    def test_multiple_lists_create_combinations(self) -> None:
        """Several list values produce every combination."""
        expanded = expand_variants(
            {
                "model": ["a", "b"],
                "temp": [0.0, 1.0],
            }
        )

        expected = [
            {"model": "a", "temp": 0.0},
            {"model": "a", "temp": 1.0},
            {"model": "b", "temp": 0.0},
            {"model": "b", "temp": 1.0},
        ]
        assert len(expanded) == 4
        # Only membership is checked: the ordering of combinations is not
        # part of what this test pins down.
        for combo in expected:
            assert combo in expanded

    def test_mixed_single_and_list(self) -> None:
        """Scalar values are repeated across the variants of a list value."""
        expanded = expand_variants(
            {
                "model": ["gpt-4o", "claude"],
                "temp": 0.7,
            }
        )

        assert len(expanded) == 2
        for model_name in ("gpt-4o", "claude"):
            assert {"model": model_name, "temp": 0.7} in expanded
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class TestResolveGroupIds:
    """Tests for the ``resolve_group_ids`` helper."""

    def test_uses_provided_group_ids(self) -> None:
        """Explicit group ids are returned unchanged."""
        assert resolve_group_ids(["a", "b", "c"], 3) == ["a", "b", "c"]

    def test_generates_shared_group_id(self) -> None:
        """When no ids are given, one generated id is shared by all entries."""
        import uuid

        generated = resolve_group_ids(None, 3)
        assert len(generated) == 3
        # All should be the same
        assert generated[0] == generated[1] == generated[2]
        # Should be a valid UUID: canonical 36-character form that parses.
        assert len(generated[0]) == 36
        uuid.UUID(generated[0])  # raises ValueError if not a UUID string

    def test_raises_on_length_mismatch(self) -> None:
        """A ``ValueError`` is raised when the id count differs from the total."""
        with pytest.raises(ValueError, match="group_ids length"):
            resolve_group_ids(["a", "b"], 3)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class TestASTHelpers:
    """Tests for the AST helper functions."""

    def test_find_async_with_finds_correct_node(self) -> None:
        """``_find_async_with`` finds the async-with containing the target line."""
        # The snippet starts with a newline, so ``async def`` is line 2 and
        # ``do_stuff()`` is line 5.
        source = """
async def main():
    x = 1
    async with something as ctx:
        do_stuff()
        more_stuff()
    y = 2
"""
        tree = ast.parse(source)

        # Line 5 is inside the async with
        node = _find_async_with(tree, 5)
        assert node is not None
        assert isinstance(node, ast.AsyncWith)

    def test_find_async_with_returns_none_when_not_found(self) -> None:
        """``_find_async_with`` returns ``None`` for a line outside any async-with."""
        source = """
async def main():
    x = 1
    async with something as ctx:
        do_stuff()
    y = 2
"""
        tree = ast.parse(source)

        # Line 7 is outside the async with
        node = _find_async_with(tree, 7)
        assert node is None

    def test_get_end_line(self) -> None:
        """``_get_end_line`` reports a line at or past the node's body."""
        source = """
async with ctx:
    line1()
    line2()
    line3()
"""
        tree = ast.parse(source)
        async_with = tree.body[0]

        end_line = _get_end_line(async_with)
        assert end_line >= 4  # At least through line 4

    def test_extract_body(self) -> None:
        """``_extract_body`` returns source text containing the body statements."""
        source = """async with ctx:
    do_thing()
    more_thing()
"""
        # Rebuild newline-terminated lines, the form handed to _extract_body.
        lines = [line + "\n" for line in source.split("\n")]

        tree = ast.parse(source)
        async_with = tree.body[0]
        assert isinstance(async_with, ast.AsyncWith)

        body = _extract_body(lines, async_with)
        assert "do_thing()" in body
        assert "more_thing()" in body
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class TestASTExtractionError:
    """Tests for ``ASTExtractionError``."""

    def test_is_exception(self) -> None:
        """``ASTExtractionError`` is an ``Exception`` that carries its message."""
        err = ASTExtractionError("test message")
        assert isinstance(err, Exception)
        assert str(err) == "test message"
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""Tests for hud.eval.task module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from hud.eval.task import Task, TaskAgentConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestTaskSerialization:
    """Tests for Task serialization and roundtrip."""

    def test_v5_task_roundtrip(self) -> None:
        """A v5 Task survives dump -> construct -> dump unchanged."""
        task = Task(
            env={"name": "browser", "include": ["navigate", "click"]},
            scenario="checkout",
            id="task-1",
            args={"user_id": "alice"},
        )

        # Serialize
        data = task.model_dump(mode="json")

        # Should have v5 format
        assert "env" in data
        assert data["env"]["name"] == "browser"
        assert data["scenario"] == "checkout"
        assert data["id"] == "task-1"

        # Recreate from serialized data and serialize again
        data2 = Task(**data).model_dump(mode="json")

        # Should be identical
        assert data == data2

    def test_v4_task_roundtrip(self) -> None:
        """A v4 Task serializes to the flat v4 layout and roundtrips."""
        v4_dict = {
            "prompt": "Go to google.com and search for cats",
            "mcp_config": {
                "browser": {"url": "http://localhost:8080"},
            },
            "evaluate_tool": {"name": "check_url", "arguments": {"contains": "google"}},
            "setup_tool": {"name": "navigate", "arguments": {"url": "about:blank"}},
            "id": "v4-task-1",
            "agent_config": {"system_prompt": "You are a helpful assistant"},
            "metadata": {"category": "navigation"},
        }

        # Create Task from v4 dict
        task = Task.from_v4(v4_dict)

        # Serialize (should flatten to v4 format)
        data = task.model_dump(mode="json")

        # Should have v4 format (flat, not nested env)
        for key in ("prompt", "mcp_config", "evaluate_tool"):
            assert key in data
        assert data["prompt"] == "Go to google.com and search for cats"
        assert data["id"] == "v4-task-1"

        # Recreate from serialized data and serialize again
        data2 = Task(**data).model_dump(mode="json")

        # Should be identical
        assert data == data2

    def test_v4_preserves_agent_config(self) -> None:
        """A v4 Task keeps agent_config through a roundtrip."""
        v4_dict = {
            "prompt": "Test prompt",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
            "agent_config": {"system_prompt": "Custom system prompt"},
        }

        task = Task.from_v4(v4_dict)
        data = task.model_dump(mode="json")

        # agent_config should preserve system_prompt and restore tool filters
        agent_config = data.get("agent_config")
        assert agent_config is not None
        assert agent_config["system_prompt"] == "Custom system prompt"
        # allowed_tools defaults to ["*"] when not specified (restored during serialization)
        assert agent_config["allowed_tools"] == ["*"]
        # These have default False values from TaskAgentConfig
        assert agent_config["append_setup_output"] is False
        assert agent_config["append_setup_tool"] is False

        # Roundtrip
        task2 = Task(**data)
        assert task2.agent_config is not None
        assert isinstance(task2.agent_config, TaskAgentConfig)
        assert task2.agent_config.system_prompt == "Custom system prompt"
        # Tool filters should be on Environment after roundtrip
        assert task2.env is not None
        assert task2.env._agent_include is None  # ["*"] → None

    def test_v4_preserves_metadata(self) -> None:
        """A v4 Task keeps metadata through a roundtrip."""
        expected_metadata = {"key1": "value1", "key2": 42}
        v4_dict = {
            "prompt": "Test prompt",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
            "metadata": {"key1": "value1", "key2": 42},
        }

        task = Task.from_v4(v4_dict)
        data = task.model_dump(mode="json")

        assert data.get("metadata") == expected_metadata

        # Roundtrip
        task2 = Task(**data)
        assert task2.metadata == expected_metadata
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class TestTaskValidation:
    """Tests for Task validation."""

    def test_v5_allows_none_env(self) -> None:
        """A v5 Task may be built without an env (for blank evals)."""
        task = Task(scenario="test")  # env=None is valid
        assert task.env is None
        assert task.scenario == "test"

    def test_v4_requires_evaluate_tool(self) -> None:
        """``validate_v4_task`` rejects a v4 dict that lacks evaluate_tool."""
        from hud.eval.utils import validate_v4_task

        incomplete_task = {
            "prompt": "test",
            "mcp_config": {"server": {}},
            # Missing evaluate_tool
        }
        with pytest.raises(ValueError, match="evaluate_tool"):
            validate_v4_task(incomplete_task)

    def test_agent_config_accepts_dict(self) -> None:
        """A dict agent_config is converted to ``TaskAgentConfig``."""
        task = Task(
            env={"name": "browser"},
            agent_config={"system_prompt": "Hello"},
        )

        assert isinstance(task.agent_config, TaskAgentConfig)
        assert task.agent_config.system_prompt == "Hello"
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class TestV4AgentConfigToolFilters:
    """Tests for v4 agent_config.allowed_tools and disallowed_tools processing."""

    @staticmethod
    def _v4_task(agent_config: dict[str, list[str]]) -> Task:
        """Build a Task via ``Task.from_v4`` from the shared v4 skeleton.

        Every test in this class uses the same prompt, mcp_config and
        evaluate_tool; only ``agent_config`` varies.
        """
        return Task.from_v4(
            {
                "prompt": "Test prompt",
                "mcp_config": {"server": {"url": "http://localhost"}},
                "evaluate_tool": {"name": "check", "arguments": {}},
                "agent_config": agent_config,
            }
        )

    def test_v4_extracts_allowed_tools(self) -> None:
        """v4 allowed_tools ends up on the Environment's ``_agent_include``."""
        task = self._v4_task({"allowed_tools": ["browser_*", "file_read"]})

        assert task.env is not None
        assert task.env._agent_include == ["browser_*", "file_read"]

    def test_v4_extracts_disallowed_tools(self) -> None:
        """v4 disallowed_tools ends up on the Environment's ``_agent_exclude``."""
        task = self._v4_task(
            {"disallowed_tools": ["*setup*", "*evaluate*", "checkout_branch"]}
        )

        assert task.env is not None
        assert task.env._agent_exclude == ["*setup*", "*evaluate*", "checkout_branch"]

    def test_v4_wildcard_star_allowed_converts_to_none(self) -> None:
        """v4 allowed_tools=['*'] converts to None (meaning include all)."""
        task = self._v4_task({"allowed_tools": ["*"]})

        assert task.env is not None
        # ["*"] should be converted to None
        assert task.env._agent_include is None

    def test_v4_both_allowed_and_disallowed(self) -> None:
        """v4 supports allowed_tools and disallowed_tools together."""
        task = self._v4_task(
            {
                "allowed_tools": ["*"],
                "disallowed_tools": ["*setup*", "*evaluate*"],
            }
        )

        assert task.env is not None
        assert task.env._agent_include is None  # ["*"] → None
        assert task.env._agent_exclude == ["*setup*", "*evaluate*"]

    @pytest.mark.asyncio
    async def test_v4_tool_filters_applied_in_as_tools(self) -> None:
        """v4 tool filters are applied when calling ``env.as_tools()``."""
        task = self._v4_task(
            {
                "allowed_tools": ["*"],
                "disallowed_tools": ["*setup*"],
            }
        )
        env = task.env
        assert env is not None

        # Add local tools to test filtering
        @env.tool()
        def my_setup_tool() -> str:
            """Should be filtered out."""
            return "setup"

        @env.tool()
        def run_query() -> str:
            """Should be visible."""
            return "query"

        await env._build_routing()

        tool_names = [tool.name for tool in env.as_tools()]

        assert "my_setup_tool" not in tool_names
        assert "run_query" in tool_names

    def test_v4_tool_filters_preserved_in_serialization(self) -> None:
        """v4 tool filters are kept when serializing for remote execution."""
        excluded = ["*setup*", "*evaluate*", "*grade*"]
        task = self._v4_task(
            {
                "allowed_tools": ["*"],
                "disallowed_tools": ["*setup*", "*evaluate*", "*grade*"],
            }
        )

        # Serialize (this is what gets sent to remote execution)
        data = task.model_dump(mode="json")

        # agent_config must include the tool filters for remote execution
        assert "agent_config" in data
        assert data["agent_config"]["allowed_tools"] == ["*"]
        assert data["agent_config"]["disallowed_tools"] == excluded

        # Verify roundtrip works (remote worker will deserialize this)
        task2 = Task(**data)
        assert task2.env is not None
        assert task2.env._agent_include is None  # ["*"] → None
        assert task2.env._agent_exclude == excluded
|
hud/eval/types.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Types and exceptions for the eval module.
|
|
2
|
+
|
|
3
|
+
Kept separate to avoid circular imports.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
# =============================================================================
|
|
13
|
+
# Exceptions
|
|
14
|
+
# =============================================================================
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ParallelEvalComplete(Exception):
    """Raised by summary context to skip body re-execution after parallel eval.

    This is caught by the eval() context manager to cleanly exit.
    The summary context with results is still accessible after the with block.
    """
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# =============================================================================
|
|
26
|
+
# Payload Models
|
|
27
|
+
# =============================================================================
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class EvalPayload(BaseModel):
    """Base payload for eval enter/exit.

    Every field is optional and defaults to ``None``.
    """

    prompt: str | None = None
    code_snippet: str | None = None
    job_id: str | None = None
    group_id: str | None = None
    variants: dict[str, Any] | None = None
    task_version_id: str | None = None
    metadata: dict[str, Any] | None = None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class EvalExitPayload(EvalPayload):
    """Exit payload: the ``EvalPayload`` fields plus result fields."""

    reward: float | None = None
    # Defaults to True when not set explicitly.
    success: bool = True
    error_message: str | None = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class JobEnterPayload(BaseModel):
    """Payload for job/{job_id}/enter - sent once at job start."""

    name: str | None = None
    variants: dict[str, Any] | None = None  # Full variant config
    group: int | None = None
    taskset: str | None = None  # taskset slug to associate job with
    tasks: list[dict[str, Any]] | None = None  # task definitions to add to taskset
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Public names exported by this module.
__all__ = [
    "EvalExitPayload",
    "EvalPayload",
    "JobEnterPayload",
    "ParallelEvalComplete",
]
|