hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Tests for hud.eval.parallel module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from hud.eval.parallel import (
|
|
10
|
+
ASTExtractionError,
|
|
11
|
+
_extract_body,
|
|
12
|
+
_find_async_with,
|
|
13
|
+
_get_end_line,
|
|
14
|
+
expand_variants,
|
|
15
|
+
resolve_group_ids,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TestExpandVariants:
    """Behavioural checks for the expand_variants helper."""

    def test_none_returns_empty_dict(self) -> None:
        """Passing None yields a list holding one empty variant."""
        assert expand_variants(None) == [{}]

    def test_empty_dict_returns_empty_dict(self) -> None:
        """An empty mapping yields a list holding one empty variant."""
        assert expand_variants({}) == [{}]

    def test_single_value_stays_single(self) -> None:
        """A scalar value produces exactly one variant."""
        expanded = expand_variants({"model": "gpt-4o"})
        assert expanded == [{"model": "gpt-4o"}]

    def test_list_expands_to_variants(self) -> None:
        """Each element of a list value becomes its own variant."""
        expanded = expand_variants({"model": ["gpt-4o", "claude"]})
        assert expanded == [{"model": "gpt-4o"}, {"model": "claude"}]

    def test_multiple_lists_create_combinations(self) -> None:
        """Two list values expand to their full cross product."""
        grid = {
            "model": ["a", "b"],
            "temp": [0.0, 1.0],
        }
        combos = expand_variants(grid)

        assert len(combos) == 4
        # Membership is checked in the order a/0.0, a/1.0, b/0.0, b/1.0.
        for model in ("a", "b"):
            for temp in (0.0, 1.0):
                assert {"model": model, "temp": temp} in combos

    def test_mixed_single_and_list(self) -> None:
        """A scalar value is shared by every variant of a list value."""
        grid = {
            "model": ["gpt-4o", "claude"],
            "temp": 0.7,
        }
        combos = expand_variants(grid)

        assert len(combos) == 2
        for model in ("gpt-4o", "claude"):
            assert {"model": model, "temp": 0.7} in combos
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class TestResolveGroupIds:
    """Behavioural checks for the resolve_group_ids helper."""

    def test_uses_provided_group_ids(self) -> None:
        """Explicit group ids are returned unchanged."""
        assert resolve_group_ids(["a", "b", "c"], 3) == ["a", "b", "c"]

    def test_generates_shared_group_id(self) -> None:
        """Without explicit ids, one generated id is repeated for every slot."""
        group_ids = resolve_group_ids(None, 3)
        assert len(group_ids) == 3

        # Every slot carries the same id ...
        shared = group_ids[0]
        assert shared == group_ids[1] == group_ids[2]
        # ... and that id has the 36-character length of a UUID string.
        assert len(shared) == 36

    def test_raises_on_length_mismatch(self) -> None:
        """A group_ids list of the wrong length raises ValueError."""
        too_short = ["a", "b"]
        with pytest.raises(ValueError, match="group_ids length"):
            resolve_group_ids(too_short, 3)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class TestASTHelpers:
    """Tests for AST helper functions.

    NOTE(review): the whitespace inside the multi-line ``source`` literals
    below was reconstructed (top-level statements at column 0, four spaces
    per nesting level) - confirm against the released file.
    """

    def test_find_async_with_finds_correct_node(self) -> None:
        """_find_async_with finds the async with containing target line."""
        # The literal opens with a newline, so line 1 is blank and
        # ``async def main()`` sits on line 2.
        source = """
async def main():
    x = 1
    async with something as ctx:
        do_stuff()
        more_stuff()
    y = 2
"""
        tree = ast.parse(source)

        # Line 5 is inside the async with
        node = _find_async_with(tree, 5)
        assert node is not None
        assert isinstance(node, ast.AsyncWith)

    def test_find_async_with_returns_none_when_not_found(self) -> None:
        """_find_async_with returns None when line is outside async with."""
        # Same layout as above: line 1 is the blank line after the opening quotes.
        source = """
async def main():
    x = 1
    async with something as ctx:
        do_stuff()
    y = 2
"""
        tree = ast.parse(source)

        # Line 7 is outside the async with
        node = _find_async_with(tree, 7)
        assert node is None

    def test_get_end_line(self) -> None:
        """_get_end_line returns last line of node."""
        # A module-level ``async with`` is enough here: the source is only
        # parsed with ast.parse, never compiled or executed.
        source = """
async with ctx:
    line1()
    line2()
    line3()
"""
        tree = ast.parse(source)
        async_with = tree.body[0]

        end_line = _get_end_line(async_with)
        # Only a lower bound is asserted, not the exact end line.
        assert end_line >= 4  # At least through line 4

    def test_extract_body(self) -> None:
        """_extract_body extracts the body source from async with."""
        source = """async with ctx:
    do_thing()
    more_thing()
"""
        # Rebuild a list of newline-terminated lines to hand to _extract_body.
        lines = source.split("\n")
        lines = [line + "\n" for line in lines]

        tree = ast.parse(source)
        async_with = tree.body[0]
        assert isinstance(async_with, ast.AsyncWith)

        body = _extract_body(lines, async_with)
        # Both body statements must appear in the extracted text.
        assert "do_thing()" in body
        assert "more_thing()" in body
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class TestASTExtractionError:
    """Checks on the ASTExtractionError exception type."""

    def test_is_exception(self) -> None:
        """ASTExtractionError is an Exception whose str() is its message."""
        message = "test message"
        raised = ASTExtractionError(message)

        assert isinstance(raised, Exception)
        assert str(raised) == message
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Tests for hud.eval.task module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from hud.eval.task import Task, TaskAgentConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestTaskSerialization:
    """Serialization round-trip checks for Task in v5 and v4 shapes."""

    def test_v5_task_roundtrip(self) -> None:
        """A v5 Task dumps to JSON-mode data and rebuilds to the same dump."""
        original = Task(
            env={"name": "browser", "include": ["navigate", "click"]},
            scenario="checkout",
            id="task-1",
            args={"user_id": "alice"},
        )

        first_dump = original.model_dump(mode="json")

        # The dump keeps the nested v5 layout.
        assert "env" in first_dump
        assert first_dump["env"]["name"] == "browser"
        assert first_dump["scenario"] == "checkout"
        assert first_dump["id"] == "task-1"

        # Rebuilding from the dump and dumping again must change nothing.
        rebuilt = Task(**first_dump)
        second_dump = rebuilt.model_dump(mode="json")
        assert first_dump == second_dump

    def test_v4_task_roundtrip(self) -> None:
        """A v4 Task dumps to the flat v4 layout and survives a round trip."""
        legacy_fields = {
            "prompt": "Go to google.com and search for cats",
            "mcp_config": {
                "browser": {"url": "http://localhost:8080"},
            },
            "evaluate_tool": {"name": "check_url", "arguments": {"contains": "google"}},
            "setup_tool": {"name": "navigate", "arguments": {"url": "about:blank"}},
            "id": "v4-task-1",
            "agent_config": {"system_prompt": "You are a helpful assistant"},
            "metadata": {"category": "navigation"},
        }

        legacy_task = Task.from_v4(legacy_fields)
        first_dump = legacy_task.model_dump(mode="json")

        # The dump is flat (v4 style) rather than nested under env.
        for key in ("prompt", "mcp_config", "evaluate_tool"):
            assert key in first_dump
        assert first_dump["prompt"] == "Go to google.com and search for cats"
        assert first_dump["id"] == "v4-task-1"

        # Rebuilding from the dump and dumping again must change nothing.
        rebuilt = Task(**first_dump)
        second_dump = rebuilt.model_dump(mode="json")
        assert first_dump == second_dump

    def test_v4_preserves_agent_config(self) -> None:
        """agent_config from a v4 dict survives dump and rebuild."""
        legacy_fields = {
            "prompt": "Test prompt",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
            "agent_config": {"system_prompt": "Custom system prompt"},
        }

        dumped = Task.from_v4(legacy_fields).model_dump(mode="json")
        assert dumped.get("agent_config") == {"system_prompt": "Custom system prompt"}

        # After the round trip the config is a typed model again.
        rebuilt = Task(**dumped)
        assert rebuilt.agent_config is not None
        assert isinstance(rebuilt.agent_config, TaskAgentConfig)
        assert rebuilt.agent_config.system_prompt == "Custom system prompt"

    def test_v4_preserves_metadata(self) -> None:
        """metadata from a v4 dict survives dump and rebuild."""
        legacy_fields = {
            "prompt": "Test prompt",
            "mcp_config": {"server": {"url": "http://localhost"}},
            "evaluate_tool": {"name": "check", "arguments": {}},
            "metadata": {"key1": "value1", "key2": 42},
        }

        dumped = Task.from_v4(legacy_fields).model_dump(mode="json")
        assert dumped.get("metadata") == {"key1": "value1", "key2": 42}

        rebuilt = Task(**dumped)
        assert rebuilt.metadata == {"key1": "value1", "key2": 42}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class TestTaskValidation:
    """Validation checks for Task and the v4 task validator."""

    def test_v5_allows_none_env(self) -> None:
        """A v5 Task may be built without an env (blank evals)."""
        blank = Task(scenario="test")  # env deliberately omitted

        assert blank.env is None
        assert blank.scenario == "test"

    def test_v4_requires_evaluate_tool(self) -> None:
        """validate_v4_task rejects a v4 dict that lacks evaluate_tool."""
        from hud.eval.utils import validate_v4_task

        incomplete = {
            "prompt": "test",
            "mcp_config": {"server": {}},
            # evaluate_tool intentionally left out
        }
        with pytest.raises(ValueError, match="evaluate_tool"):
            validate_v4_task(incomplete)

    def test_agent_config_accepts_dict(self) -> None:
        """A plain dict passed as agent_config is coerced to TaskAgentConfig."""
        built = Task(
            env={"name": "browser"},
            agent_config={"system_prompt": "Hello"},
        )
        config = built.agent_config

        assert isinstance(config, TaskAgentConfig)
        assert config.system_prompt == "Hello"
|
hud/eval/types.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Types and exceptions for the eval module.
|
|
2
|
+
|
|
3
|
+
Kept separate to avoid circular imports.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
|
|
12
|
+
# =============================================================================
|
|
13
|
+
# Exceptions
|
|
14
|
+
# =============================================================================
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ParallelEvalComplete(Exception):
    """Raised by summary context to skip body re-execution after parallel eval.

    This is caught by the eval() context manager to cleanly exit.
    The summary context with results is still accessible after the with block.

    The class defines no attributes or methods of its own; it is a plain
    ``Exception`` subclass that serves only as a signal.
    """
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# =============================================================================
|
|
26
|
+
# Payload Models
|
|
27
|
+
# =============================================================================
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class EvalPayload(BaseModel):
    """Base payload for eval enter/exit.

    Every field is optional and defaults to ``None``, so a payload can be
    built from whichever values are available.
    """

    # Free-form text describing the eval.
    prompt: str | None = None
    code_snippet: str | None = None
    # Identifiers - presumably link this eval to its job and group
    # (TODO confirm against the code that builds the payload).
    job_id: str | None = None
    group_id: str | None = None
    # Arbitrary string-keyed mapping of variant settings.
    variants: dict[str, Any] | None = None
    task_version_id: str | None = None
    # Arbitrary string-keyed extra data.
    metadata: dict[str, Any] | None = None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class EvalExitPayload(EvalPayload):
    """Exit payload with result fields.

    Inherits every field of ``EvalPayload`` and adds the three below.
    """

    reward: float | None = None
    # Unlike the other fields, this one defaults to True rather than None.
    success: bool = True
    error_message: str | None = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class JobEnterPayload(BaseModel):
    """Payload for job/{job_id}/enter - sent once at job start.

    All fields are optional and default to ``None``.
    """

    name: str | None = None
    variants: dict[str, Any] | None = None  # Full variant config
    # NOTE(review): presumably the number of runs per group - confirm with the sender.
    group: int | None = None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
__all__ = [
|
|
59
|
+
"EvalExitPayload",
|
|
60
|
+
"EvalPayload",
|
|
61
|
+
"JobEnterPayload",
|
|
62
|
+
"ParallelEvalComplete",
|
|
63
|
+
]
|
hud/eval/utils.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Utility functions for the eval module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import warnings
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
__all__ = ["build_env_from_v4", "is_v4_format", "validate_v4_task"]
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def is_v4_format(data: dict[str, Any]) -> bool:
    """Report whether ``data`` has the shape of a v4 LegacyTask dict.

    This is a cheap detection helper for branching logic. It only looks
    for truthy ``prompt`` and ``mcp_config`` entries and makes no claim
    that the task is complete.

    Args:
        data: Candidate object to inspect.

    Returns:
        True when ``data`` is a dict whose ``prompt`` and ``mcp_config``
        values are both truthy; False otherwise, including for non-dicts.
    """
    if isinstance(data, dict):
        # all() stops at the first falsy value, so mcp_config is only
        # looked up when prompt is present and truthy.
        return all(data.get(key) for key in ("prompt", "mcp_config"))
    return False
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def validate_v4_task(data: dict[str, Any]) -> None:
    """Ensure a v4 task dict carries every required field.

    The required fields are ``prompt`` (the task instruction),
    ``mcp_config`` (MCP server configuration) and ``evaluate_tool``
    (how success is evaluated). A field counts as missing when it is
    absent or its value is falsy.

    Intended to follow is_v4_format() when completeness matters.

    Args:
        data: Dict to validate.

    Raises:
        ValueError: If one or more required fields are missing. The message
            lists them in the order prompt, mcp_config, evaluate_tool.
    """
    required_fields = ("prompt", "mcp_config", "evaluate_tool")
    absent = [field for field in required_fields if not data.get(field)]

    if absent:
        raise ValueError(f"v4 task missing required fields: {', '.join(absent)}")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def build_env_from_v4(source: dict[str, Any] | Any) -> dict[str, Any]:
    """Translate a v4 LegacyTask into keyword fields for the Task constructor.

    An Environment is created and configured from the legacy task's
    mcp_config, prompt, tool filters and setup/evaluate tool calls, and is
    returned inside a dict alongside the remaining Task fields.

    Args:
        source: A dict of v4 fields (prompt, mcp_config, etc.) or a
            LegacyTask instance.

    Returns:
        Dict that always has ``env``, ``id``, ``scenario`` (None) and
        ``args`` (empty dict). ``validation``, ``agent_config`` and
        ``metadata`` are added only when the legacy task supplies them.

    Raises:
        TypeError: If source is neither a dict nor a LegacyTask.
    """
    from hud.environment import Environment
    from hud.types import LegacyTask, MCPToolCall

    def _listify(value: Any) -> list[Any]:
        """Return lists as-is and wrap any other value in a one-item list."""
        return value if isinstance(value, list) else [value]

    if not isinstance(source, (dict, LegacyTask)):
        raise TypeError(f"Expected dict or LegacyTask, got {type(source).__name__}")

    if isinstance(source, dict):
        # DeprecationWarnings are silenced while the legacy model is built.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            legacy = LegacyTask(**source)
    else:
        legacy = source

    # Local (command-without-url) MCP servers get a warning up front.
    _warn_local_mcp(legacy.mcp_config)

    # v4 keeps the agent-level tool filters on agent_config; they are not
    # connection-level filters.
    allowed: list[str] | None = None
    blocked: list[str] | None = None
    if legacy.agent_config:
        allowed = legacy.agent_config.allowed_tools
        blocked = legacy.agent_config.disallowed_tools
    if allowed == ["*"]:
        # A lone "*" wildcard means "include everything", expressed as None.
        allowed = None

    # Configuration is only stored on the Environment here.
    env = Environment(legacy.id or "v4-legacy")
    env.connect_mcp_config(legacy.mcp_config)

    # Agent-level filters live on the Environment (applied in as_tools()),
    # which lets it call setup/evaluate while hiding them from the agent.
    env._agent_include = allowed
    env._agent_exclude = blocked

    env.prompt = legacy.prompt

    # Register setup calls first, then evaluate calls (stored, not executed).
    if legacy.setup_tool:
        for call in _listify(legacy.setup_tool):
            env.setup_tool(call.name, **(call.arguments or {}))

    if legacy.evaluate_tool:
        for call in _listify(legacy.evaluate_tool):
            env.evaluate_tool(call.name, **(call.arguments or {}))

    task_fields: dict[str, Any] = {
        "env": env,
        "id": legacy.id,
        "scenario": None,  # v4 uses prompt, not scenarios
        "args": {},
    }

    # integration_test_tool maps onto validation: tool calls to verify.
    if legacy.integration_test_tool:
        task_fields["validation"] = [
            call if isinstance(call, MCPToolCall) else MCPToolCall(**call.model_dump())
            for call in _listify(legacy.integration_test_tool)
        ]

    # Only system_prompt is carried over from agent_config for now.
    if legacy.agent_config and legacy.agent_config.system_prompt:
        task_fields["agent_config"] = {"system_prompt": legacy.agent_config.system_prompt}

    if legacy.metadata:
        task_fields["metadata"] = legacy.metadata

    return task_fields
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _warn_local_mcp(mcp_config: dict[str, Any] | None) -> None:
|
|
163
|
+
"""Warn if mcp_config uses local MCP servers (command without url).
|
|
164
|
+
|
|
165
|
+
Local MCP servers can cause port conflicts when running tasks concurrently.
|
|
166
|
+
"""
|
|
167
|
+
if not mcp_config:
|
|
168
|
+
return
|
|
169
|
+
|
|
170
|
+
has_local = any(
|
|
171
|
+
isinstance(server_cfg, dict) and "command" in server_cfg and not server_cfg.get("url")
|
|
172
|
+
for server_cfg in mcp_config.values()
|
|
173
|
+
if isinstance(server_cfg, dict)
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
if has_local:
|
|
177
|
+
warnings.warn(
|
|
178
|
+
"Task uses local MCP configuration (command without url). "
|
|
179
|
+
"This may cause port conflicts when running tasks concurrently. "
|
|
180
|
+
"Consider using remote MCP servers for parallel execution.",
|
|
181
|
+
UserWarning,
|
|
182
|
+
stacklevel=4,
|
|
183
|
+
)
|
hud/patches/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HUD runtime patches for third-party libraries.
|
|
3
|
+
|
|
4
|
+
This module applies monkey-patches to fix issues in dependencies
|
|
5
|
+
without requiring forked packages.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from hud.patches.mcp_patches import apply_all_patches, suppress_fastmcp_logging
|
|
9
|
+
from hud.patches.warnings import apply_default_warning_filters, suppress_mcp_use_import_warnings
|
|
10
|
+
|
|
11
|
+
# Apply patches on import
|
|
12
|
+
apply_all_patches()
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"apply_all_patches",
|
|
16
|
+
"apply_default_warning_filters",
|
|
17
|
+
"suppress_fastmcp_logging",
|
|
18
|
+
"suppress_mcp_use_import_warnings",
|
|
19
|
+
]
|