hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/eval/parallel.py
ADDED
@@ -0,0 +1,268 @@
```python
"""Parallel execution support for evaluations.

This module provides AST extraction and parallel execution for running
the same eval body N times concurrently.
"""

from __future__ import annotations

import ast
import inspect
import itertools
import linecache
import logging
import textwrap
import uuid
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from types import FrameType

    from hud.eval.context import EvalContext

logger = logging.getLogger(__name__)

# Frames to skip when walking the call stack to find user code
# These are internal implementation details that shouldn't be considered user code
_SKIP_FRAME_PATTERNS = (
    # Python stdlib
    "contextlib.py",
    "asyncio",
    # HUD eval internals (both Unix and Windows paths)
    "hud/eval/mixin.py",
    "hud/eval/manager.py",
    "hud/eval/parallel.py",
    "hud\\eval\\mixin.py",
    "hud\\eval\\manager.py",
    "hud\\eval\\parallel.py",
)

# Frames that should NOT be skipped even if in site-packages
# These contain legitimate async with hud.eval() calls
_ALLOWED_FRAME_PATTERNS = (
    "hud/datasets/runner.py",
    "hud\\datasets\\runner.py",
)


def find_user_frame() -> FrameType:
    """Walk the call stack to find the first user code frame.

    Skips internal frames from contextlib, asyncio, and hud.eval internals.
    Frames in site-packages are skipped UNLESS they match _ALLOWED_FRAME_PATTERNS.

    Returns:
        The frame containing user code (typically the async with statement).

    Raises:
        ASTExtractionError: If no user code frame can be found.
    """
    frame = inspect.currentframe()
    if frame is None:
        raise ASTExtractionError("Cannot get current frame")

    try:
        caller_frame = frame.f_back
        while caller_frame is not None:
            filename = caller_frame.f_code.co_filename

            # Check if this is an explicitly allowed frame (e.g., hud/datasets/runner.py)
            if any(pattern in filename for pattern in _ALLOWED_FRAME_PATTERNS):
                return caller_frame

            # Skip internal frames, but also skip site-packages unless allowed above
            is_internal = any(pattern in filename for pattern in _SKIP_FRAME_PATTERNS)
            is_site_packages = "site-packages" in filename

            if not is_internal and not is_site_packages:
                return caller_frame

            caller_frame = caller_frame.f_back

        raise ASTExtractionError("Cannot find user code frame in call stack")
    finally:
        del frame


def expand_variants(
    variants: dict[str, Any] | None,
) -> list[dict[str, Any]]:
    """Expand variants dict into all combinations.

    Args:
        variants: Dict where values can be:
            - Single value: {"model": "gpt-4o"} → fixed
            - List: {"model": ["gpt-4o", "claude"]} → expand

    Returns:
        List of variant assignments, one per combination.

    Examples:
        >>> expand_variants(None)
        [{}]
        >>> expand_variants({"model": "gpt-4o"})
        [{"model": "gpt-4o"}]
        >>> expand_variants({"model": ["gpt-4o", "claude"]})
        [{"model": "gpt-4o"}, {"model": "claude"}]
    """
    if not variants:
        return [{}]

    expanded: dict[str, list[Any]] = {}
    for key, value in variants.items():
        if isinstance(value, list):
            expanded[key] = value
        else:
            expanded[key] = [value]

    keys = list(expanded.keys())
    value_lists = [expanded[k] for k in keys]

    return [dict(zip(keys, combo, strict=True)) for combo in itertools.product(*value_lists)]


def resolve_group_ids(
    group_ids: list[str] | None,
    total_count: int,
) -> list[str]:
    """Resolve group IDs for parallel execution.

    Args:
        group_ids: Optional list of group IDs (must match total_count if provided)
        total_count: Total number of evals

    Returns:
        List of group IDs (one per eval)

    Raises:
        ValueError: If group_ids length doesn't match total_count
    """
    if group_ids:
        if len(group_ids) != total_count:
            raise ValueError(
                f"group_ids length ({len(group_ids)}) must match total evals ({total_count})"
            )
        return group_ids
    else:
        shared_group_id = str(uuid.uuid4())
        return [shared_group_id] * total_count


def log_eval_stats(completed: list[EvalContext], context: str = "") -> None:
    """Log statistics for completed evaluations.

    Args:
        completed: List of completed EvalContext objects
        context: Optional context string for the log message
    """
    rewards = [ctx.reward for ctx in completed if ctx.reward is not None]
    mean_reward = sum(rewards) / len(rewards) if rewards else 0.0
    success_count = sum(1 for ctx in completed if ctx.success)

    logger.info(
        "Evals complete%s: %d/%d succeeded, mean_reward=%.3f",
        f" ({context})" if context else "",
        success_count,
        len(completed),
        mean_reward,
    )


class ASTExtractionError(Exception):
    """Error extracting AST from source."""


def get_with_block_body(frame: Any) -> tuple[str, dict[str, Any], str]:
    """Extract the body of a with-block from the calling frame.

    Args:
        frame: The calling frame (from inspect.currentframe())

    Returns:
        Tuple of (body_source, captured_locals, context_var_name)
    """
    filename = frame.f_code.co_filename
    lineno = frame.f_lineno

    # Check for interactive session
    if filename.startswith("<") or filename in ("<stdin>", "<string>"):
        raise ASTExtractionError("Cannot extract source from interactive session. Use a .py file.")

    # Read and parse source
    lines = linecache.getlines(filename)
    if not lines:
        with open(filename, encoding="utf-8") as f:
            lines = f.readlines()

    source = "".join(lines)
    tree = ast.parse(source, filename=filename)

    # Find the async with containing this line
    with_node = _find_async_with(tree, lineno)
    if with_node is None:
        raise ASTExtractionError(f"Cannot find 'async with' statement at line {lineno}")

    # Extract body source
    body_source = _extract_body(lines, with_node)

    # Extract the context variable name from 'as' clause
    context_var = _extract_context_var(with_node)

    # Capture both globals (imports) and locals (variables in scope)
    captured = {**frame.f_globals, **frame.f_locals}

    return body_source, captured, context_var


def _extract_context_var(with_node: ast.AsyncWith) -> str:
    """Extract the variable name from the 'as' clause of an async with statement."""
    if not with_node.items or not with_node.items[0].optional_vars:
        raise ASTExtractionError("async with statement must use 'as' clause for parallel execution")

    var_node = with_node.items[0].optional_vars
    if not isinstance(var_node, ast.Name):
        raise ASTExtractionError("async with 'as' clause must be a simple variable name")

    return var_node.id


def _find_async_with(tree: ast.AST, target_line: int) -> ast.AsyncWith | None:
    """Find AsyncWith node containing the target line."""
    for node in ast.walk(tree):
        if isinstance(node, ast.AsyncWith):
            end_line = _get_end_line(node)
            if node.lineno <= target_line <= end_line:
                return node
    return None


def _get_end_line(node: ast.AST) -> int:
    """Get the last line number of an AST node."""
    end = getattr(node, "end_lineno", getattr(node, "lineno", 0))
    for child in ast.walk(node):
        child_end = getattr(child, "end_lineno", 0)
        if child_end > end:
            end = child_end
    return end


def _extract_body(lines: list[str], with_node: ast.AsyncWith) -> str:
    """Extract the body source from an AsyncWith node."""
    if not with_node.body:
        return "pass"

    start = with_node.body[0].lineno - 1
    end = _get_end_line(with_node.body[-1])

    body = "".join(lines[start:end])
    return textwrap.dedent(body)


__all__ = [
    "ASTExtractionError",
    "expand_variants",
    "find_user_frame",
    "get_with_block_body",
    "log_eval_stats",
    "resolve_group_ids",
]
```
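Taken together, these helpers implement the fan-out bookkeeping for `hud.eval`: variant dicts are expanded into a Cartesian product, and each resulting eval is assigned a group ID. A minimal sketch of that behavior, using only the functions defined in the file above (the surrounding eval runtime is not reproduced here):

```python
from hud.eval.parallel import expand_variants, resolve_group_ids

# Cartesian expansion: 2 models x 2 temperatures -> 4 variant assignments.
variants = expand_variants({"model": ["gpt-4o", "claude"], "temperature": [0.0, 1.0]})
assert len(variants) == 4
assert variants[0] == {"model": "gpt-4o", "temperature": 0.0}

# With no explicit IDs, every eval shares one freshly minted group ID...
group_ids = resolve_group_ids(None, total_count=len(variants))
assert len(set(group_ids)) == 1

# ...and an explicit list must match the eval count exactly.
try:
    resolve_group_ids(["a", "b"], total_count=4)
except ValueError as e:
    print(e)  # group_ids length (2) must match total evals (4)
```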
hud/eval/task.py
ADDED
@@ -0,0 +1,340 @@
````python
"""Task - A runnable evaluation unit (Pydantic model).

A Task holds the configuration needed to run an evaluation:
- Environment configuration (how to create/connect)
- Optional scenario name and args

When entered as a context manager, it creates an EvalContext.

Usage:
    env = Environment("my-env").connect_hub("browser")

    # Empty - just env
    async with env() as ctx:
        await ctx.call_tool("navigate", url="...")

    # With scenario
    async with env("checkout", user_id="alice") as ctx:
        await agent.run(ctx.prompt)

    # Orchestrated via hud.eval
    tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
    async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
        ...
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any

from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    field_serializer,
    field_validator,
    model_serializer,
    model_validator,
)

from hud.types import MCPToolCall

if TYPE_CHECKING:
    from hud.environment import Environment
    from hud.environment.types import EnvConfig

__all__ = ["Task", "TaskAgentConfig", "build_eval_name"]

logger = logging.getLogger(__name__)


class TaskAgentConfig(BaseModel):
    """Agent configuration for a Task.

    Contains settings that should be passed to the agent when running this task.
    """

    model_config = ConfigDict(extra="ignore")

    system_prompt: str | None = Field(
        default=None,
        description="Custom system prompt to pass to the agent",
    )

    @model_validator(mode="before")
    @classmethod
    def warn_extra_fields(cls, data: Any) -> Any:
        """Warn about extra fields that will be ignored."""
        if isinstance(data, dict):
            known_fields = {"system_prompt"}
            extra = set(data.keys()) - known_fields
            if extra:
                logger.warning(
                    "Deprecated or unknown fields in agent_config will be ignored: %s",
                    ", ".join(sorted(extra)),
                )
        return data


def build_eval_name(scenario: str | None, args: dict[str, Any] | None) -> str:
    """Build descriptive name: 'scenario with val1, val2, ...'"""
    if not scenario:
        return "eval"
    if not args:
        return scenario

    val_parts = []
    for v in list(args.values())[:3]:  # Max 3 values
        v_str = repr(v) if isinstance(v, str) else str(v)
        if len(v_str) > 25:
            v_str = v_str[:22] + "..."
        val_parts.append(v_str)

    if val_parts:
        return f"{scenario} with {', '.join(val_parts)}"
    return scenario


class Task(BaseModel):
    """A runnable evaluation unit (Pydantic model).

    Simplified v5 Task format:
    - env: Environment instance OR EnvConfig with hub name + filters
    - scenario: Scenario name to run
    - args: Scenario arguments
    - validation: Optional list of tool calls representing successful completion

    When entered as a context manager, creates an EvalContext.

    Attributes:
        id: Optional task identifier for filtering/tracking
        env: Environment instance (auto-created from dict/EnvConfig via validator)
        scenario: Scenario name to run (from @env.scenario)
        args: Scenario arguments
        validation: Optional list of MCPToolCall objects representing successful completion

    Example (v5 format):
        ```python
        from hud.eval import Task

        # Pass dict - auto-converts to Environment
        task = Task(
            env={"name": "browser", "include": ["navigate", "screenshot"]},
            scenario="checkout",
            args={"user_id": "alice"},
            validation=[{"name": "check_cart", "arguments": {}}],
        )
        # task.env is now Environment connected to browser hub!

        # Or pass live Environment directly
        env = Environment("my-env").connect_hub("browser")
        task = Task(env=env, scenario="checkout", args={"user_id": "alice"})
        ```

    Migration from v4:
        Use Task.from_v4() to convert LegacyTask objects:

        ```python
        task = Task.from_v4(legacy_task)
        # or
        task = Task.from_v4({"prompt": "...", "mcp_config": {...}, ...})
        ```
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Fields - env accepts Environment | EnvConfig | dict, auto-converts to Environment
    env: Any = Field(default=None)  # Typed as Any for input flexibility, validated below
    scenario: str | None = None
    id: str | None = None
    args: dict[str, Any] = Field(default_factory=dict)
    validation: list[MCPToolCall] | None = None

    # Agent config - settings passed to agent (system_prompt, etc.)
    # Accepts TaskAgentConfig or dict (auto-converted via validator)
    agent_config: TaskAgentConfig | dict[str, Any] | None = None

    # Task metadata - for tracking/filtering, not used by agent
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("agent_config", mode="before")
    @classmethod
    def convert_agent_config(
        cls, v: TaskAgentConfig | dict[str, Any] | None
    ) -> TaskAgentConfig | None:
        """Auto-convert dict to TaskAgentConfig."""
        if v is None:
            return None
        if isinstance(v, TaskAgentConfig):
            return v
        if isinstance(v, dict):
            return TaskAgentConfig(**v)
        raise TypeError(
            f"Task.agent_config must be TaskAgentConfig or dict. Got {type(v).__name__}"
        )

    @model_validator(mode="before")
    @classmethod
    def detect_v4_format(cls, data: Any) -> Any:
        """Auto-detect v4 LegacyTask format and convert to v5 Task format.

        If the input dict is a valid v4 format (has prompt, mcp_config, evaluate_tool),
        it's converted using build_env_from_v4().

        This allows Task(**v4_dict) to work seamlessly.
        """
        from hud.eval.utils import build_env_from_v4, is_v4_format, validate_v4_task

        if not isinstance(data, dict):
            return data

        if is_v4_format(data):
            # Validate completeness before conversion
            validate_v4_task(data)
            # build_env_from_v4 returns a dict with all Task fields
            return build_env_from_v4(data)

        return data

    @field_validator("env", mode="before")
    @classmethod
    def convert_env(cls, v: Environment | EnvConfig | dict[str, Any] | None) -> Environment | None:
        """Auto-convert dict/EnvConfig to Environment.

        Format: {"name": "browser", "include": [...], "exclude": [...]}
        """
        from hud.environment import Environment
        from hud.environment.types import EnvConfig

        if v is None:
            return None
        if isinstance(v, Environment):
            return v
        if isinstance(v, dict):
            try:
                config = EnvConfig(**v)
            except Exception as e:
                raise ValueError(
                    f"Invalid env config: {e}. Expected fields: name (str), "
                    f"include (list[str] | None), exclude (list[str] | None)"
                ) from e
            env = Environment(config.name)
            env.connect_hub(config.name, include=config.include, exclude=config.exclude)
            return env
        if isinstance(v, EnvConfig):
            env = Environment(v.name)
            env.connect_hub(v.name, include=v.include, exclude=v.exclude)
            return env
        raise TypeError(f"Task.env must be Environment, EnvConfig, or dict. Got {type(v).__name__}")

    @field_validator("validation", mode="before")
    @classmethod
    def convert_validation(
        cls, v: list[MCPToolCall | dict[str, Any]] | None
    ) -> list[MCPToolCall] | None:
        """Auto-convert validation dicts to MCPToolCall objects."""
        if v is None:
            return None
        if not isinstance(v, list):
            raise TypeError(f"validation must be a list, got {type(v).__name__}")

        converted = []
        for item in v:
            if isinstance(item, dict):
                converted.append(MCPToolCall(**item))
            elif isinstance(item, MCPToolCall):
                converted.append(item)
            else:
                raise TypeError(
                    f"validation items must be dict or MCPToolCall, got {type(item).__name__}"
                )
        return converted

    @field_serializer("env")
    def serialize_env(self, env: Environment | None) -> dict[str, Any] | None:
        """Serialize Environment to config dict via to_config()."""
        if env is None:
            return None
        return env.to_config()

    @model_serializer(mode="wrap")
    def _serialize_task(
        self,
        handler: Any,  # SerializerFunctionWrapHandler
    ) -> dict[str, Any]:
        """Custom serializer for v4 format flattening.

        For v5 tasks: uses default serialization (env field handled by field_serializer)
        For v4 tasks: flattens {"prompt": ..., "mcp_config": ..., "evaluate_tool": ...}
        """
        # Get default serialization (env is already converted by field_serializer)
        data = handler(self)

        # Check if this is a v4 task (env config has mcp_config)
        env_config = data.get("env")
        if env_config and isinstance(env_config, dict) and "mcp_config" in env_config:
            # v4 format - flatten into top-level dict
            result = env_config.copy()

            # Map validation → integration_test_tool
            if self.validation:
                result["integration_test_tool"] = [
                    {"name": v.name, "arguments": v.arguments or {}} for v in self.validation
                ]

            # Preserve agent_config
            if data.get("agent_config"):
                result["agent_config"] = data["agent_config"]

            # Preserve metadata
            if data.get("metadata"):
                result["metadata"] = data["metadata"]

            # Preserve id
            if data.get("id"):
                result["id"] = data["id"]

            return result

        return data

    @classmethod
    def from_v4(cls, source: Any) -> Task:
        """Convert v4 LegacyTask format to v5 Task.

        This is a convenience wrapper. You can also use Task(**dict) directly
        since the model validator auto-detects v4 format.

        Args:
            source: LegacyTask, dict, or JSON string with v4 fields

        Returns:
            Task configured for v4 behavior
        """
        import json as json_module

        # JSON string → dict
        if isinstance(source, str):
            source = json_module.loads(source)

        # LegacyTask → dict (import only when needed)
        if hasattr(source, "model_dump"):
            source = source.model_dump()

        # Model validator handles v4 detection and conversion
        return cls(**source)

    def copy(self) -> Task:
        """Create a copy of this Task config.

        Note: env is shared (not deep copied) since Environment instances
        should be reused. Args and validation are deep copied.
        """
        return Task(
            id=self.id,
            env=self.env,  # Share reference
            scenario=self.scenario,
            args=self.args.copy() if self.args else {},
            validation=self.validation.copy() if self.validation else None,
        )
````
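The validators above mean a plain dict round-trips into a fully typed Task. A minimal sketch of that flow, using the field names from the class definition (illustrative only; actually running it needs an installed `hud` package and hub access, since the env validator calls `connect_hub`):

```python
from hud.eval.task import Task, build_eval_name

# dict env -> Environment via convert_env; dict validation items -> MCPToolCall.
task = Task(
    env={"name": "browser", "include": ["navigate", "screenshot"]},
    scenario="checkout",
    args={"user_id": "alice"},
    validation=[{"name": "check_cart", "arguments": {}}],
)

# Descriptive display name built from the scenario and first few arg values.
print(build_eval_name(task.scenario, task.args))  # checkout with 'alice'

# model_dump() serializes env back to a config dict via Environment.to_config();
# a v4-style env (one carrying mcp_config) would instead be flattened to the top level.
data = task.model_dump()
```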
hud/eval/tests/__init__.py
ADDED
@@ -0,0 +1 @@
```python
"""Tests for hud.eval module."""
```
|