hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/eval/manager.py
ADDED
@@ -0,0 +1,533 @@
"""Standalone eval() context manager.

Provides hud.eval() for task-based evaluation without needing an existing environment.
"""

from __future__ import annotations

import inspect
import logging
import uuid
from contextlib import asynccontextmanager
from typing import TYPE_CHECKING, Any

from hud.eval.display import print_complete, print_eval_stats, print_link
from hud.eval.parallel import (
    ASTExtractionError,
    expand_variants,
    find_user_frame,
    get_with_block_body,
    resolve_group_ids,
)
from hud.eval.types import ParallelEvalComplete

if TYPE_CHECKING:
    from collections.abc import AsyncGenerator

    from hud.eval.context import EvalContext
    from hud.eval.task import Task

logger = logging.getLogger(__name__)


def _get_eval_name(tasks: list[Task] | None = None) -> str:
    """Extract a nice name for job display.

    Args:
        tasks: List of Task objects

    Returns:
        Name like "scenario with val1, val2" or "eval" if no tasks
    """
    from hud.eval.task import build_eval_name

    # If we have Task objects, derive name from first one
    if tasks:
        if tasks[0].scenario:
            return build_eval_name(tasks[0].scenario, tasks[0].args)
        # Fall back to env name or prompt
        if tasks[0].env and hasattr(tasks[0].env, "name"):
            return tasks[0].env.name
        if tasks[0].env and hasattr(tasks[0].env, "prompt") and tasks[0].env.prompt:
            return tasks[0].env.prompt[:30].strip()
        if tasks[0].id:
            return tasks[0].id

    return "eval"


async def _send_job_enter(
    job_id: str,
    name: str,
    variants: dict[str, Any] | None,
    group: int,
    api_key: str | None,
    taskset: str | None = None,
    tasks: list[dict[str, Any]] | None = None,
) -> list[str] | None:
    """Send job enter payload (async request before traces start)."""
    import httpx

    from hud.eval.types import JobEnterPayload
    from hud.settings import settings

    api_key = api_key or settings.api_key
    if not settings.telemetry_enabled or not api_key:
        return None

    payload = JobEnterPayload(
        name=name,
        variants=variants,
        group=group,
        taskset=taskset,
        tasks=tasks if taskset else None,  # only send tasks if taskset specified
    )

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{settings.hud_api_url}/trace/job/{job_id}/enter",
                json=payload.model_dump(exclude_none=True),
                headers={"Authorization": f"Bearer {api_key}"},
            )
            if resp.is_success:
                try:
                    data = resp.json()
                except Exception:
                    return None
                if isinstance(data, dict):
                    ids = data.get("task_version_ids")
                    if isinstance(ids, list) and all(isinstance(x, str) for x in ids):
                        return ids
    except Exception as e:
        logger.warning("Failed to send job enter: %s", e)
    return None


@asynccontextmanager
async def run_eval(
    source: Task | list[Task] | None = None,
    *,
    name: str | None = None,
    variants: dict[str, Any] | None = None,
    group: int = 1,
    group_ids: list[str] | None = None,
    job_id: str | None = None,
    group_id: str | None = None,
    trace_id: str | None = None,
    api_key: str | None = None,
    max_concurrent: int | None = None,
    trace: bool = True,
    quiet: bool = False,
    taskset: str | None = None,
) -> AsyncGenerator[EvalContext, None]:
    """Standalone eval context manager.

    Creates an EvalContext for evaluation using Task objects (or deprecated LegacyTask).
    For loading tasks from datasets, use load_tasks() first.

    Args:
        source: Task source. Can be:
            - None: Create blank eval context
            - Task: Single Task object (from env() or load_tasks())
            - list[Task]: List of Task objects
            - LegacyTask: Single LegacyTask object (deprecated, use Task.from_v4())
            - list[LegacyTask]: List of LegacyTask objects (deprecated)
        name: Optional name for the eval (used in trace)
        variants: A/B test configuration (dict with list values expanded)
        group: Runs per variant for statistical significance
        group_ids: Optional list of group IDs
        job_id: Job ID to link to
        group_id: Group ID for parallel evaluations
        trace_id: Pre-assigned trace ID (auto-generated if not provided)
        api_key: API key for backend calls
        max_concurrent: Maximum concurrent evals (None = unlimited)
        trace: Whether to send trace data to backend (default True)
        quiet: Whether to suppress printing links (default False)

    Yields:
        EvalContext: Environment with evaluation tracking

    Example:
        ```python
        from hud.datasets import load_tasks

        # Blank eval (for manual reward)
        async with hud.eval() as ctx:
            ctx.reward = compute_reward()

        # With Task objects (from env())
        env = Environment("my-env").connect_hub("browser")
        tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
        async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
            await agent.run(ctx.prompt)

        # Load tasks from file or API
        tasks = load_tasks("hud-evals/SheetBench-50")
        async with hud.eval(tasks) as ctx:
            await agent.run(ctx)

        # With variants and group
        async with hud.eval(
            tasks,
            variants={"model": ["gpt-4o", "claude"]},
            group=3,
        ) as ctx:
            model = ctx.variants["model"]
            await run_agent(model)
            ctx.reward = evaluate()

        # With concurrency limit
        async with hud.eval(tasks, max_concurrent=10) as ctx:
            await agent.run(ctx)

        # Access results after parallel run
        for e in ctx.results:
            print(f"{e.variants}: reward={e.reward}")
        ```
    """
    from hud.eval.task import Task
    from hud.types import LegacyTask

    if group <= 0:
        raise ValueError("group must be >= 1")

    # Expand variants
    variant_combos = expand_variants(variants)

    # Parse source into tasks list - only Task objects accepted
    tasks: list[Task] = []

    if source is not None:
        if isinstance(source, Task):
            # Single Task object
            tasks = [source]
        elif isinstance(source, list) and source and isinstance(source[0], Task):
            # List of Task objects
            tasks = source  # type: ignore[assignment]
        elif isinstance(source, LegacyTask) or (
            isinstance(source, list) and source and isinstance(source[0], LegacyTask)
        ):
            # LegacyTask no longer accepted - user must convert first
            raise TypeError(
                "LegacyTask is no longer accepted by hud.eval(). "
                "Convert first with Task.from_v4(legacy_task), or use load_tasks()."
            )
        elif isinstance(source, str):
            # String slugs no longer supported - use load_dataset()
            raise TypeError(
                f"String slugs are no longer supported in hud.eval(). "
                f"Use load_tasks('{source}') first, then pass the tasks list."
            )
        elif isinstance(source, list) and source and isinstance(source[0], str):
            # List of string slugs no longer supported
            raise TypeError(
                "String slugs are no longer supported in hud.eval(). "
                "Use load_tasks() first, then pass the tasks list."
            )

    # Calculate total evaluations
    # Each task gets (variants x group) runs; no tasks = single blank eval
    base_count = len(tasks) or 1
    total_evals = base_count * len(variant_combos) * group

    # Capture code snippet for parallel execution
    code_snippet: str | None = None
    if total_evals > 1:
        frame = inspect.currentframe()
        if frame is not None:
            try:
                caller = frame.f_back
                if caller is not None:
                    code_snippet, _, _ = get_with_block_body(caller)
            except ASTExtractionError:
                pass
            finally:
                del frame

    # Lazy import to avoid circular dependency
    from hud.eval.context import EvalContext

    if total_evals == 1:
        if tasks:
            # Even for single-task evals, --taskset requires a job_enter call so the run
            # and task are linked to the taskset (via job_id + task_version_id).
            job_id_for_run = job_id
            if taskset:
                eval_name = _get_eval_name(tasks=tasks)
                if job_id_for_run is None:
                    job_id_for_run = str(uuid.uuid4())

                task_data = None
                if not tasks[0].id:
                    task_data = [tasks[0].model_dump(mode="json", exclude_none=True)]

                created_task_version_ids = await _send_job_enter(
                    job_id=job_id_for_run,
                    name=eval_name,
                    variants=variants,
                    group=group,
                    api_key=api_key,
                    taskset=taskset,
                    tasks=task_data,
                )
                if created_task_version_ids and not tasks[0].id:
                    tasks[0].id = created_task_version_ids[0]

            # Single task - use EvalContext.from_task()
            ctx = EvalContext.from_task(
                tasks[0],
                name=name,
                trace_id=trace_id,
                api_key=api_key,
                job_id=job_id_for_run,
                group_id=group_id,
                variants=variant_combos[0],
                code_snippet=code_snippet,
                trace=trace,
                quiet=quiet,
            )
            async with ctx:
                yield ctx
        else:
            # Blank eval - use EvalContext directly
            ctx = EvalContext(
                name=name or "eval",
                trace_id=trace_id,
                api_key=api_key,
                job_id=job_id,
                group_id=group_id,
                variants=variant_combos[0],
                code_snippet=code_snippet,
                trace=trace,
                quiet=quiet,
            )
            async with ctx:
                yield ctx

    else:
        # Parallel execution: create implicit job to group traces
        eval_name = _get_eval_name(tasks=tasks)
        implicit_job_id = job_id or str(uuid.uuid4())
        job_url = f"https://hud.ai/jobs/{implicit_job_id}"

        # Send job enter (sync request before traces start)
        # Serialize tasks for auto-add to taskset (only tasks without existing backend id).
        # For v5 scenario tasks, the backend task_version_id is carried in Task.id.
        tasks_data = None
        tasks_to_create: list[Task] = []
        if taskset and tasks:
            tasks_to_create = [t for t in tasks if not t.id]
            tasks_data = (
                [t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
                if tasks_to_create
                else None
            )
        created_task_version_ids = await _send_job_enter(
            job_id=implicit_job_id,
            name=eval_name,
            variants=variants,
            group=group,
            api_key=api_key,
            taskset=taskset,
            tasks=tasks_data,
        )
        if created_task_version_ids and tasks_to_create:
            # Assign backend IDs back onto the in-memory tasks so trace enter includes
            # task_version_id.
            # Platform guarantees ordered one-to-one mapping, but warn if counts differ.
            if len(created_task_version_ids) != len(tasks_to_create):
                logger.warning(
                    "Task count mismatch: sent %d tasks, received %d IDs. "
                    "Some tasks may not be linked to the taskset.",
                    len(tasks_to_create),
                    len(created_task_version_ids),
                )
            for task_obj, task_version_id in zip(
                tasks_to_create, created_task_version_ids, strict=False
            ):
                task_obj.id = task_version_id

        # Print job URL (not individual trace URLs)
        if not quiet:
            print_link(job_url, f"🚀 {eval_name}")

        error_occurred = False
        try:
            # Run parallel evals with job_id
            completed = await _run_parallel_eval(
                tasks=tasks,
                variant_combos=variant_combos,
                group=group,
                group_ids=group_ids,
                job_id=implicit_job_id,  # Propagate job_id to child traces
                api_key=api_key,
                code_snippet=code_snippet,
                max_concurrent=max_concurrent,
                trace=trace,
                quiet=quiet,
            )

            # Create summary context (no trace, just aggregates results)
            if tasks:
                # Create summary from first task
                ctx = EvalContext(
                    name=eval_name,  # Use the same smart name
                    api_key=api_key,
                    job_id=implicit_job_id,
                )
            else:
                ctx = EvalContext(
                    name="eval",
                    api_key=api_key,
                    job_id=implicit_job_id,
                )

            ctx._is_summary = True  # Skip trace tracking
            ctx.results = completed

            # Compute aggregate reward
            rewards = [e.reward for e in completed if e.reward is not None]
            if rewards:
                ctx.reward = sum(rewards) / len(rewards)

            # Check if any failed
            error_occurred = any(e.error is not None for e in completed)

            yield ctx
        except ParallelEvalComplete:
            # Expected - body re-executed on summary context, skip it
            pass
        except Exception:
            error_occurred = True
            raise
        finally:
            print_complete(job_url, eval_name, error=error_occurred)


async def _run_parallel_eval(
    tasks: list[Task],
    variant_combos: list[dict[str, Any]],
    group: int,
    group_ids: list[str] | None,
    job_id: str | None,
    api_key: str | None,
    code_snippet: str | None,
    max_concurrent: int | None,
    trace: bool = True,
    quiet: bool = False,
) -> list[EvalContext]:
    """Run parallel evaluation.

    Creates EvalContexts from Tasks (or blank) and runs them in parallel.
    """
    import asyncio
    import textwrap

    from hud.eval.parallel import log_eval_stats

    # Find user code frame and extract the with block body
    caller_frame = find_user_frame()
    body_source, captured_locals, context_var = get_with_block_body(caller_frame)

    # Calculate total evals and resolve group IDs
    base_count = len(tasks) or 1
    total_evals = base_count * len(variant_combos) * group
    resolved_group_ids = resolve_group_ids(group_ids, total_evals)

    # Build list of (task_or_none, runtime_params) for each parallel eval
    from hud.eval.context import EvalContext

    eval_configs: list[tuple[Task | None, dict[str, Any]]] = []
    idx = 0

    if tasks:
        for base_task in tasks:
            for variant in variant_combos:
                for _ in range(group):
                    runtime_params = {
                        "api_key": api_key,
                        "job_id": job_id,
                        "group_id": resolved_group_ids[idx],
                        "index": idx,
                        "variants": variant,
                        "code_snippet": code_snippet,
                        "trace": trace,
                        "quiet": True,  # Individual traces don't print links
                    }
                    eval_configs.append((base_task, runtime_params))
                    idx += 1
    else:
        for variant in variant_combos:
            for _ in range(group):
                runtime_params = {
                    "api_key": api_key,
                    "job_id": job_id,
                    "group_id": resolved_group_ids[idx],
                    "index": idx,
                    "variants": variant,
                    "code_snippet": code_snippet,
                    "trace": trace,
                    "quiet": True,
                }
                eval_configs.append((None, runtime_params))
                idx += 1

    # Create runner function using the actual variable name from the 'as' clause
    wrapped = f"async def __runner__({context_var}):\n{textwrap.indent(body_source, '    ')}"
    code = compile(wrapped, "<parallel_eval>", "exec")
    namespace = captured_locals.copy()
    exec(code, namespace)  # noqa: S102
    runner = namespace["__runner__"]

    # Create semaphore for concurrency control
    sem = asyncio.Semaphore(max_concurrent) if max_concurrent else None

    async def run_one(config: tuple[Task | None, dict[str, Any]]) -> EvalContext:
        """Run a single eval and return its EvalContext."""
        task, params = config
        idx = params["index"]

        # Create context from task or blank
        if task is not None:
            ctx = EvalContext.from_task(task, **params)
        else:
            ctx = EvalContext(name="eval", **params)

        # Remove sensitive data from params after context creation to prevent
        # accidental logging if an exception includes local variables
        params.pop("api_key", None)

        try:
            if sem:
                async with sem, ctx:
                    await runner(ctx)
            else:
                async with ctx:
                    await runner(ctx)
            return ctx
        except Exception as e:
            logger.warning("Parallel eval %d failed: %s", idx, e)
            ctx.error = e
            return ctx

    # Run in parallel
    logger.info(
        "Running %d evals (%d base x %d variants x %d runs)%s",
        len(eval_configs),
        base_count,
        len(variant_combos),
        group,
        f", max_concurrent={max_concurrent}" if max_concurrent else "",
    )
    completed = await asyncio.gather(*[run_one(cfg) for cfg in eval_configs])

    # Log and print stats
    eval_name = completed[0].eval_name if completed else "eval"
    log_eval_stats(completed)
    print_eval_stats(completed, name=eval_name)

    return list(completed)


__all__ = ["run_eval"]