hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/eval/manager.py ADDED
@@ -0,0 +1,533 @@
1
+ """Standalone eval() context manager.
2
+
3
+ Provides hud.eval() for task-based evaluation without needing an existing environment.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import inspect
9
+ import logging
10
+ import uuid
11
+ from contextlib import asynccontextmanager
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ from hud.eval.display import print_complete, print_eval_stats, print_link
15
+ from hud.eval.parallel import (
16
+ ASTExtractionError,
17
+ expand_variants,
18
+ find_user_frame,
19
+ get_with_block_body,
20
+ resolve_group_ids,
21
+ )
22
+ from hud.eval.types import ParallelEvalComplete
23
+
24
+ if TYPE_CHECKING:
25
+ from collections.abc import AsyncGenerator
26
+
27
+ from hud.eval.context import EvalContext
28
+ from hud.eval.task import Task
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ def _get_eval_name(tasks: list[Task] | None = None) -> str:
34
+ """Extract a nice name for job display.
35
+
36
+ Args:
37
+ tasks: List of Task objects
38
+
39
+ Returns:
40
+ Name like "scenario with val1, val2" or "eval" if no tasks
41
+ """
42
+ from hud.eval.task import build_eval_name
43
+
44
+ # If we have Task objects, derive name from first one
45
+ if tasks:
46
+ if tasks[0].scenario:
47
+ return build_eval_name(tasks[0].scenario, tasks[0].args)
48
+ # Fall back to env name or prompt
49
+ if tasks[0].env and hasattr(tasks[0].env, "name"):
50
+ return tasks[0].env.name
51
+ if tasks[0].env and hasattr(tasks[0].env, "prompt") and tasks[0].env.prompt:
52
+ return tasks[0].env.prompt[:30].strip()
53
+ if tasks[0].id:
54
+ return tasks[0].id
55
+
56
+ return "eval"
57
+
58
+
59
async def _send_job_enter(
    job_id: str,
    name: str,
    variants: dict[str, Any] | None,
    group: int,
    api_key: str | None,
    taskset: str | None = None,
    tasks: list[dict[str, Any]] | None = None,
) -> list[str] | None:
    """Send job enter payload (async request before traces start)."""
    import httpx

    from hud.eval.types import JobEnterPayload
    from hud.settings import settings

    # Fall back to the configured key; with telemetry disabled or no key
    # available there is nothing to send.
    key = api_key or settings.api_key
    if not (settings.telemetry_enabled and key):
        return None

    body = JobEnterPayload(
        name=name,
        variants=variants,
        group=group,
        taskset=taskset,
        # only send tasks if taskset specified
        tasks=tasks if taskset else None,
    ).model_dump(exclude_none=True)

    try:
        async with httpx.AsyncClient(timeout=10.0) as http:
            response = await http.post(
                f"{settings.hud_api_url}/trace/job/{job_id}/enter",
                json=body,
                headers={"Authorization": f"Bearer {key}"},
            )
            if not response.is_success:
                return None
            try:
                parsed = response.json()
            except Exception:
                # Non-JSON success body: treat as "no IDs returned".
                return None
            if isinstance(parsed, dict):
                version_ids = parsed.get("task_version_ids")
                # Only accept a well-formed list of string IDs.
                if isinstance(version_ids, list) and all(
                    isinstance(item, str) for item in version_ids
                ):
                    return version_ids
    except Exception as exc:
        # Best-effort telemetry: never let a backend hiccup break the eval.
        logger.warning("Failed to send job enter: %s", exc)
    return None
105
+
106
+
107
@asynccontextmanager
async def run_eval(
    source: Task | list[Task] | None = None,
    *,
    name: str | None = None,
    variants: dict[str, Any] | None = None,
    group: int = 1,
    group_ids: list[str] | None = None,
    job_id: str | None = None,
    group_id: str | None = None,
    trace_id: str | None = None,
    api_key: str | None = None,
    max_concurrent: int | None = None,
    trace: bool = True,
    quiet: bool = False,
    taskset: str | None = None,
) -> AsyncGenerator[EvalContext, None]:
    """Standalone eval context manager.

    Creates an EvalContext for evaluation using Task objects.
    For loading tasks from datasets, use load_tasks() first.

    Depending on the total run count (tasks x variants x group) this either
    runs a single eval inline (yielding its live EvalContext), or fans out
    into parallel runs and yields a summary context whose ``results`` holds
    the per-run contexts.

    Args:
        source: Task source. Can be:
            - None: Create blank eval context
            - Task: Single Task object (from env() or load_tasks())
            - list[Task]: List of Task objects
            - LegacyTask or list[LegacyTask]: rejected with TypeError
              (convert first with Task.from_v4())
            - str or list[str] slugs: rejected with TypeError
              (use load_tasks() first)
        name: Optional name for the eval (used in trace).
            NOTE(review): only honored on the single-eval path; parallel runs
            derive the job name from the first task — confirm intended.
        variants: A/B test configuration (dict with list values expanded)
        group: Runs per variant for statistical significance
        group_ids: Optional list of group IDs
        job_id: Job ID to link to
        group_id: Group ID for parallel evaluations
        trace_id: Pre-assigned trace ID (auto-generated if not provided)
        api_key: API key for backend calls
        max_concurrent: Maximum concurrent evals (None = unlimited)
        trace: Whether to send trace data to backend (default True)
        quiet: Whether to suppress printing links (default False)
        taskset: Optional taskset identifier; when set, a job-enter call is
            made so the run (and any tasks without backend IDs) is linked to
            the taskset via job_id + task_version_id.

    Raises:
        ValueError: If group < 1.
        TypeError: If source is a LegacyTask or a string slug (both no longer
            supported here).

    Yields:
        EvalContext: Environment with evaluation tracking

    Example:
        ```python
        from hud.datasets import load_tasks

        # Blank eval (for manual reward)
        async with hud.eval() as ctx:
            ctx.reward = compute_reward()

        # With Task objects (from env())
        env = Environment("my-env").connect_hub("browser")
        tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
        async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
            await agent.run(ctx.prompt)

        # Load tasks from file or API
        tasks = load_tasks("hud-evals/SheetBench-50")
        async with hud.eval(tasks) as ctx:
            await agent.run(ctx)

        # With variants and group
        async with hud.eval(
            tasks,
            variants={"model": ["gpt-4o", "claude"]},
            group=3,
        ) as ctx:
            model = ctx.variants["model"]
            await run_agent(model)
            ctx.reward = evaluate()

        # With concurrency limit
        async with hud.eval(tasks, max_concurrent=10) as ctx:
            await agent.run(ctx)

        # Access results after parallel run
        for e in ctx.results:
            print(f"{e.variants}: reward={e.reward}")
        ```
    """
    # Lazy imports (these modules import back into hud.eval).
    from hud.eval.task import Task
    from hud.types import LegacyTask

    if group <= 0:
        raise ValueError("group must be >= 1")

    # Expand variants into concrete combinations; the code below indexes
    # variant_combos[0], so the expansion is assumed to yield at least one combo.
    variant_combos = expand_variants(variants)

    # Parse source into tasks list - only Task objects accepted
    tasks: list[Task] = []

    if source is not None:
        if isinstance(source, Task):
            # Single Task object
            tasks = [source]
        elif isinstance(source, list) and source and isinstance(source[0], Task):
            # List of Task objects
            tasks = source  # type: ignore[assignment]
        elif isinstance(source, LegacyTask) or (
            isinstance(source, list) and source and isinstance(source[0], LegacyTask)
        ):
            # LegacyTask no longer accepted - user must convert first
            raise TypeError(
                "LegacyTask is no longer accepted by hud.eval(). "
                "Convert first with Task.from_v4(legacy_task), or use load_tasks()."
            )
        elif isinstance(source, str):
            # String slugs no longer supported - use load_dataset()
            raise TypeError(
                f"String slugs are no longer supported in hud.eval(). "
                f"Use load_tasks('{source}') first, then pass the tasks list."
            )
        elif isinstance(source, list) and source and isinstance(source[0], str):
            # List of string slugs no longer supported
            raise TypeError(
                "String slugs are no longer supported in hud.eval(). "
                "Use load_tasks() first, then pass the tasks list."
            )

    # Calculate total evaluations
    # Each task gets (variants x group) runs; no tasks = single blank eval
    base_count = len(tasks) or 1
    total_evals = base_count * len(variant_combos) * group

    # Capture code snippet for parallel execution.
    # NOTE(review): frame.f_back assumes run_eval is entered directly from user
    # code; an extra wrapper layer would break the with-block extraction — confirm.
    code_snippet: str | None = None
    if total_evals > 1:
        frame = inspect.currentframe()
        if frame is not None:
            try:
                caller = frame.f_back
                if caller is not None:
                    code_snippet, _, _ = get_with_block_body(caller)
            except ASTExtractionError:
                # Extraction is best-effort; run without a snippet.
                pass
            finally:
                # Drop the frame reference promptly (frames keep locals alive
                # and can form reference cycles).
                del frame

    # Lazy import to avoid circular dependency
    from hud.eval.context import EvalContext

    if total_evals == 1:
        if tasks:
            # Even for single-task evals, --taskset requires a job_enter call so the run
            # and task are linked to the taskset (via job_id + task_version_id).
            job_id_for_run = job_id
            if taskset:
                eval_name = _get_eval_name(tasks=tasks)
                if job_id_for_run is None:
                    job_id_for_run = str(uuid.uuid4())

                # Only tasks without a backend id need to be created server-side.
                task_data = None
                if not tasks[0].id:
                    task_data = [tasks[0].model_dump(mode="json", exclude_none=True)]

                created_task_version_ids = await _send_job_enter(
                    job_id=job_id_for_run,
                    name=eval_name,
                    variants=variants,
                    group=group,
                    api_key=api_key,
                    taskset=taskset,
                    tasks=task_data,
                )
                # Adopt the backend-assigned id so the trace links to the taskset.
                if created_task_version_ids and not tasks[0].id:
                    tasks[0].id = created_task_version_ids[0]

            # Single task - use EvalContext.from_task()
            ctx = EvalContext.from_task(
                tasks[0],
                name=name,
                trace_id=trace_id,
                api_key=api_key,
                job_id=job_id_for_run,
                group_id=group_id,
                variants=variant_combos[0],
                code_snippet=code_snippet,
                trace=trace,
                quiet=quiet,
            )
            async with ctx:
                yield ctx
        else:
            # Blank eval - use EvalContext directly
            ctx = EvalContext(
                name=name or "eval",
                trace_id=trace_id,
                api_key=api_key,
                job_id=job_id,
                group_id=group_id,
                variants=variant_combos[0],
                code_snippet=code_snippet,
                trace=trace,
                quiet=quiet,
            )
            async with ctx:
                yield ctx

    else:
        # Parallel execution: create implicit job to group traces
        eval_name = _get_eval_name(tasks=tasks)
        implicit_job_id = job_id or str(uuid.uuid4())
        job_url = f"https://hud.ai/jobs/{implicit_job_id}"

        # Send job enter (sync request before traces start)
        # Serialize tasks for auto-add to taskset (only tasks without existing backend id).
        # For v5 scenario tasks, the backend task_version_id is carried in Task.id.
        tasks_data = None
        tasks_to_create: list[Task] = []
        if taskset and tasks:
            tasks_to_create = [t for t in tasks if not t.id]
            tasks_data = (
                [t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
                if tasks_to_create
                else None
            )
        created_task_version_ids = await _send_job_enter(
            job_id=implicit_job_id,
            name=eval_name,
            variants=variants,
            group=group,
            api_key=api_key,
            taskset=taskset,
            tasks=tasks_data,
        )
        if created_task_version_ids and tasks_to_create:
            # Assign backend IDs back onto the in-memory tasks so trace enter includes
            # task_version_id.
            # Platform guarantees ordered one-to-one mapping, but warn if counts differ.
            if len(created_task_version_ids) != len(tasks_to_create):
                logger.warning(
                    "Task count mismatch: sent %d tasks, received %d IDs. "
                    "Some tasks may not be linked to the taskset.",
                    len(tasks_to_create),
                    len(created_task_version_ids),
                )
            # strict=False: tolerate a length mismatch (warned above) by
            # pairing as many as possible.
            for task_obj, task_version_id in zip(
                tasks_to_create, created_task_version_ids, strict=False
            ):
                task_obj.id = task_version_id

        # Print job URL (not individual trace URLs)
        if not quiet:
            print_link(job_url, f"🚀 {eval_name}")

        error_occurred = False
        try:
            # Run parallel evals with job_id
            completed = await _run_parallel_eval(
                tasks=tasks,
                variant_combos=variant_combos,
                group=group,
                group_ids=group_ids,
                job_id=implicit_job_id,  # Propagate job_id to child traces
                api_key=api_key,
                code_snippet=code_snippet,
                max_concurrent=max_concurrent,
                trace=trace,
                quiet=quiet,
            )

            # Create summary context (no trace, just aggregates results)
            if tasks:
                # Create summary from first task
                ctx = EvalContext(
                    name=eval_name,  # Use the same smart name
                    api_key=api_key,
                    job_id=implicit_job_id,
                )
            else:
                ctx = EvalContext(
                    name="eval",
                    api_key=api_key,
                    job_id=implicit_job_id,
                )

            ctx._is_summary = True  # Skip trace tracking
            ctx.results = completed

            # Compute aggregate reward: mean over runs that produced a reward.
            rewards = [e.reward for e in completed if e.reward is not None]
            if rewards:
                ctx.reward = sum(rewards) / len(rewards)

            # Check if any failed
            error_occurred = any(e.error is not None for e in completed)

            yield ctx
        except ParallelEvalComplete:
            # Expected - body re-executed on summary context, skip it
            pass
        except Exception:
            error_occurred = True
            raise
        finally:
            # Always close out the job banner, pass/fail alike.
            print_complete(job_url, eval_name, error=error_occurred)
406
+
407
+
408
async def _run_parallel_eval(
    tasks: list[Task],
    variant_combos: list[dict[str, Any]],
    group: int,
    group_ids: list[str] | None,
    job_id: str | None,
    api_key: str | None,
    code_snippet: str | None,
    max_concurrent: int | None,
    trace: bool = True,
    quiet: bool = False,
) -> list[EvalContext]:
    """Run parallel evaluation.

    Creates EvalContexts from Tasks (or blank) and runs them in parallel.

    The caller's ``async with`` block body is re-extracted from source via
    find_user_frame()/get_with_block_body(), compiled into an ``__runner__``
    coroutine, and executed once per (task x variant x group) combination.
    A failing run is recorded on its context's ``error`` attribute instead of
    raising, so one failure does not cancel sibling runs.

    Args:
        tasks: Task objects to fan out (empty list = blank evals per combo).
        variant_combos: Already-expanded variant dicts (one per A/B arm).
        group: Runs per (task, variant) pair.
        group_ids: Optional explicit group IDs; otherwise resolved/generated.
        job_id: Job ID propagated to every child trace.
        api_key: API key forwarded to each EvalContext.
        code_snippet: Captured with-block source (attached to each context).
        max_concurrent: Concurrency cap enforced via a semaphore (None = unlimited).
        trace: Whether child contexts send trace data.
        quiet: NOTE(review): accepted for signature parity but unused here —
            child traces are always created with quiet=True (the caller prints
            the job URL instead).

    Returns:
        One completed EvalContext per scheduled eval, in scheduling order.
    """
    import asyncio
    import textwrap

    from hud.eval.parallel import log_eval_stats

    # Find user code frame and extract the with block body
    caller_frame = find_user_frame()
    body_source, captured_locals, context_var = get_with_block_body(caller_frame)

    # Calculate total evals and resolve group IDs
    base_count = len(tasks) or 1
    total_evals = base_count * len(variant_combos) * group
    resolved_group_ids = resolve_group_ids(group_ids, total_evals)

    # Build list of (task_or_none, runtime_params) for each parallel eval
    from hud.eval.context import EvalContext

    eval_configs: list[tuple[Task | None, dict[str, Any]]] = []
    idx = 0

    if tasks:
        # Nesting order fixes run ordering: task-major, then variant, then repeat.
        for base_task in tasks:
            for variant in variant_combos:
                for _ in range(group):
                    runtime_params = {
                        "api_key": api_key,
                        "job_id": job_id,
                        "group_id": resolved_group_ids[idx],
                        "index": idx,
                        "variants": variant,
                        "code_snippet": code_snippet,
                        "trace": trace,
                        "quiet": True,  # Individual traces don't print links
                    }
                    eval_configs.append((base_task, runtime_params))
                    idx += 1
    else:
        # No tasks: blank evals, one per (variant, group repeat).
        for variant in variant_combos:
            for _ in range(group):
                runtime_params = {
                    "api_key": api_key,
                    "job_id": job_id,
                    "group_id": resolved_group_ids[idx],
                    "index": idx,
                    "variants": variant,
                    "code_snippet": code_snippet,
                    "trace": trace,
                    "quiet": True,
                }
                eval_configs.append((None, runtime_params))
                idx += 1

    # Create runner function using the actual variable name from the 'as' clause.
    # The body text is indented one level to sit inside the generated coroutine.
    wrapped = f"async def __runner__({context_var}):\n{textwrap.indent(body_source, '    ')}"
    code = compile(wrapped, "<parallel_eval>", "exec")
    # Seed the exec namespace with the caller's captured locals so the body
    # resolves the same names it did at its original location.
    namespace = captured_locals.copy()
    exec(code, namespace)  # noqa: S102
    runner = namespace["__runner__"]

    # Create semaphore for concurrency control
    sem = asyncio.Semaphore(max_concurrent) if max_concurrent else None

    async def run_one(config: tuple[Task | None, dict[str, Any]]) -> EvalContext:
        """Run a single eval and return its EvalContext."""
        task, params = config
        # Shadows the outer builder index on purpose; used only for logging.
        idx = params["index"]

        # Create context from task or blank
        if task is not None:
            ctx = EvalContext.from_task(task, **params)
        else:
            ctx = EvalContext(name="eval", **params)

        # Remove sensitive data from params after context creation to prevent
        # accidental logging if an exception includes local variables
        params.pop("api_key", None)

        try:
            if sem:
                async with sem, ctx:
                    await runner(ctx)
            else:
                async with ctx:
                    await runner(ctx)
            return ctx
        except Exception as e:
            # Capture instead of raising so sibling runs keep going.
            logger.warning("Parallel eval %d failed: %s", idx, e)
            ctx.error = e
            return ctx

    # Run in parallel
    logger.info(
        "Running %d evals (%d base x %d variants x %d runs)%s",
        len(eval_configs),
        base_count,
        len(variant_combos),
        group,
        f", max_concurrent={max_concurrent}" if max_concurrent else "",
    )
    # gather preserves input order, so results align with eval_configs.
    completed = await asyncio.gather(*[run_one(cfg) for cfg in eval_configs])

    # Log and print stats
    eval_name = completed[0].eval_name if completed else "eval"
    log_eval_stats(completed)
    print_eval_stats(completed, name=eval_name)

    return list(completed)
531
+
532
+
533
+ __all__ = ["run_eval"]