hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/eval/manager.py ADDED
@@ -0,0 +1,466 @@
1
+ """Standalone eval() context manager.
2
+
3
+ Provides hud.eval() for task-based evaluation without needing an existing environment.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import inspect
9
+ import logging
10
+ import uuid
11
+ from contextlib import asynccontextmanager
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ from hud.eval.display import print_complete, print_eval_stats, print_link
15
+ from hud.eval.parallel import (
16
+ ASTExtractionError,
17
+ expand_variants,
18
+ find_user_frame,
19
+ get_with_block_body,
20
+ resolve_group_ids,
21
+ )
22
+ from hud.eval.types import ParallelEvalComplete
23
+
24
+ if TYPE_CHECKING:
25
+ from collections.abc import AsyncGenerator
26
+
27
+ from hud.eval.context import EvalContext
28
+ from hud.eval.task import Task
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ def _get_eval_name(tasks: list[Task] | None = None) -> str:
34
+ """Extract a nice name for job display.
35
+
36
+ Args:
37
+ tasks: List of Task objects
38
+
39
+ Returns:
40
+ Name like "scenario with val1, val2" or "eval" if no tasks
41
+ """
42
+ from hud.eval.task import build_eval_name
43
+
44
+ # If we have Task objects, derive name from first one
45
+ if tasks:
46
+ if tasks[0].scenario:
47
+ return build_eval_name(tasks[0].scenario, tasks[0].args)
48
+ # Fall back to env name or prompt
49
+ if tasks[0].env and hasattr(tasks[0].env, "name"):
50
+ return tasks[0].env.name
51
+ if tasks[0].env and hasattr(tasks[0].env, "prompt") and tasks[0].env.prompt:
52
+ return tasks[0].env.prompt[:30].strip()
53
+ if tasks[0].id:
54
+ return tasks[0].id
55
+
56
+ return "eval"
57
+
58
+
59
+ def _send_job_enter(
60
+ job_id: str,
61
+ name: str,
62
+ variants: dict[str, Any] | None,
63
+ group: int,
64
+ api_key: str | None,
65
+ ) -> None:
66
+ """Send job enter payload (sync request before traces start)."""
67
+ import httpx
68
+
69
+ from hud.eval.types import JobEnterPayload
70
+ from hud.settings import settings
71
+
72
+ api_key = api_key or settings.api_key
73
+ if not settings.telemetry_enabled or not api_key:
74
+ return
75
+
76
+ payload = JobEnterPayload(
77
+ name=name,
78
+ variants=variants,
79
+ group=group,
80
+ )
81
+
82
+ try:
83
+ httpx.post(
84
+ f"{settings.hud_api_url}/trace/job/{job_id}/enter",
85
+ json=payload.model_dump(exclude_none=True),
86
+ headers={"Authorization": f"Bearer {api_key}"},
87
+ timeout=10.0,
88
+ )
89
+ except Exception as e:
90
+ logger.warning("Failed to send job enter: %s", e)
91
+
92
+
93
+ @asynccontextmanager
94
+ async def run_eval(
95
+ source: Task | list[Task] | None = None,
96
+ *,
97
+ name: str | None = None,
98
+ variants: dict[str, Any] | None = None,
99
+ group: int = 1,
100
+ group_ids: list[str] | None = None,
101
+ job_id: str | None = None,
102
+ group_id: str | None = None,
103
+ trace_id: str | None = None,
104
+ api_key: str | None = None,
105
+ max_concurrent: int | None = None,
106
+ trace: bool = True,
107
+ quiet: bool = False,
108
+ ) -> AsyncGenerator[EvalContext, None]:
109
+ """Standalone eval context manager.
110
+
111
+ Creates an EvalContext for evaluation using Task objects (LegacyTask is rejected; convert with Task.from_v4()).
112
+ For loading tasks from datasets, use load_tasks() first.
113
+
114
+ Args:
115
+ source: Task source. Can be:
116
+ - None: Create blank eval context
117
+ - Task: Single Task object (from env() or load_tasks())
118
+ - list[Task]: List of Task objects
119
+ - LegacyTask: Raises TypeError (convert first with Task.from_v4())
120
+ - list[LegacyTask]: Raises TypeError (convert first with Task.from_v4())
121
+ name: Optional name for the eval (used in trace)
122
+ variants: A/B test configuration (dict with list values expanded)
123
+ group: Runs per variant for statistical significance
124
+ group_ids: Optional list of group IDs
125
+ job_id: Job ID to link to
126
+ group_id: Group ID for parallel evaluations
127
+ trace_id: Pre-assigned trace ID (auto-generated if not provided)
128
+ api_key: API key for backend calls
129
+ max_concurrent: Maximum concurrent evals (None = unlimited)
130
+ trace: Whether to send trace data to backend (default True)
131
+ quiet: Whether to suppress printing links (default False)
132
+
133
+ Yields:
134
+ EvalContext: Environment with evaluation tracking
135
+
136
+ Example:
137
+ ```python
138
+ from hud.datasets import load_tasks
139
+
140
+ # Blank eval (for manual reward)
141
+ async with hud.eval() as ctx:
142
+ ctx.reward = compute_reward()
143
+
144
+ # With Task objects (from env())
145
+ env = Environment("my-env").connect_hub("browser")
146
+ tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
147
+ async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
148
+ await agent.run(ctx.prompt)
149
+
150
+ # Load tasks from file or API
151
+ tasks = load_tasks("hud-evals/SheetBench-50")
152
+ async with hud.eval(tasks) as ctx:
153
+ await agent.run(ctx)
154
+
155
+ # With variants and group
156
+ async with hud.eval(
157
+ tasks,
158
+ variants={"model": ["gpt-4o", "claude"]},
159
+ group=3,
160
+ ) as ctx:
161
+ model = ctx.variants["model"]
162
+ await run_agent(model)
163
+ ctx.reward = evaluate()
164
+
165
+ # With concurrency limit
166
+ async with hud.eval(tasks, max_concurrent=10) as ctx:
167
+ await agent.run(ctx)
168
+
169
+ # Access results after parallel run
170
+ for e in ctx.results:
171
+ print(f"{e.variants}: reward={e.reward}")
172
+ ```
173
+ """
174
+ from hud.eval.task import Task
175
+ from hud.types import LegacyTask
176
+
177
+ if group <= 0:
178
+ raise ValueError("group must be >= 1")
179
+
180
+ # Expand variants
181
+ variant_combos = expand_variants(variants)
182
+
183
+ # Parse source into tasks list - only Task objects accepted
184
+ tasks: list[Task] = []
185
+
186
+ if source is not None:
187
+ if isinstance(source, Task):
188
+ # Single Task object
189
+ tasks = [source]
190
+ elif isinstance(source, list) and source and isinstance(source[0], Task):
191
+ # List of Task objects
192
+ tasks = source # type: ignore[assignment]
193
+ elif isinstance(source, LegacyTask) or (
194
+ isinstance(source, list) and source and isinstance(source[0], LegacyTask)
195
+ ):
196
+ # LegacyTask no longer accepted - user must convert first
197
+ raise TypeError(
198
+ "LegacyTask is no longer accepted by hud.eval(). "
199
+ "Convert first with Task.from_v4(legacy_task), or use load_tasks()."
200
+ )
201
+ elif isinstance(source, str):
202
+ # String slugs no longer supported - use load_tasks()
203
+ raise TypeError(
204
+ f"String slugs are no longer supported in hud.eval(). "
205
+ f"Use load_tasks('{source}') first, then pass the tasks list."
206
+ )
207
+ elif isinstance(source, list) and source and isinstance(source[0], str):
208
+ # List of string slugs no longer supported
209
+ raise TypeError(
210
+ "String slugs are no longer supported in hud.eval(). "
211
+ "Use load_tasks() first, then pass the tasks list."
212
+ )
213
+
214
+ # Calculate total evaluations
215
+ # Each task gets (variants x group) runs; no tasks = single blank eval
216
+ base_count = len(tasks) or 1
217
+ total_evals = base_count * len(variant_combos) * group
218
+
219
+ # Capture code snippet for parallel execution
220
+ code_snippet: str | None = None
221
+ if total_evals > 1:
222
+ frame = inspect.currentframe()
223
+ if frame is not None:
224
+ try:
225
+ caller = frame.f_back
226
+ if caller is not None:
227
+ code_snippet, _, _ = get_with_block_body(caller)
228
+ except ASTExtractionError:
229
+ pass
230
+ finally:
231
+ del frame
232
+
233
+ # Lazy import to avoid circular dependency
234
+ from hud.eval.context import EvalContext
235
+
236
+ if total_evals == 1:
237
+ if tasks:
238
+ # Single task - use EvalContext.from_task()
239
+ ctx = EvalContext.from_task(
240
+ tasks[0],
241
+ name=name,
242
+ trace_id=trace_id,
243
+ api_key=api_key,
244
+ job_id=job_id,
245
+ group_id=group_id,
246
+ variants=variant_combos[0],
247
+ code_snippet=code_snippet,
248
+ trace=trace,
249
+ quiet=quiet,
250
+ )
251
+ async with ctx:
252
+ yield ctx
253
+ else:
254
+ # Blank eval - use EvalContext directly
255
+ ctx = EvalContext(
256
+ name=name or "eval",
257
+ trace_id=trace_id,
258
+ api_key=api_key,
259
+ job_id=job_id,
260
+ group_id=group_id,
261
+ variants=variant_combos[0],
262
+ code_snippet=code_snippet,
263
+ trace=trace,
264
+ quiet=quiet,
265
+ )
266
+ async with ctx:
267
+ yield ctx
268
+
269
+ else:
270
+ # Parallel execution: create implicit job to group traces
271
+ eval_name = _get_eval_name(tasks=tasks)
272
+ implicit_job_id = job_id or str(uuid.uuid4())
273
+ job_url = f"https://hud.ai/jobs/{implicit_job_id}"
274
+
275
+ # Send job enter (sync request before traces start)
276
+ _send_job_enter(
277
+ job_id=implicit_job_id,
278
+ name=eval_name,
279
+ variants=variants,
280
+ group=group,
281
+ api_key=api_key,
282
+ )
283
+
284
+ # Print job URL (not individual trace URLs)
285
+ if not quiet:
286
+ print_link(job_url, f"🚀 {eval_name}")
287
+
288
+ error_occurred = False
289
+ try:
290
+ # Run parallel evals with job_id
291
+ completed = await _run_parallel_eval(
292
+ tasks=tasks,
293
+ variant_combos=variant_combos,
294
+ group=group,
295
+ group_ids=group_ids,
296
+ job_id=implicit_job_id, # Propagate job_id to child traces
297
+ api_key=api_key,
298
+ code_snippet=code_snippet,
299
+ max_concurrent=max_concurrent,
300
+ trace=trace,
301
+ quiet=quiet,
302
+ )
303
+
304
+ # Create summary context (no trace, just aggregates results)
305
+ if tasks:
306
+ # Create summary from first task
307
+ ctx = EvalContext(
308
+ name=eval_name, # Use the same smart name
309
+ api_key=api_key,
310
+ job_id=implicit_job_id,
311
+ )
312
+ else:
313
+ ctx = EvalContext(
314
+ name="eval",
315
+ api_key=api_key,
316
+ job_id=implicit_job_id,
317
+ )
318
+
319
+ ctx._is_summary = True # Skip trace tracking
320
+ ctx.results = completed
321
+
322
+ # Compute aggregate reward
323
+ rewards = [e.reward for e in completed if e.reward is not None]
324
+ if rewards:
325
+ ctx.reward = sum(rewards) / len(rewards)
326
+
327
+ # Check if any failed
328
+ error_occurred = any(e.error is not None for e in completed)
329
+
330
+ yield ctx
331
+ except ParallelEvalComplete:
332
+ # Expected - body re-executed on summary context, skip it
333
+ pass
334
+ except Exception:
335
+ error_occurred = True
336
+ raise
337
+ finally:
338
+ print_complete(job_url, eval_name, error=error_occurred)
339
+
340
+
341
+ async def _run_parallel_eval(
342
+ tasks: list[Task],
343
+ variant_combos: list[dict[str, Any]],
344
+ group: int,
345
+ group_ids: list[str] | None,
346
+ job_id: str | None,
347
+ api_key: str | None,
348
+ code_snippet: str | None,
349
+ max_concurrent: int | None,
350
+ trace: bool = True,
351
+ quiet: bool = False,
352
+ ) -> list[EvalContext]:
353
+ """Run parallel evaluation.
354
+
355
+ Creates EvalContexts from Tasks (or blank) and runs them in parallel.
356
+ """
357
+ import asyncio
358
+ import textwrap
359
+
360
+ from hud.eval.parallel import log_eval_stats
361
+
362
+ # Find user code frame and extract the with block body
363
+ caller_frame = find_user_frame()
364
+ body_source, captured_locals, context_var = get_with_block_body(caller_frame)
365
+
366
+ # Calculate total evals and resolve group IDs
367
+ base_count = len(tasks) or 1
368
+ total_evals = base_count * len(variant_combos) * group
369
+ resolved_group_ids = resolve_group_ids(group_ids, total_evals)
370
+
371
+ # Build list of (task_or_none, runtime_params) for each parallel eval
372
+ from hud.eval.context import EvalContext
373
+
374
+ eval_configs: list[tuple[Task | None, dict[str, Any]]] = []
375
+ idx = 0
376
+
377
+ if tasks:
378
+ for base_task in tasks:
379
+ for variant in variant_combos:
380
+ for _ in range(group):
381
+ runtime_params = {
382
+ "api_key": api_key,
383
+ "job_id": job_id,
384
+ "group_id": resolved_group_ids[idx],
385
+ "index": idx,
386
+ "variants": variant,
387
+ "code_snippet": code_snippet,
388
+ "trace": trace,
389
+ "quiet": True, # Individual traces don't print links
390
+ }
391
+ eval_configs.append((base_task, runtime_params))
392
+ idx += 1
393
+ else:
394
+ for variant in variant_combos:
395
+ for _ in range(group):
396
+ runtime_params = {
397
+ "api_key": api_key,
398
+ "job_id": job_id,
399
+ "group_id": resolved_group_ids[idx],
400
+ "index": idx,
401
+ "variants": variant,
402
+ "code_snippet": code_snippet,
403
+ "trace": trace,
404
+ "quiet": True,
405
+ }
406
+ eval_configs.append((None, runtime_params))
407
+ idx += 1
408
+
409
+ # Create runner function using the actual variable name from the 'as' clause
410
+ wrapped = f"async def __runner__({context_var}):\n{textwrap.indent(body_source, ' ')}"
411
+ code = compile(wrapped, "<parallel_eval>", "exec")
412
+ namespace = captured_locals.copy()
413
+ exec(code, namespace) # noqa: S102
414
+ runner = namespace["__runner__"]
415
+
416
+ # Create semaphore for concurrency control
417
+ sem = asyncio.Semaphore(max_concurrent) if max_concurrent else None
418
+
419
+ async def run_one(config: tuple[Task | None, dict[str, Any]]) -> EvalContext:
420
+ """Run a single eval and return its EvalContext."""
421
+ task, params = config
422
+ idx = params["index"]
423
+
424
+ # Create context from task or blank
425
+ if task is not None:
426
+ ctx = EvalContext.from_task(task, **params)
427
+ else:
428
+ ctx = EvalContext(name="eval", **params)
429
+
430
+ # Remove sensitive data from params after context creation to prevent
431
+ # accidental logging if an exception includes local variables
432
+ params.pop("api_key", None)
433
+
434
+ try:
435
+ if sem:
436
+ async with sem, ctx:
437
+ await runner(ctx)
438
+ else:
439
+ async with ctx:
440
+ await runner(ctx)
441
+ return ctx
442
+ except Exception as e:
443
+ logger.warning("Parallel eval %d failed: %s", idx, e)
444
+ ctx.error = e
445
+ return ctx
446
+
447
+ # Run in parallel
448
+ logger.info(
449
+ "Running %d evals (%d base x %d variants x %d runs)%s",
450
+ len(eval_configs),
451
+ base_count,
452
+ len(variant_combos),
453
+ group,
454
+ f", max_concurrent={max_concurrent}" if max_concurrent else "",
455
+ )
456
+ completed = await asyncio.gather(*[run_one(cfg) for cfg in eval_configs])
457
+
458
+ # Log and print stats
459
+ eval_name = completed[0].eval_name if completed else "eval"
460
+ log_eval_stats(completed)
461
+ print_eval_stats(completed, name=eval_name)
462
+
463
+ return list(completed)
464
+
465
+
466
+ __all__ = ["run_eval"]