hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two versions.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/eval/parallel.py ADDED
@@ -0,0 +1,268 @@
"""Parallel execution support for evaluations.

This module provides AST extraction and parallel execution for running
the same eval body N times concurrently.
"""

from __future__ import annotations

import ast
import inspect
import itertools
import linecache
import logging
import textwrap
import uuid
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from types import FrameType

    from hud.eval.context import EvalContext

logger = logging.getLogger(__name__)

# Frames to skip when walking the call stack to find user code
# These are internal implementation details that shouldn't be considered user code
_SKIP_FRAME_PATTERNS = (
    # Python stdlib
    "contextlib.py",
    "asyncio",
    # HUD eval internals (both Unix and Windows paths)
    "hud/eval/mixin.py",
    "hud/eval/manager.py",
    "hud/eval/parallel.py",
    "hud\\eval\\mixin.py",
    "hud\\eval\\manager.py",
    "hud\\eval\\parallel.py",
)

# Frames that should NOT be skipped even if in site-packages
# These contain legitimate async with hud.eval() calls
_ALLOWED_FRAME_PATTERNS = (
    "hud/datasets/runner.py",
    "hud\\datasets\\runner.py",
)


def find_user_frame() -> FrameType:
    """Walk the call stack to find the first user code frame.

    Skips internal frames from contextlib, asyncio, and hud.eval internals.
    Frames in site-packages are skipped UNLESS they match _ALLOWED_FRAME_PATTERNS.

    Returns:
        The frame containing user code (typically the async with statement).

    Raises:
        ASTExtractionError: If no user code frame can be found.
    """
    frame = inspect.currentframe()
    if frame is None:
        raise ASTExtractionError("Cannot get current frame")

    try:
        caller_frame = frame.f_back
        while caller_frame is not None:
            filename = caller_frame.f_code.co_filename

            # Check if this is an explicitly allowed frame (e.g., hud/datasets/runner.py)
            if any(pattern in filename for pattern in _ALLOWED_FRAME_PATTERNS):
                return caller_frame

            # Skip internal frames, but also skip site-packages unless allowed above
            is_internal = any(pattern in filename for pattern in _SKIP_FRAME_PATTERNS)
            is_site_packages = "site-packages" in filename

            if not is_internal and not is_site_packages:
                return caller_frame

            caller_frame = caller_frame.f_back

        raise ASTExtractionError("Cannot find user code frame in call stack")
    finally:
        del frame


def expand_variants(
    variants: dict[str, Any] | None,
) -> list[dict[str, Any]]:
    """Expand variants dict into all combinations.

    Args:
        variants: Dict where values can be:
            - Single value: {"model": "gpt-4o"} → fixed
            - List: {"model": ["gpt-4o", "claude"]} → expand

    Returns:
        List of variant assignments, one per combination.

    Examples:
        >>> expand_variants(None)
        [{}]
        >>> expand_variants({"model": "gpt-4o"})
        [{"model": "gpt-4o"}]
        >>> expand_variants({"model": ["gpt-4o", "claude"]})
        [{"model": "gpt-4o"}, {"model": "claude"}]
    """
    if not variants:
        return [{}]

    expanded: dict[str, list[Any]] = {}
    for key, value in variants.items():
        if isinstance(value, list):
            expanded[key] = value
        else:
            expanded[key] = [value]

    keys = list(expanded.keys())
    value_lists = [expanded[k] for k in keys]

    return [dict(zip(keys, combo, strict=True)) for combo in itertools.product(*value_lists)]


def resolve_group_ids(
    group_ids: list[str] | None,
    total_count: int,
) -> list[str]:
    """Resolve group IDs for parallel execution.

    Args:
        group_ids: Optional list of group IDs (must match total_count if provided)
        total_count: Total number of evals

    Returns:
        List of group IDs (one per eval)

    Raises:
        ValueError: If group_ids length doesn't match total_count
    """
    if group_ids:
        if len(group_ids) != total_count:
            raise ValueError(
                f"group_ids length ({len(group_ids)}) must match total evals ({total_count})"
            )
        return group_ids
    else:
        shared_group_id = str(uuid.uuid4())
        return [shared_group_id] * total_count


def log_eval_stats(completed: list[EvalContext], context: str = "") -> None:
    """Log statistics for completed evaluations.

    Args:
        completed: List of completed EvalContext objects
        context: Optional context string for the log message
    """
    rewards = [ctx.reward for ctx in completed if ctx.reward is not None]
    mean_reward = sum(rewards) / len(rewards) if rewards else 0.0
    success_count = sum(1 for ctx in completed if ctx.success)

    logger.info(
        "Evals complete%s: %d/%d succeeded, mean_reward=%.3f",
        f" ({context})" if context else "",
        success_count,
        len(completed),
        mean_reward,
    )


class ASTExtractionError(Exception):
    """Error extracting AST from source."""


def get_with_block_body(frame: Any) -> tuple[str, dict[str, Any], str]:
    """Extract the body of a with-block from the calling frame.

    Args:
        frame: The calling frame (from inspect.currentframe())

    Returns:
        Tuple of (body_source, captured_locals, context_var_name)
    """
    filename = frame.f_code.co_filename
    lineno = frame.f_lineno

    # Check for interactive session
    if filename.startswith("<") or filename in ("<stdin>", "<string>"):
        raise ASTExtractionError("Cannot extract source from interactive session. Use a .py file.")

    # Read and parse source
    lines = linecache.getlines(filename)
    if not lines:
        with open(filename, encoding="utf-8") as f:
            lines = f.readlines()

    source = "".join(lines)
    tree = ast.parse(source, filename=filename)

    # Find the async with containing this line
    with_node = _find_async_with(tree, lineno)
    if with_node is None:
        raise ASTExtractionError(f"Cannot find 'async with' statement at line {lineno}")

    # Extract body source
    body_source = _extract_body(lines, with_node)

    # Extract the context variable name from 'as' clause
    context_var = _extract_context_var(with_node)

    # Capture both globals (imports) and locals (variables in scope)
    captured = {**frame.f_globals, **frame.f_locals}

    return body_source, captured, context_var


def _extract_context_var(with_node: ast.AsyncWith) -> str:
    """Extract the variable name from the 'as' clause of an async with statement."""
    if not with_node.items or not with_node.items[0].optional_vars:
        raise ASTExtractionError("async with statement must use 'as' clause for parallel execution")

    var_node = with_node.items[0].optional_vars
    if not isinstance(var_node, ast.Name):
        raise ASTExtractionError("async with 'as' clause must be a simple variable name")

    return var_node.id


def _find_async_with(tree: ast.AST, target_line: int) -> ast.AsyncWith | None:
    """Find AsyncWith node containing the target line."""
    for node in ast.walk(tree):
        if isinstance(node, ast.AsyncWith):
            end_line = _get_end_line(node)
            if node.lineno <= target_line <= end_line:
                return node
    return None


def _get_end_line(node: ast.AST) -> int:
    """Get the last line number of an AST node."""
    end = getattr(node, "end_lineno", getattr(node, "lineno", 0))
    for child in ast.walk(node):
        child_end = getattr(child, "end_lineno", 0)
        if child_end > end:
            end = child_end
    return end


def _extract_body(lines: list[str], with_node: ast.AsyncWith) -> str:
    """Extract the body source from an AsyncWith node."""
    if not with_node.body:
        return "pass"

    start = with_node.body[0].lineno - 1
    end = _get_end_line(with_node.body[-1])

    body = "".join(lines[start:end])
    return textwrap.dedent(body)


__all__ = [
    "ASTExtractionError",
    "expand_variants",
    "find_user_frame",
    "get_with_block_body",
    "log_eval_stats",
    "resolve_group_ids",
]
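
The helpers above are plain functions, so the variant fan-out can be sanity-checked in isolation. A minimal sketch, assuming only that hud.eval.parallel is importable as added in this release; the model/temperature sweep values below are made up for illustration:

from hud.eval.parallel import expand_variants, resolve_group_ids

# Hypothetical sweep: 2 models x 2 temperatures -> 4 variant combinations.
variants = {"model": ["gpt-4o", "claude"], "temperature": [0.0, 0.7]}
combos = expand_variants(variants)
assert len(combos) == 4
assert combos[0] == {"model": "gpt-4o", "temperature": 0.0}

# Running each combination in a group of 2 gives 8 evals; with no explicit
# group_ids they all share one auto-generated UUID.
group_ids = resolve_group_ids(None, len(combos) * 2)
assert len(group_ids) == 8
assert len(set(group_ids)) == 1
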
hud/eval/task.py ADDED
@@ -0,0 +1,340 @@
"""Task - A runnable evaluation unit (Pydantic model).

A Task holds the configuration needed to run an evaluation:
- Environment configuration (how to create/connect)
- Optional scenario name and args

When entered as a context manager, it creates an EvalContext.

Usage:
    env = Environment("my-env").connect_hub("browser")

    # Empty - just env
    async with env() as ctx:
        await ctx.call_tool("navigate", url="...")

    # With scenario
    async with env("checkout", user_id="alice") as ctx:
        await agent.run(ctx.prompt)

    # Orchestrated via hud.eval
    tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
    async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
        ...
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any

from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    field_serializer,
    field_validator,
    model_serializer,
    model_validator,
)

from hud.types import MCPToolCall

if TYPE_CHECKING:
    from hud.environment import Environment
    from hud.environment.types import EnvConfig

__all__ = ["Task", "TaskAgentConfig", "build_eval_name"]

logger = logging.getLogger(__name__)


class TaskAgentConfig(BaseModel):
    """Agent configuration for a Task.

    Contains settings that should be passed to the agent when running this task.
    """

    model_config = ConfigDict(extra="ignore")

    system_prompt: str | None = Field(
        default=None,
        description="Custom system prompt to pass to the agent",
    )

    @model_validator(mode="before")
    @classmethod
    def warn_extra_fields(cls, data: Any) -> Any:
        """Warn about extra fields that will be ignored."""
        if isinstance(data, dict):
            known_fields = {"system_prompt"}
            extra = set(data.keys()) - known_fields
            if extra:
                logger.warning(
                    "Deprecated or unknown fields in agent_config will be ignored: %s",
                    ", ".join(sorted(extra)),
                )
        return data


def build_eval_name(scenario: str | None, args: dict[str, Any] | None) -> str:
    """Build descriptive name: 'scenario with val1, val2, ...'"""
    if not scenario:
        return "eval"
    if not args:
        return scenario

    val_parts = []
    for v in list(args.values())[:3]:  # Max 3 values
        v_str = repr(v) if isinstance(v, str) else str(v)
        if len(v_str) > 25:
            v_str = v_str[:22] + "..."
        val_parts.append(v_str)

    if val_parts:
        return f"{scenario} with {', '.join(val_parts)}"
    return scenario


class Task(BaseModel):
    """A runnable evaluation unit (Pydantic model).

    Simplified v5 Task format:
    - env: Environment instance OR EnvConfig with hub name + filters
    - scenario: Scenario name to run
    - args: Scenario arguments
    - validation: Optional list of tool calls representing successful completion

    When entered as a context manager, creates an EvalContext.

    Attributes:
        id: Optional task identifier for filtering/tracking
        env: Environment instance (auto-created from dict/EnvConfig via validator)
        scenario: Scenario name to run (from @env.scenario)
        args: Scenario arguments
        validation: Optional list of MCPToolCall objects representing successful completion

    Example (v5 format):
        ```python
        from hud.eval import Task

        # Pass dict - auto-converts to Environment
        task = Task(
            env={"name": "browser", "include": ["navigate", "screenshot"]},
            scenario="checkout",
            args={"user_id": "alice"},
            validation=[{"name": "check_cart", "arguments": {}}],
        )
        # task.env is now Environment connected to browser hub!

        # Or pass live Environment directly
        env = Environment("my-env").connect_hub("browser")
        task = Task(env=env, scenario="checkout", args={"user_id": "alice"})
        ```

    Migration from v4:
        Use Task.from_v4() to convert LegacyTask objects:

        ```python
        task = Task.from_v4(legacy_task)
        # or
        task = Task.from_v4({"prompt": "...", "mcp_config": {...}, ...})
        ```
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Fields - env accepts Environment | EnvConfig | dict, auto-converts to Environment
    env: Any = Field(default=None)  # Typed as Any for input flexibility, validated below
    scenario: str | None = None
    id: str | None = None
    args: dict[str, Any] = Field(default_factory=dict)
    validation: list[MCPToolCall] | None = None

    # Agent config - settings passed to agent (system_prompt, etc.)
    # Accepts TaskAgentConfig or dict (auto-converted via validator)
    agent_config: TaskAgentConfig | dict[str, Any] | None = None

    # Task metadata - for tracking/filtering, not used by agent
    metadata: dict[str, Any] = Field(default_factory=dict)

    @field_validator("agent_config", mode="before")
    @classmethod
    def convert_agent_config(
        cls, v: TaskAgentConfig | dict[str, Any] | None
    ) -> TaskAgentConfig | None:
        """Auto-convert dict to TaskAgentConfig."""
        if v is None:
            return None
        if isinstance(v, TaskAgentConfig):
            return v
        if isinstance(v, dict):
            return TaskAgentConfig(**v)
        raise TypeError(
            f"Task.agent_config must be TaskAgentConfig or dict. Got {type(v).__name__}"
        )

    @model_validator(mode="before")
    @classmethod
    def detect_v4_format(cls, data: Any) -> Any:
        """Auto-detect v4 LegacyTask format and convert to v5 Task format.

        If the input dict is a valid v4 format (has prompt, mcp_config, evaluate_tool),
        it's converted using build_env_from_v4().

        This allows Task(**v4_dict) to work seamlessly.
        """
        from hud.eval.utils import build_env_from_v4, is_v4_format, validate_v4_task

        if not isinstance(data, dict):
            return data

        if is_v4_format(data):
            # Validate completeness before conversion
            validate_v4_task(data)
            # build_env_from_v4 returns a dict with all Task fields
            return build_env_from_v4(data)

        return data

    @field_validator("env", mode="before")
    @classmethod
    def convert_env(cls, v: Environment | EnvConfig | dict[str, Any] | None) -> Environment | None:
        """Auto-convert dict/EnvConfig to Environment.

        Format: {"name": "browser", "include": [...], "exclude": [...]}
        """
        from hud.environment import Environment
        from hud.environment.types import EnvConfig

        if v is None:
            return None
        if isinstance(v, Environment):
            return v
        if isinstance(v, dict):
            try:
                config = EnvConfig(**v)
            except Exception as e:
                raise ValueError(
                    f"Invalid env config: {e}. Expected fields: name (str), "
                    f"include (list[str] | None), exclude (list[str] | None)"
                ) from e
            env = Environment(config.name)
            env.connect_hub(config.name, include=config.include, exclude=config.exclude)
            return env
        if isinstance(v, EnvConfig):
            env = Environment(v.name)
            env.connect_hub(v.name, include=v.include, exclude=v.exclude)
            return env
        raise TypeError(f"Task.env must be Environment, EnvConfig, or dict. Got {type(v).__name__}")

    @field_validator("validation", mode="before")
    @classmethod
    def convert_validation(
        cls, v: list[MCPToolCall | dict[str, Any]] | None
    ) -> list[MCPToolCall] | None:
        """Auto-convert validation dicts to MCPToolCall objects."""
        if v is None:
            return None
        if not isinstance(v, list):
            raise TypeError(f"validation must be a list, got {type(v).__name__}")

        converted = []
        for item in v:
            if isinstance(item, dict):
                converted.append(MCPToolCall(**item))
            elif isinstance(item, MCPToolCall):
                converted.append(item)
            else:
                raise TypeError(
                    f"validation items must be dict or MCPToolCall, got {type(item).__name__}"
                )
        return converted

    @field_serializer("env")
    def serialize_env(self, env: Environment | None) -> dict[str, Any] | None:
        """Serialize Environment to config dict via to_config()."""
        if env is None:
            return None
        return env.to_config()

    @model_serializer(mode="wrap")
    def _serialize_task(
        self,
        handler: Any,  # SerializerFunctionWrapHandler
    ) -> dict[str, Any]:
        """Custom serializer for v4 format flattening.

        For v5 tasks: uses default serialization (env field handled by field_serializer)
        For v4 tasks: flattens {"prompt": ..., "mcp_config": ..., "evaluate_tool": ...}
        """
        # Get default serialization (env is already converted by field_serializer)
        data = handler(self)

        # Check if this is a v4 task (env config has mcp_config)
        env_config = data.get("env")
        if env_config and isinstance(env_config, dict) and "mcp_config" in env_config:
            # v4 format - flatten into top-level dict
            result = env_config.copy()

            # Map validation → integration_test_tool
            if self.validation:
                result["integration_test_tool"] = [
                    {"name": v.name, "arguments": v.arguments or {}} for v in self.validation
                ]

            # Preserve agent_config
            if data.get("agent_config"):
                result["agent_config"] = data["agent_config"]

            # Preserve metadata
            if data.get("metadata"):
                result["metadata"] = data["metadata"]

            # Preserve id
            if data.get("id"):
                result["id"] = data["id"]

            return result

        return data

    @classmethod
    def from_v4(cls, source: Any) -> Task:
        """Convert v4 LegacyTask format to v5 Task.

        This is a convenience wrapper. You can also use Task(**dict) directly
        since the model validator auto-detects v4 format.

        Args:
            source: LegacyTask, dict, or JSON string with v4 fields

        Returns:
            Task configured for v4 behavior
        """
        import json as json_module

        # JSON string → dict
        if isinstance(source, str):
            source = json_module.loads(source)

        # LegacyTask → dict (import only when needed)
        if hasattr(source, "model_dump"):
            source = source.model_dump()

        # Model validator handles v4 detection and conversion
        return cls(**source)

    def copy(self) -> Task:
        """Create a copy of this Task config.

        Note: env is shared (not deep copied) since Environment instances
        should be reused. Args and validation are deep copied.
        """
        return Task(
            id=self.id,
            env=self.env,  # Share reference
            scenario=self.scenario,
            args=self.args.copy() if self.args else {},
            validation=self.validation.copy() if self.validation else None,
        )
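
Because the field validators coerce plain dicts, a Task can be declared from JSON-like data and still end up with typed fields. A small sketch under the assumption that hud.eval.task imports as added in this release; the scenario name, arguments, and prompt are invented, and env is left unset so nothing connects to a hub:

from hud.eval.task import Task, build_eval_name
from hud.types import MCPToolCall

task = Task(
    scenario="checkout",
    args={"user_id": "alice", "items": 3},
    validation=[{"name": "check_cart", "arguments": {"user_id": "alice"}}],
    agent_config={"system_prompt": "Be terse."},
)

# Dicts were coerced by the validators.
assert isinstance(task.validation[0], MCPToolCall)
assert task.agent_config.system_prompt == "Be terse."

# Descriptive run name built from the scenario and its first few arg values.
assert build_eval_name(task.scenario, task.args) == "checkout with 'alice', 3"

Serializing such a task with task.model_dump() keeps the v5 shape; only tasks whose env config carries an mcp_config entry are flattened back into the v4 layout by the wrap serializer.
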
hud/eval/tests/__init__.py ADDED
@@ -0,0 +1 @@
"""Tests for hud.eval module."""