hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/eval/context.py ADDED
@@ -0,0 +1,727 @@
+ """EvalContext - Environment with evaluation tracking.
+
+ EvalContext IS an Environment, with additional evaluation tracking
+ capabilities (trace_id, reward, backend reporting).
+
+ This makes `async with env.eval("task") as env` natural - you get
+ a full Environment that you can call tools on directly.
+ """
+
+ from __future__ import annotations
+
+ import contextvars
+ import logging
+ import uuid
+ from typing import TYPE_CHECKING, Any, Self
+
+ from hud.environment import Environment
+ from hud.settings import settings
+ from hud.shared import make_request
+ from hud.telemetry import flush, instrument
+
+ if TYPE_CHECKING:
+     from types import TracebackType
+
+     from hud.eval.task import Task
+     from hud.types import MCPToolResult
+
+
+ from hud.eval.types import EvalExitPayload, EvalPayload, ParallelEvalComplete
+
+ logger = logging.getLogger(__name__)
+
+ # Contextvar to store current trace headers (for httpx auto-instrumentation)
+ _current_trace_headers: contextvars.ContextVar[dict[str, str] | None] = contextvars.ContextVar(
+     "current_trace_headers", default=None
+ )
+
+ # Contextvar to store current api_key override (for telemetry exporter)
+ _current_api_key: contextvars.ContextVar[str | None] = contextvars.ContextVar(
+     "current_api_key", default=None
+ )
+
+
+ def get_current_trace_headers() -> dict[str, str] | None:
+     """Get the current trace headers from context."""
+     return _current_trace_headers.get()
+
+
+ def get_current_trace_id() -> str | None:
+     """Get the current trace ID (task_run_id) from context.
+
+     Returns the Trace-Id if inside an eval context, None otherwise.
+     Used by @instrument decorator to know where to send telemetry.
+     """
+     headers = _current_trace_headers.get()
+     if headers:
+         return headers.get("Trace-Id")
+     return None
+
+
+ def get_current_api_key() -> str | None:
+     """Get the current API key override from context.
+
+     Returns the api_key if one was passed to hud.eval(), otherwise None.
+     Falls back to settings.api_key if not in an eval context.
+     Used by telemetry exporter for uploads.
+     """
+     return _current_api_key.get()
+
+
+ # =============================================================================
+ # EvalContext
+ # =============================================================================
+
+
+ class EvalContext(Environment):
+     """Environment with evaluation tracking capabilities.
+
+     Attributes:
+         trace_id: Unique identifier for this evaluation
+         eval_name: Task/evaluation name (separate from env name)
+         job_id: Links to parent job (auto-detected from hud.job() context)
+         group_id: Links parallel evaluations together
+         variants: Variant assignment dict (for A/B testing)
+         reward: Reward value (user-settable)
+         error: Exception if failed
+         results: All eval results (populated for parallel execution, empty for single)
+         task: Task definition (if loaded from slug)
+
+     Example:
+         ```python
+         # With task (scenario sets reward automatically)
+         tasks = load_tasks("my-org/task:1")
+         async with hud.eval(tasks) as ctx:
+             await agent.run(ctx)
+         # reward set by scenario evaluate phase in __aexit__
+
+         # Blank eval (manual reward)
+         async with hud.eval() as ctx:
+             ctx.reward = compute_reward()
+         ```
+     """
+
+     def __init__(
+         self,
+         name: str = "eval",
+         *,
+         trace_id: str | None = None,
+         api_key: str | None = None,
+         job_id: str | None = None,
+         group_id: str | None = None,
+         index: int = 0,
+         variants: dict[str, Any] | None = None,
+         code_snippet: str | None = None,
+         trace: bool = True,
+         quiet: bool = False,
+         **env_kwargs: Any,
+     ) -> None:
+         """Initialize EvalContext.
+
+         Args:
+             name: Environment/evaluation name
+             trace_id: Unique trace ID (auto-generated if not provided)
+             api_key: API key for backend calls
+             job_id: Job ID to link to (auto-detected if not provided)
+             group_id: Group ID for parallel evaluations
+             index: Index in parallel execution
+             variants: Variant assignment for A/B testing
+             code_snippet: Code being evaluated (for reproducibility)
+             trace: Whether to send trace data to backend (default True)
+             quiet: Whether to suppress printing links (default False)
+             **env_kwargs: Additional kwargs passed to Environment.__init__
+         """
+         # Initialize Environment
+         super().__init__(name=name, **env_kwargs)
+
+         # === Evaluation tracking (not in Environment) ===
+
+         # Identity
+         self.trace_id: str = trace_id or str(uuid.uuid4())
+         self.eval_name: str = name  # Separate from self.name for clarity
+
+         # Job linkage
+         self.job_id: str | None = job_id
+
+         self.group_id: str | None = group_id
+         self.index: int = index
+
+         # Variant assignment
+         self.variants: dict[str, Any] = variants or {}
+
+         # User-settable (per-run values, override Environment defaults)
+         self.prompt: str | None = None  # From scenario setup or task
+         self.reward: float | None = None
+         self.answer: str | None = None  # Agent's submitted answer
+         self.system_prompt: str | None = None  # From task.agent_config, passed to agent
+
+         # Agent config overrides from task (applied by agent when running)
+         self.append_setup_output: bool = False  # Whether to append setup tool output to prompt
+
+         # Error tracking
+         self.error: BaseException | None = None
+
+         # User metadata (arbitrary key-value pairs)
+         self.metadata: dict[str, Any] = {}
+
+         # Parallel results (empty list for single evals, populated for parallel)
+         self.results: list[EvalContext] = []
+
+         # Code snippet for reproducibility
+         self.code_snippet: str | None = code_snippet
+
+         # Private state for eval tracking
+         self._eval_api_key = api_key
+         self._token: contextvars.Token[dict[str, str] | None] | None = None
+         self._api_key_token: contextvars.Token[str | None] | None = None
+         self._is_summary: bool = False  # True for summary contexts (skip trace)
+         self._suppress_link: bool = quiet  # True to suppress printing eval link
+         self._trace_enabled: bool = trace  # Whether to send trace data to backend
+         self._source_env_name: str | None = None  # Source env name for remote lookups
+         self._task: Task | None = None  # Task config (set by from_task)
+
+     @classmethod
+     def from_environment(
+         cls,
+         env: Environment,
+         name: str,
+         *,
+         trace_id: str | None = None,
+         api_key: str | None = None,
+         job_id: str | None = None,
+         group_id: str | None = None,
+         index: int = 0,
+         variants: dict[str, Any] | None = None,
+         code_snippet: str | None = None,
+         trace: bool = True,
+         quiet: bool = False,
+     ) -> EvalContext:
+         """Create an EvalContext that copies configuration from an existing Environment.
+
+         This creates a new EvalContext with the same connections as the parent.
+         Used by env.eval() to create evaluation contexts.
+
+         Args:
+             env: Parent environment to copy from
+             name: Evaluation name
+             trace_id: Unique trace ID
+             api_key: API key for backend calls
+             job_id: Job ID to link to
+             group_id: Group ID for parallel evaluations
+             index: Index in parallel execution
+             variants: Variant assignment
+             code_snippet: Code being evaluated
+         """
+         ctx = cls(
+             name=name,
+             trace_id=trace_id,
+             api_key=api_key,
+             job_id=job_id,
+             group_id=group_id,
+             index=index,
+             variants=variants,
+             code_snippet=code_snippet,
+             trace=trace,
+             quiet=quiet,
+         )
+
+         # Copy connections from parent - each connector is copied so parallel
+         # execution gets fresh client instances
+         ctx._connections = {name: connector.copy() for name, connector in env._connections.items()}
+
+         # Note: Auth is injected at request time by httpx/aiohttp hooks in hud.eval.instrument
+         # using the contextvar set in __aenter__ (supports api_key passed to hud.eval())
+         ctx._setup_calls = env._setup_calls.copy()
+         ctx._evaluate_calls = env._evaluate_calls.copy()
+         ctx._integration_test_calls = getattr(env, "_integration_test_calls", []).copy()
+         ctx._setup_results = getattr(env, "_setup_results", []).copy()
+
+         # Copy scenarios (definitions) by reference - they don't change
+         ctx._scenarios = getattr(env, "_scenarios", {})
+         # Create fresh session state for this eval (parallel evals each need their own)
+         ctx._active_session = None
+
+         # Store source env name for remote scenario lookups
+         ctx._source_env_name = env.name
+
+         # Copy managers by reference (they hold local tools, prompts, resources)
+         # This allows ctx.call_tool(), ctx.get_prompt(), ctx.read_resource() to work
+         # for locally defined tools/scenarios
+         ctx._tool_manager = env._tool_manager
+         ctx._prompt_manager = env._prompt_manager
+         ctx._resource_manager = env._resource_manager
+
+         # Copy prompt
+         if env.prompt:
+             ctx.prompt = env.prompt
+
+         # Copy agent-level tool filters (allowed_tools/disallowed_tools)
+         ctx._agent_include = getattr(env, "_agent_include", None)
+         ctx._agent_exclude = getattr(env, "_agent_exclude", None)
+
+         # Copy router's conflict resolution strategy
+         ctx._router.conflict_resolution = env._router.conflict_resolution
+
+         # Copy mock mode settings (for testing)
+         ctx._mock_mode = getattr(env, "_mock_mode", False)
+         ctx._mock_outputs = getattr(env, "_mock_outputs", {}).copy()
+         ctx._mock_tool_schemas = getattr(env, "_mock_tool_schemas", {}).copy()
+
+         # Copy hub config (needed to detect remote hub for telemetry)
+         ctx._hub_config = getattr(env, "_hub_config", None)
+
+         # Copy mcp config (needed to detect remote HUD MCP for telemetry)
+         ctx._mcp_config = getattr(env, "_mcp_config", None)
+
+         return ctx
+
+     @classmethod
+     def from_task(
+         cls,
+         task: Task,
+         *,
+         name: str | None = None,
+         trace_id: str | None = None,
+         api_key: str | None = None,
+         job_id: str | None = None,
+         group_id: str | None = None,
+         index: int = 0,
+         variants: dict[str, Any] | None = None,
+         code_snippet: str | None = None,
+         trace: bool = True,
+         quiet: bool = False,
+     ) -> EvalContext:
+         """Create an EvalContext from a Task config.
+
+         Args:
+             task: Task config (env, scenario, args)
+             name: Override for eval/trace name (defaults to task scenario/args)
+             trace_id: Unique trace ID
+             api_key: API key for backend calls
+             job_id: Job ID to link to
+             group_id: Group ID for parallel evaluations
+             index: Index in parallel execution
+             variants: Variant assignment
+             code_snippet: Code being evaluated
+             trace: Whether to send traces to backend
+             quiet: Whether to suppress output
+
+         Raises:
+             ValueError: If task.args is None (template tasks cannot be run directly)
+         """
+         from hud.environment import Environment
+         from hud.eval.task import build_eval_name
+
+         # Validate that task has args (not a template)
+         if task.args is None:
+             raise ValueError(
+                 f"Cannot run task with args=None (this is a template). "
+                 f"Provide args when creating the task: env('{task.scenario}', **args)"
+             )
+
+         eval_name = name or build_eval_name(task.scenario, task.args)
+
+         # task.env is guaranteed to be Environment after Task.__post_init__
+         assert isinstance(task.env, Environment), "Task.env should be Environment"
+
+         ctx = cls.from_environment(
+             env=task.env,
+             name=eval_name,
+             trace_id=trace_id,
+             api_key=api_key,
+             job_id=job_id,
+             group_id=group_id,
+             index=index,
+             variants=variants,
+             code_snippet=code_snippet,
+             trace=trace,
+             quiet=quiet,
+         )
+
+         # Store task info for scenario execution
+         ctx._task = task
+
+         # Copy agent_config fields from task to ctx (these override agent defaults)
+         if task.agent_config:
+             agent_config = task.agent_config
+             if isinstance(agent_config, dict):
+                 if agent_config.get("system_prompt"):
+                     ctx.system_prompt = agent_config["system_prompt"]
+                 if agent_config.get("append_setup_output"):
+                     ctx.append_setup_output = agent_config["append_setup_output"]
+                 # Also check append_setup_tool alias
+                 if agent_config.get("append_setup_tool"):
+                     ctx.append_setup_output = agent_config["append_setup_tool"]
+             else:
+                 # It's a BaseAgentConfig or TaskAgentConfig object
+                 if getattr(agent_config, "system_prompt", None):
+                     ctx.system_prompt = agent_config.system_prompt
+                 if getattr(agent_config, "append_setup_output", False):
+                     ctx.append_setup_output = agent_config.append_setup_output
+                 # Also check append_setup_tool alias
+                 if getattr(agent_config, "append_setup_tool", False):
+                     ctx.append_setup_output = True
+
+         return ctx
+
+     async def _run_task_scenario_setup(self) -> None:
+         """Run the task's scenario setup phase (if scenario provided)."""
+         if self._task is None or self._task.scenario is None:
+             return
+
+         prompt = await self.run_scenario_setup(self._task.scenario, self._task.args or {})
+         if prompt:
+             self.prompt = prompt
+
+     async def _run_task_scenario_evaluate(self) -> None:
+         """Run the task's scenario evaluate phase (if scenario provided)."""
+         if self._task is None or self._task.scenario is None:
+             return
+
+         reward = await self.run_scenario_evaluate(self._task.scenario)
+         if reward is not None:
+             self.reward = reward
+
+     # =========================================================================
+     # Summary Context - Attribute Access Control
+     # =========================================================================
+
+     # Attributes accessible on summary context (everything else raises ParallelEvalComplete)
+     _SUMMARY_ALLOWED = frozenset(
+         {
+             # Results and metadata
+             "results",
+             "reward",
+             "error",
+             "success",
+             # IDs
+             "trace_id",
+             "job_id",
+             "group_id",
+             "index",
+             # Private attrs
+             "_is_summary",
+             "_suppress_link",
+             "__class__",
+             "__dict__",
+         }
+     )
+
+     def __getattribute__(self, name: str) -> Any:
+         """Block most attribute access on summary contexts."""
+         # Always allow private/dunder and whitelisted attrs
+         if name.startswith("_") or name in EvalContext._SUMMARY_ALLOWED:
+             return super().__getattribute__(name)
+
+         # Check if this is a summary context
+         try:
+             is_summary = super().__getattribute__("_is_summary")
+         except AttributeError:
+             is_summary = False
+
+         if is_summary:
+             raise ParallelEvalComplete
+
+         return super().__getattribute__(name)
+
+     # =========================================================================
+     # Computed Properties (eval-specific)
+     # =========================================================================
+
+     @property
+     def headers(self) -> dict[str, str]:
+         """Headers for gateway integration."""
+         return {"Trace-Id": self.trace_id}
+
+     @property
+     def success(self) -> bool:
+         """True if no error occurred."""
+         return self.error is None
+
+     @property
+     def has_scenario(self) -> bool:
+         """True if a scenario is running and can accept submissions."""
+         return self._task is not None and self._task.scenario is not None
+
+     @property
+     def setup_output(self) -> str | None:
+         """Get setup tool output as formatted string for prepending to agent context.
+
+         Returns None if no setup tools were executed or all results were empty.
+         Used by agents when append_setup_output is enabled.
+         """
+         import mcp.types as mcp_types
+
+         setup_results = getattr(self, "_setup_results", [])
+         if not setup_results:
+             return None
+
+         output_parts: list[str] = []
+         for result in setup_results:
+             if result.content:
+                 output_parts.extend(
+                     block.text
+                     for block in result.content
+                     if isinstance(block, mcp_types.TextContent)
+                 )
+
+         if not output_parts:
+             return None
+
+         return "\n".join(output_parts)
+
+     # =========================================================================
+     # Backend Integration
+     # =========================================================================
+
+     def _get_eval_api_key(self) -> str | None:
+         return self._eval_api_key or settings.api_key
+
+     def _build_base_payload(self) -> EvalPayload:
+         """Build the base payload for enter/exit."""
+         return EvalPayload(
+             prompt=self.prompt,
+             code_snippet=self.code_snippet,
+             job_id=self.job_id,
+             group_id=self.group_id,
+             variants=self.variants if self.variants else None,
+             # Only send task_version_id for v5 tasks (those with scenarios).
+             # v4 tasks have client-side IDs that shouldn't be sent to backend.
+             task_version_id=self._task.id if self._task and self._task.scenario else None,
+             metadata=self.metadata if self.metadata else None,
+         )
+
+     async def log(self, metrics: dict[str, Any]) -> None:
+         """Log metrics to the backend."""
+         api_key = self._get_eval_api_key()
+         if not settings.telemetry_enabled or not api_key:
+             return
+
+         try:
+             await make_request(
+                 method="POST",
+                 url=f"{settings.hud_telemetry_url}/traces/{self.trace_id}/log",
+                 json={"metrics": metrics},
+                 api_key=api_key,
+             )
+         except Exception as e:
+             logger.warning("Failed to log metrics: %s", e)
+
+     async def submit(self, answer: str) -> None:
+         """Submit the agent's answer for scenario evaluation.
+
+         Delegates to Environment.submit() with the current scenario name.
+         The answer will be passed to the scenario's evaluate phase via
+         `yield`, e.g.: `answer = yield "Do the task"`
+
+         Args:
+             answer: The agent's final answer/result to submit
+
+         Example:
+             async with env("checkout", product="laptop") as ctx:
+                 response = await agent.run(ctx.prompt)
+                 await ctx.submit(response)
+             # On exit, scenario's evaluate phase receives the answer
+         """
+         if not self._task or not self._task.scenario:
+             return
+
+         # Store answer on context for display
+         self.answer = answer
+
+         # Delegate to Environment.submit() which handles storage + broadcast
+         await super().submit(self._task.scenario, answer)
+
+     async def _eval_enter(self) -> None:
+         """Notify backend that eval has started."""
+         if not self._trace_enabled:
+             return
+         api_key = self._get_eval_api_key()
+         if not settings.telemetry_enabled or not api_key:
+             return
+
+         try:
+             payload = self._build_base_payload()
+             await make_request(
+                 method="POST",
+                 url=f"{settings.hud_api_url}/trace/{self.trace_id}/enter",
+                 json=payload.model_dump(exclude_none=True),
+                 api_key=api_key,
+             )
+         except Exception as e:
+             logger.warning("Failed to send eval enter: %s", e)
+
+     async def _eval_exit(self, error_message: str | None = None) -> None:
+         """Notify backend that eval has completed."""
+         if not self._trace_enabled:
+             return
+         api_key = self._get_eval_api_key()
+         if not settings.telemetry_enabled or not api_key:
+             return
+
+         try:
+             payload = EvalExitPayload(
+                 **self._build_base_payload().model_dump(),
+                 reward=self.reward,
+                 success=self.success,
+                 error_message=error_message,
+             )
+             await make_request(
+                 method="POST",
+                 url=f"{settings.hud_api_url}/trace/{self.trace_id}/exit",
+                 json=payload.model_dump(exclude_none=True),
+                 api_key=api_key,
+             )
+         except Exception as e:
+             logger.warning("Failed to send eval exit: %s", e)
+
+     # =========================================================================
+     # Context Manager (override Environment)
+     # =========================================================================
+
+     async def __aenter__(self) -> Self:
+         """Enter eval context - connect environment and set trace headers."""
+         if self._is_summary:
+             return self
+
+         # Start tracking
+         self._token = _current_trace_headers.set(self.headers)
+         self._api_key_token = _current_api_key.set(self._eval_api_key)
+
+         # Register trace first (environment connection can fail)
+         await self._eval_enter()
+
+         try:
+             # Connect environment (MCP servers, tools)
+             await super().__aenter__()
+
+             # Run task scenario setup (if created from_task with scenario)
+             await self._run_task_scenario_setup()
+             self._print_eval_link()
+         except BaseException as e:
+             # Cleanup if setup fails - __aexit__ won't be called automatically
+             await self.__aexit__(type(e), e, e.__traceback__)
+             raise
+
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: TracebackType | None,
+     ) -> bool:
+         """Exit eval context - disconnect and report."""
+         # Summary contexts skip trace tracking (parallel results already tracked)
+         # Suppress ParallelEvalComplete - it's expected for skipping body re-execution
+         if self._is_summary:
+             return exc_type is ParallelEvalComplete
+
+         # Run task scenario evaluate (if no error and has scenario)
+         if exc_type is None:
+             await self._run_task_scenario_evaluate()
+
+         # Track error
+         error_msg: str | None = None
+         if exc_type is not None:
+             self.error = exc_val
+             error_msg = str(exc_val) if exc_val else "Unknown error"
+
+         # Flush any pending telemetry spans for this trace
+         flush(self.trace_id)
+
+         # Disconnect environment (parent class) - also runs evaluate tools
+         await super().__aexit__(exc_type, exc_val, exc_tb)
+
+         # Set reward from evaluate tools if not already set
+         if self.reward is None and hasattr(self, "_evaluate_reward"):
+             self.reward = self._evaluate_reward
+
+         # Reset context vars
+         if self._token is not None:
+             _current_trace_headers.reset(self._token)
+             self._token = None
+         if self._api_key_token is not None:
+             _current_api_key.reset(self._api_key_token)
+             self._api_key_token = None
+
+         # Notify backend
+         await self._eval_exit(error_msg)
+
+         # Print single eval result summary (unless suppressed for parallel evals)
+         self._print_single_result(error_msg)
+
+         return False
+
+     # =========================================================================
+     # Tool Call Instrumentation
+     # =========================================================================
+
+     async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
+         """Execute a tool with automatic telemetry recording.
+
+         Overrides Environment._execute_tool to record MCP spans for the eval context.
+         Instrumentation is disabled when connected to a remote HUD server (telemetry is
+         recorded server-side in that case).
+         """
+         # Skip instrumentation when connected to a remote hub - telemetry is handled server-side
+         if self._hub_config is not None:
+             return await super()._execute_tool(name, arguments)
+
+         # Skip instrumentation for v4 tasks with HUD MCP config (remote server)
+         if self._mcp_config is not None:
+             from hud.utils.mcp import _is_hud_server
+
+             for server_cfg in self._mcp_config.values():
+                 if isinstance(server_cfg, dict):
+                     url = server_cfg.get("url", "")
+                     if url and _is_hud_server(url):
+                         return await super()._execute_tool(name, arguments)
+
+         # For local environments, record MCP spans
+         return await self._execute_tool_instrumented(name, arguments)
+
+     @instrument(category="mcp")
+     async def _execute_tool_instrumented(
+         self, name: str, arguments: dict[str, Any]
+     ) -> MCPToolResult:
+         """Instrumented version of _execute_tool for local environments."""
+         return await super()._execute_tool(name, arguments)
+
+     def __repr__(self) -> str:
+         return f"EvalContext({self.trace_id[:8]}..., name={self.eval_name!r}, reward={self.reward})"
+
+     def _print_eval_link(self) -> None:
+         """Print a nicely formatted eval link."""
+         # Skip if link printing is suppressed (e.g., parallel child traces)
+         if self._suppress_link:
+             return
+
+         from hud.eval.display import print_link
+
+         trace_url = f"https://hud.ai/trace/{self.trace_id}"
+         print_link(trace_url, "🔗 Eval Started")
+
+     def _print_single_result(self, error_msg: str | None) -> None:
+         """Print a single eval result summary."""
+         # Skip if link printing is suppressed (e.g., parallel child traces)
+         if self._suppress_link:
+             return
+
+         from hud.eval.display import print_single_result
+
+         print_single_result(
+             trace_id=self.trace_id,
+             name=self.eval_name,
+             reward=self.reward,
+             error=error_msg,
+         )
+
+
+ # Re-export for backwards compatibility with trace module
+ __all__ = [
+     "EvalContext",
+     "get_current_api_key",
+     "get_current_trace_headers",
+     "get_current_trace_id",
+ ]
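
For orientation, here is a minimal usage sketch assembled from the module's own docstrings; it is not part of the diff. It assumes `hud.eval()` is exported at the package level, as the `EvalContext` docstring's examples show. Backend reporting no-ops unless telemetry is enabled and an API key is configured, since `_eval_enter`/`_eval_exit` return early otherwise.

```python
# Sketch only: `hud.eval()` at package level is inferred from the
# EvalContext docstring examples, not confirmed elsewhere in this diff.
import asyncio

import hud


async def main() -> None:
    # Blank eval: no task/scenario bound, so the reward is set manually
    # (mirrors the "Blank eval" example in the class docstring).
    async with hud.eval() as ctx:
        ctx.metadata["run_kind"] = "smoke-test"  # arbitrary user metadata
        ctx.reward = 1.0  # reported to the backend in __aexit__

        # While the context is open, module-level helpers resolve the
        # active trace via the contextvars set in __aenter__:
        from hud.eval.context import get_current_trace_id

        assert get_current_trace_id() == ctx.trace_id


asyncio.run(main())
```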