hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/eval/context.py ADDED
@@ -0,0 +1,674 @@
+"""EvalContext - Environment with evaluation tracking.
+
+EvalContext IS an Environment, with additional evaluation tracking
+capabilities (trace_id, reward, backend reporting).
+
+This makes `async with env.eval("task") as env` natural - you get
+a full Environment that you can call tools on directly.
+"""
+
+from __future__ import annotations
+
+import contextvars
+import logging
+import uuid
+from typing import TYPE_CHECKING, Any, Self
+
+from hud.environment import Environment
+from hud.settings import settings
+from hud.shared import make_request
+from hud.telemetry import flush, instrument
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+    from hud.eval.task import Task
+    from hud.types import MCPToolResult
+
+
+from hud.eval.types import EvalExitPayload, EvalPayload, ParallelEvalComplete
+
+logger = logging.getLogger(__name__)
+
+# Contextvar to store current trace headers (for httpx auto-instrumentation)
+_current_trace_headers: contextvars.ContextVar[dict[str, str] | None] = contextvars.ContextVar(
+    "current_trace_headers", default=None
+)
+
+# Contextvar to store current api_key override (for telemetry exporter)
+_current_api_key: contextvars.ContextVar[str | None] = contextvars.ContextVar(
+    "current_api_key", default=None
+)
+
+
+def get_current_trace_headers() -> dict[str, str] | None:
+    """Get the current trace headers from context."""
+    return _current_trace_headers.get()
+
+
+def get_current_trace_id() -> str | None:
+    """Get the current trace ID (task_run_id) from context.
+
+    Returns the Trace-Id if inside an eval context, None otherwise.
+    Used by @instrument decorator to know where to send telemetry.
+    """
+    headers = _current_trace_headers.get()
+    if headers:
+        return headers.get("Trace-Id")
+    return None
+
+
+def get_current_api_key() -> str | None:
+    """Get the current API key override from context.
+
+    Returns the api_key if one was passed to hud.eval(), otherwise None.
+    Falls back to settings.api_key if not in an eval context.
+    Used by telemetry exporter for uploads.
+    """
+    return _current_api_key.get()
+
+
+# =============================================================================
+# EvalContext
+# =============================================================================
+
+
+class EvalContext(Environment):
+    """Environment with evaluation tracking capabilities.
+
+    Attributes:
+        trace_id: Unique identifier for this evaluation
+        eval_name: Task/evaluation name (separate from env name)
+        job_id: Links to parent job (auto-detected from hud.job() context)
+        group_id: Links parallel evaluations together
+        variants: Variant assignment dict (for A/B testing)
+        reward: Reward value (user-settable)
+        error: Exception if failed
+        results: All eval results (populated for parallel execution, empty for single)
+        task: Task definition (if loaded from slug)
+
+    Example:
+        ```python
+        # With task (scenario sets reward automatically)
+        tasks = load_tasks("my-org/task:1")
+        async with hud.eval(tasks) as ctx:
+            await agent.run(ctx)
+        # reward set by scenario evaluate phase in __aexit__
+
+        # Blank eval (manual reward)
+        async with hud.eval() as ctx:
+            ctx.reward = compute_reward()
+        ```
+    """
+
+    def __init__(
+        self,
+        name: str = "eval",
+        *,
+        trace_id: str | None = None,
+        api_key: str | None = None,
+        job_id: str | None = None,
+        group_id: str | None = None,
+        index: int = 0,
+        variants: dict[str, Any] | None = None,
+        code_snippet: str | None = None,
+        trace: bool = True,
+        quiet: bool = False,
+        **env_kwargs: Any,
+    ) -> None:
+        """Initialize EvalContext.
+
+        Args:
+            name: Environment/evaluation name
+            trace_id: Unique trace ID (auto-generated if not provided)
+            api_key: API key for backend calls
+            job_id: Job ID to link to (auto-detected if not provided)
+            group_id: Group ID for parallel evaluations
+            index: Index in parallel execution
+            variants: Variant assignment for A/B testing
+            code_snippet: Code being evaluated (for reproducibility)
+            trace: Whether to send trace data to backend (default True)
+            quiet: Whether to suppress printing links (default False)
+            **env_kwargs: Additional kwargs passed to Environment.__init__
+        """
+        # Initialize Environment
+        super().__init__(name=name, **env_kwargs)
+
+        # === Evaluation tracking (not in Environment) ===
+
+        # Identity
+        self.trace_id: str = trace_id or str(uuid.uuid4())
+        self.eval_name: str = name  # Separate from self.name for clarity
+
+        # Job linkage
+        self.job_id: str | None = job_id
+
+        self.group_id: str | None = group_id
+        self.index: int = index
+
+        # Variant assignment
+        self.variants: dict[str, Any] = variants or {}
+
+        # User-settable (per-run values, override Environment defaults)
+        self.prompt: str | None = None  # From scenario setup or task
+        self.reward: float | None = None
+        self.answer: str | None = None  # Agent's submitted answer
+        self.system_prompt: str | None = None  # From task.agent_config, passed to agent
+
+        # Error tracking
+        self.error: BaseException | None = None
+
+        # User metadata (arbitrary key-value pairs)
+        self.metadata: dict[str, Any] = {}
+
+        # Parallel results (empty list for single evals, populated for parallel)
+        self.results: list[EvalContext] = []
+
+        # Code snippet for reproducibility
+        self.code_snippet: str | None = code_snippet
+
+        # Private state for eval tracking
+        self._eval_api_key = api_key
+        self._token: contextvars.Token[dict[str, str] | None] | None = None
+        self._api_key_token: contextvars.Token[str | None] | None = None
+        self._is_summary: bool = False  # True for summary contexts (skip trace)
+        self._suppress_link: bool = quiet  # True to suppress printing eval link
+        self._trace_enabled: bool = trace  # Whether to send trace data to backend
+        self._source_env_name: str | None = None  # Source env name for remote lookups
+        self._task: Task | None = None  # Task config (set by from_task)
+
+    @classmethod
+    def from_environment(
+        cls,
+        env: Environment,
+        name: str,
+        *,
+        trace_id: str | None = None,
+        api_key: str | None = None,
+        job_id: str | None = None,
+        group_id: str | None = None,
+        index: int = 0,
+        variants: dict[str, Any] | None = None,
+        code_snippet: str | None = None,
+        trace: bool = True,
+        quiet: bool = False,
+    ) -> EvalContext:
+        """Create an EvalContext that copies configuration from an existing Environment.
+
+        This creates a new EvalContext with the same connections as the parent.
+        Used by env.eval() to create evaluation contexts.
+
+        Args:
+            env: Parent environment to copy from
+            name: Evaluation name
+            trace_id: Unique trace ID
+            api_key: API key for backend calls
+            job_id: Job ID to link to
+            group_id: Group ID for parallel evaluations
+            index: Index in parallel execution
+            variants: Variant assignment
+            code_snippet: Code being evaluated
+        """
+        ctx = cls(
+            name=name,
+            trace_id=trace_id,
+            api_key=api_key,
+            job_id=job_id,
+            group_id=group_id,
+            index=index,
+            variants=variants,
+            code_snippet=code_snippet,
+            trace=trace,
+            quiet=quiet,
+        )
+
+        # Copy connections from parent - each connector is copied so parallel
+        # execution gets fresh client instances
+        ctx._connections = {name: connector.copy() for name, connector in env._connections.items()}
+
+        # Note: Auth is injected at request time by httpx/aiohttp hooks in hud.eval.instrument
+        # using the contextvar set in __aenter__ (supports api_key passed to hud.eval())
+        ctx._setup_calls = env._setup_calls.copy()
+        ctx._evaluate_calls = env._evaluate_calls.copy()
+
+        # Copy scenarios (definitions) by reference - they don't change
+        ctx._scenarios = getattr(env, "_scenarios", {})
+        # Create fresh session state for this eval (parallel evals each need their own)
+        ctx._scenario_sessions = {}
+        ctx._scenario_latest = {}
+        ctx._scenario_answers = {}
+
+        # Store source env name for remote scenario lookups
+        ctx._source_env_name = env.name
+
+        # Copy managers by reference (they hold local tools, prompts, resources)
+        # This allows ctx.call_tool(), ctx.get_prompt(), ctx.read_resource() to work
+        # for locally defined tools/scenarios
+        ctx._tool_manager = env._tool_manager
+        ctx._prompt_manager = env._prompt_manager
+        ctx._resource_manager = env._resource_manager
+
+        # Copy prompt
+        if env.prompt:
+            ctx.prompt = env.prompt
+
+        # Copy agent-level tool filters (allowed_tools/disallowed_tools)
+        ctx._agent_include = getattr(env, "_agent_include", None)
+        ctx._agent_exclude = getattr(env, "_agent_exclude", None)
+
+        # Copy router's conflict resolution strategy
+        ctx._router.conflict_resolution = env._router.conflict_resolution
+
+        # Copy mock mode settings (for testing)
+        ctx._mock_mode = getattr(env, "_mock_mode", False)
+        ctx._mock_outputs = getattr(env, "_mock_outputs", {}).copy()
+        ctx._mock_tool_schemas = getattr(env, "_mock_tool_schemas", {}).copy()
+
+        # Copy hub config (needed to detect remote hub for telemetry)
+        ctx._hub_config = getattr(env, "_hub_config", None)
+
+        # Copy mcp config (needed to detect remote HUD MCP for telemetry)
+        ctx._mcp_config = getattr(env, "_mcp_config", None)
+
+        return ctx
+
+    @classmethod
+    def from_task(
+        cls,
+        task: Task,
+        *,
+        name: str | None = None,
+        trace_id: str | None = None,
+        api_key: str | None = None,
+        job_id: str | None = None,
+        group_id: str | None = None,
+        index: int = 0,
+        variants: dict[str, Any] | None = None,
+        code_snippet: str | None = None,
+        trace: bool = True,
+        quiet: bool = False,
+    ) -> EvalContext:
+        """Create an EvalContext from a Task config.
+
+        Args:
+            task: Task config (env, scenario, args)
+            name: Override for eval/trace name (defaults to task scenario/args)
+            trace_id: Unique trace ID
+            api_key: API key for backend calls
+            job_id: Job ID to link to
+            group_id: Group ID for parallel evaluations
+            index: Index in parallel execution
+            variants: Variant assignment
+            code_snippet: Code being evaluated
+            trace: Whether to send traces to backend
+            quiet: Whether to suppress output
+        """
+        from hud.environment import Environment
+        from hud.eval.task import build_eval_name
+
+        eval_name = name or build_eval_name(task.scenario, task.args)
+
+        # task.env is guaranteed to be Environment after Task.__post_init__
+        assert isinstance(task.env, Environment), "Task.env should be Environment"
+
+        ctx = cls.from_environment(
+            env=task.env,
+            name=eval_name,
+            trace_id=trace_id,
+            api_key=api_key,
+            job_id=job_id,
+            group_id=group_id,
+            index=index,
+            variants=variants,
+            code_snippet=code_snippet,
+            trace=trace,
+            quiet=quiet,
+        )
+
+        # Store task info for scenario execution
+        ctx._task = task
+
+        # Set system_prompt from task.agent_config
+        if task.agent_config:
+            if isinstance(task.agent_config, dict):
+                if task.agent_config.get("system_prompt"):
+                    ctx.system_prompt = task.agent_config["system_prompt"]
+            elif task.agent_config.system_prompt:
+                ctx.system_prompt = task.agent_config.system_prompt
+
+        return ctx
+
+    async def _run_task_scenario_setup(self) -> None:
+        """Run the task's scenario setup phase (if scenario provided)."""
+        if self._task is None or self._task.scenario is None:
+            return
+
+        prompt = await self.run_scenario_setup(self._task.scenario, self._task.args)
+        if prompt:
+            self.prompt = prompt
+
+    async def _run_task_scenario_evaluate(self) -> None:
+        """Run the task's scenario evaluate phase (if scenario provided)."""
+        if self._task is None or self._task.scenario is None:
+            return
+
+        reward = await self.run_scenario_evaluate(self._task.scenario)
+        if reward is not None:
+            self.reward = reward
+
+    # =========================================================================
+    # Summary Context - Attribute Access Control
+    # =========================================================================
+
+    # Attributes accessible on summary context (everything else raises ParallelEvalComplete)
+    _SUMMARY_ALLOWED = frozenset(
+        {
+            # Results and metadata
+            "results",
+            "reward",
+            "error",
+            "success",
+            # IDs
+            "trace_id",
+            "job_id",
+            "group_id",
+            "index",
+            # Private attrs
+            "_is_summary",
+            "_suppress_link",
+            "__class__",
+            "__dict__",
+        }
+    )
+
+    def __getattribute__(self, name: str) -> Any:
+        """Block most attribute access on summary contexts."""
+        # Always allow private/dunder and whitelisted attrs
+        if name.startswith("_") or name in EvalContext._SUMMARY_ALLOWED:
+            return super().__getattribute__(name)
+
+        # Check if this is a summary context
+        try:
+            is_summary = super().__getattribute__("_is_summary")
+        except AttributeError:
+            is_summary = False
+
+        if is_summary:
+            raise ParallelEvalComplete
+
+        return super().__getattribute__(name)
+
+    # =========================================================================
+    # Computed Properties (eval-specific)
+    # =========================================================================
+
+    @property
+    def headers(self) -> dict[str, str]:
+        """Headers for gateway integration."""
+        return {"Trace-Id": self.trace_id}
+
+    @property
+    def success(self) -> bool:
+        """True if no error occurred."""
+        return self.error is None
+
+    @property
+    def has_scenario(self) -> bool:
+        """True if a scenario is running and can accept submissions."""
+        return self._task is not None and self._task.scenario is not None
+
+    # =========================================================================
+    # Backend Integration
+    # =========================================================================
+
+    def _get_eval_api_key(self) -> str | None:
+        return self._eval_api_key or settings.api_key
+
+    def _build_base_payload(self) -> EvalPayload:
+        """Build the base payload for enter/exit."""
+        return EvalPayload(
+            prompt=self.prompt,
+            code_snippet=self.code_snippet,
+            job_id=self.job_id,
+            group_id=self.group_id,
+            variants=self.variants if self.variants else None,
+            # Only send task_version_id for v5 tasks (those with scenarios).
+            # v4 tasks have client-side IDs that shouldn't be sent to backend.
+            task_version_id=self._task.id if self._task and self._task.scenario else None,
+            metadata=self.metadata if self.metadata else None,
+        )
+
+    async def log(self, metrics: dict[str, Any]) -> None:
+        """Log metrics to the backend."""
+        api_key = self._get_eval_api_key()
+        if not settings.telemetry_enabled or not api_key:
+            return
+
+        try:
+            await make_request(
+                method="POST",
+                url=f"{settings.hud_telemetry_url}/traces/{self.trace_id}/log",
+                json={"metrics": metrics},
+                api_key=api_key,
+            )
+        except Exception as e:
+            logger.warning("Failed to log metrics: %s", e)
+
+    async def submit(self, answer: str) -> None:
+        """Submit the agent's answer for scenario evaluation.
+
+        Delegates to Environment.submit() with the current scenario name.
+        The answer will be passed to the scenario's evaluate phase via
+        `yield`, e.g.: `answer = yield "Do the task"`
+
+        Args:
+            answer: The agent's final answer/result to submit
+
+        Example:
+            async with env("checkout", product="laptop") as ctx:
+                response = await agent.run(ctx.prompt)
+                await ctx.submit(response)
+            # On exit, scenario's evaluate phase receives the answer
+        """
+        if not self._task or not self._task.scenario:
+            return
+
+        # Store answer on context for display
+        self.answer = answer
+
+        # Delegate to Environment.submit() which handles storage + broadcast
+        await super().submit(self._task.scenario, answer)
+
+    async def _eval_enter(self) -> None:
+        """Notify backend that eval has started."""
+        if not self._trace_enabled:
+            return
+        api_key = self._get_eval_api_key()
+        if not settings.telemetry_enabled or not api_key:
+            return
+
+        try:
+            payload = self._build_base_payload()
+            await make_request(
+                method="POST",
+                url=f"{settings.hud_api_url}/trace/{self.trace_id}/enter",
+                json=payload.model_dump(exclude_none=True),
+                api_key=api_key,
+            )
+        except Exception as e:
+            logger.warning("Failed to send eval enter: %s", e)
+
+    async def _eval_exit(self, error_message: str | None = None) -> None:
+        """Notify backend that eval has completed."""
+        if not self._trace_enabled:
+            return
+        api_key = self._get_eval_api_key()
+        if not settings.telemetry_enabled or not api_key:
+            return
+
+        try:
+            payload = EvalExitPayload(
+                **self._build_base_payload().model_dump(),
+                reward=self.reward,
+                success=self.success,
+                error_message=error_message,
+            )
+            await make_request(
+                method="POST",
+                url=f"{settings.hud_api_url}/trace/{self.trace_id}/exit",
+                json=payload.model_dump(exclude_none=True),
+                api_key=api_key,
+            )
+        except Exception as e:
+            logger.warning("Failed to send eval exit: %s", e)
+
+    # =========================================================================
+    # Context Manager (override Environment)
+    # =========================================================================
+
+    async def __aenter__(self) -> Self:
+        """Enter eval context - connect environment and set trace headers."""
+        if self._is_summary:
+            return self
+
+        # Start tracking
+        self._token = _current_trace_headers.set(self.headers)
+        self._api_key_token = _current_api_key.set(self._eval_api_key)
+
+        # Register trace first (environment connection can fail)
+        await self._eval_enter()
+
+        try:
+            # Connect environment (MCP servers, tools)
+            await super().__aenter__()
+
+            # Run task scenario setup (if created from_task with scenario)
+            await self._run_task_scenario_setup()
+            self._print_eval_link()
+        except BaseException as e:
+            # Cleanup if setup fails - __aexit__ won't be called automatically
+            await self.__aexit__(type(e), e, e.__traceback__)
+            raise
+
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> bool:
+        """Exit eval context - disconnect and report."""
+        # Summary contexts skip trace tracking (parallel results already tracked)
+        # Suppress ParallelEvalComplete - it's expected for skipping body re-execution
+        if self._is_summary:
+            return exc_type is ParallelEvalComplete
+
+        # Run task scenario evaluate (if no error and has scenario)
+        if exc_type is None:
+            await self._run_task_scenario_evaluate()
+
+        # Track error
+        error_msg: str | None = None
+        if exc_type is not None:
+            self.error = exc_val
+            error_msg = str(exc_val) if exc_val else "Unknown error"
+
+        # Flush any pending telemetry spans for this trace
+        flush(self.trace_id)
+
+        # Disconnect environment (parent class) - also runs evaluate tools
+        await super().__aexit__(exc_type, exc_val, exc_tb)
+
+        # Set reward from evaluate tools if not already set
+        if self.reward is None and hasattr(self, "_evaluate_reward"):
+            self.reward = self._evaluate_reward
+
+        # Reset context vars
+        if self._token is not None:
+            _current_trace_headers.reset(self._token)
+            self._token = None
+        if self._api_key_token is not None:
+            _current_api_key.reset(self._api_key_token)
+            self._api_key_token = None
+
+        # Notify backend
+        await self._eval_exit(error_msg)
+
+        # Print single eval result summary (unless suppressed for parallel evals)
+        self._print_single_result(error_msg)
+
+        return False
+
+    # =========================================================================
+    # Tool Call Instrumentation
+    # =========================================================================
+
+    async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
+        """Execute a tool with automatic telemetry recording.
+
+        Overrides Environment._execute_tool to record MCP spans for the eval context.
+        Instrumentation is disabled when connected to a remote HUD server (telemetry is
+        recorded server-side in that case).
+        """
+        # Skip instrumentation when connected to a remote hub - telemetry is handled server-side
+        if self._hub_config is not None:
+            return await super()._execute_tool(name, arguments)
+
+        # Skip instrumentation for v4 tasks with HUD MCP config (remote server)
+        if self._mcp_config is not None:
+            from hud.utils.mcp import _is_hud_server
+
+            for server_cfg in self._mcp_config.values():
+                if isinstance(server_cfg, dict):
+                    url = server_cfg.get("url", "")
+                    if url and _is_hud_server(url):
+                        return await super()._execute_tool(name, arguments)
+
+        # For local environments, record MCP spans
+        return await self._execute_tool_instrumented(name, arguments)
+
+    @instrument(category="mcp")
+    async def _execute_tool_instrumented(
+        self, name: str, arguments: dict[str, Any]
+    ) -> MCPToolResult:
+        """Instrumented version of _execute_tool for local environments."""
+        return await super()._execute_tool(name, arguments)
+
+    def __repr__(self) -> str:
+        return f"EvalContext({self.trace_id[:8]}..., name={self.eval_name!r}, reward={self.reward})"
+
+    def _print_eval_link(self) -> None:
+        """Print a nicely formatted eval link."""
+        # Skip if link printing is suppressed (e.g., parallel child traces)
+        if self._suppress_link:
+            return
+
+        from hud.eval.display import print_link
+
+        trace_url = f"https://hud.ai/trace/{self.trace_id}"
+        print_link(trace_url, "🔗 Eval Started")
+
+    def _print_single_result(self, error_msg: str | None) -> None:
+        """Print a single eval result summary."""
+        # Skip if link printing is suppressed (e.g., parallel child traces)
+        if self._suppress_link:
+            return
+
+        from hud.eval.display import print_single_result
+
+        print_single_result(
+            trace_id=self.trace_id,
+            name=self.eval_name,
+            reward=self.reward,
+            error=error_msg,
+        )


+# Re-export for backwards compatibility with trace module
+__all__ = [
+    "EvalContext",
+    "get_current_api_key",
+    "get_current_trace_headers",
+    "get_current_trace_id",
+]
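
Usage sketch (editor's note): the following is a minimal sketch assembled from the docstring examples in this file. `hud.eval(...)` and `load_tasks(...)` come from the class docstring and `submit()` docstring; the import path for `load_tasks` and the surrounding scaffolding are assumptions, not part of this diff.

```python
# Minimal sketch based on the docstring examples in hud/eval/context.py.
# The load_tasks import path is assumed (see hud/datasets/loader.py above).
import asyncio

import hud
from hud.datasets import load_tasks


async def main() -> None:
    # Task-based eval: scenario setup fills ctx.prompt, and the scenario's
    # evaluate phase sets ctx.reward when the context exits.
    tasks = load_tasks("my-org/task:1")
    async with hud.eval(tasks) as ctx:
        # ... run your agent against ctx here (ctx is a full Environment) ...
        await ctx.submit("final answer")  # handed to the scenario via `yield`
        await ctx.log({"steps": 1})  # optional metrics POST to the backend
    print(ctx.reward)  # set by _run_task_scenario_evaluate on exit

    # Blank eval: no task or scenario, so the reward is set manually.
    async with hud.eval() as ctx:
        ctx.reward = 1.0


asyncio.run(main())
```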
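Similarly, the headers exposed by `get_current_trace_headers()` are designed to be attached to outgoing HTTP requests by the hooks in `hud.eval.instrument` (not shown in this file). The sketch below illustrates that mechanism with a plain `httpx` event hook; it is not the actual instrumentation code.

```python
# Illustrative only: merges the contextvar-backed trace headers into
# outgoing requests, roughly what the hud.eval.instrument hooks do.
import httpx

from hud.eval.context import get_current_trace_headers


async def attach_trace_headers(request: httpx.Request) -> None:
    # Inside `async with hud.eval(...)` this returns {"Trace-Id": ...};
    # outside an eval context it returns None and nothing is added.
    headers = get_current_trace_headers()
    if headers:
        request.headers.update(headers)


client = httpx.AsyncClient(event_hooks={"request": [attach_trace_headers]})
```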