hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/eval/display.py ADDED
@@ -0,0 +1,299 @@
1
+ """Display helpers for eval links, job URLs, and result statistics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import webbrowser
7
+ from statistics import mean, pstdev
8
+ from typing import Any
9
+
10
+ from hud.settings import settings
11
+
12
+
13
def print_link(url: str, title: str, *, open_browser: bool = True) -> None:
    """Show ``url`` inside a titled panel, optionally opening it in a browser.

    Nothing is printed or opened unless telemetry is enabled and an API key
    is configured in ``settings``.

    Args:
        url: Link to display (and to open when ``open_browser`` is true).
        title: Title rendered on the panel border.
        open_browser: When true, try to open ``url`` in a new browser tab.
    """
    if not settings.telemetry_enabled or not settings.api_key:
        return

    if open_browser:
        # Best effort: any failure to launch a browser is deliberately ignored.
        with contextlib.suppress(Exception):
            webbrowser.open(url, new=2)

    try:
        from rich.align import Align
        from rich.console import Console
        from rich.panel import Panel

        link_style = "bold underline rgb(108,113,196)"
        centered_link = Align.center(f"[{link_style}][link={url}]{url}[/link][/{link_style}]")
        Console().print(
            Panel(
                centered_link,
                title=title,
                border_style="rgb(192,150,12)",
                padding=(0, 2),
            )
        )
    except ImportError:
        # rich is unavailable: fall back to a plain one-line message.
        print(f"{title}: {url}")  # noqa: T201
39
+
40
+
41
def print_complete(url: str, name: str, *, error: bool = False) -> None:
    """Print a completion message with a link to the results.

    Nothing is printed unless telemetry is enabled and an API key is
    configured in ``settings``.

    Args:
        url: Link shown after the status message.
        name: Name of the finished run, shown in quotes.
        error: When true, report the run as failed instead of complete.
    """
    if not (settings.telemetry_enabled and settings.api_key):
        return

    try:
        from rich.console import Console
        from rich.markup import escape

        console = Console()
        # ``name`` is free text: escape it so brackets such as "task[0]" or
        # "[/x]" are printed literally instead of being parsed as Rich markup
        # (an unmatched closing tag would otherwise raise MarkupError).
        safe_name = escape(name)
        if error:
            console.print(
                f"\n[red]✗ '{safe_name}' failed![/red] [dim]View details at:[/dim] "
                f"[bold link={url}]{url}[/bold link]\n"
            )
        else:
            console.print(
                f"\n[green]✓ '{safe_name}' complete![/green] [dim]View results at:[/dim] "
                f"[bold link={url}]{url}[/bold link]\n"
            )
    except ImportError:
        # rich is unavailable: plain-text fallback (no markup, so no escaping).
        status = "failed" if error else "complete"
        print(f"\n{name} {status}: {url}\n")  # noqa: T201
63
+
64
+
65
def print_single_result(
    trace_id: str,
    name: str,
    *,
    reward: float | None = None,
    error: str | None = None,
) -> None:
    """Print a single eval result summary.

    Nothing is printed unless telemetry is enabled and an API key is
    configured in ``settings``.

    Args:
        trace_id: Trace identifier used to build the ``https://hud.ai/trace/...`` link.
        name: Name of the eval, shown in quotes.
        reward: Reward to display; ``None`` is rendered as a dash.
        error: Error text; when truthy the eval is reported as failed and the
            first 80 characters of the text are shown.
    """
    if not (settings.telemetry_enabled and settings.api_key):
        return

    url = f"https://hud.ai/trace/{trace_id}"

    try:
        from rich.console import Console
        from rich.markup import escape

        console = Console()
        # ``name`` and ``error`` are free text. Escape them so bracketed
        # fragments (e.g. "list[int]" or "[/tool]") are shown literally rather
        # than being consumed as Rich markup or raising MarkupError.
        safe_name = escape(name)

        if error:
            # Truncate first, then escape, so an escape backslash is never cut off.
            shown_error = escape(f"{error[:80]}{'...' if len(error) > 80 else ''}")
            console.print(
                f"\n[red]✗ '{safe_name}' failed![/red]\n"
                f" [dim]Error:[/dim] [red]{shown_error}[/red]\n"
                f" [dim]View at:[/dim] [bold link={url}]{url}[/bold link]\n"
            )
        else:
            reward_str = f"{reward:.3f}" if reward is not None else "—"
            # Rewards above 0.7 are highlighted as a success.
            reward_color = "green" if reward is not None and reward > 0.7 else "yellow"
            console.print(
                f"\n[green]✓ '{safe_name}' complete![/green]\n"
                f" [dim]Reward:[/dim] [{reward_color}]{reward_str}[/{reward_color}]\n"
                f" [dim]View at:[/dim] [bold link={url}]{url}[/bold link]\n"
            )
    except ImportError:
        # rich is unavailable: plain-text fallback.
        status = "failed" if error else "complete"
        reward_str = f", reward={reward:.3f}" if reward is not None else ""
        print(f"\n{name} {status}{reward_str}: {url}\n")  # noqa: T201
101
+
102
+
103
def display_results(
    results: list[Any],
    *,
    tasks: list[Any] | None = None,
    name: str = "",
    elapsed: float | None = None,
    show_details: bool = True,
) -> None:
    """Display evaluation results as a summary and an optional details table.

    Falls back to a plain-text summary when ``rich`` is not installed.

    Args:
        results: List of EvalContext objects from hud.eval(); ``None`` entries
            are skipped.
        tasks: Optional list of Task objects (for task info in table)
        name: Optional name for the evaluation
        elapsed: Optional elapsed time in seconds
        show_details: Whether to show per-eval details table (only rendered
            for at most 50 results)
    """
    if not results:
        print("No results to display")  # noqa: T201
        return

    try:
        from rich.console import Console
        from rich.markup import escape
        from rich.table import Table

        console = Console()
    except ImportError:
        _display_basic(results, name, elapsed)
        return

    # Extract stats from results (EvalContext objects)
    # Use 'or 0' to handle None rewards (scenario failed before returning a reward)
    rewards = [getattr(r, "reward", 0) or 0 for r in results if r is not None]
    errors = [r for r in results if r is not None and getattr(r, "error", None)]
    # 'or 0' also covers a duration attribute that exists but is None, which
    # would otherwise raise TypeError on the '> 0' comparison.
    durations = [d for d in (getattr(r, "duration", 0) or 0 for r in results) if d > 0]

    if not rewards:
        console.print("[yellow]No valid results[/yellow]")
        return

    mean_reward = mean(rewards)
    std_reward = pstdev(rewards) if len(rewards) > 1 else 0.0
    success_count = sum(1 for r in rewards if r > 0.7)
    success_rate = success_count / len(results)

    # Print summary. Free text is escaped so brackets are not parsed as markup.
    title = f"📊 '{escape(name)}' Results" if name else "📊 Evaluation Complete"
    console.print(f"\n[bold]{title}[/bold]")
    console.print(f" [dim]Evals:[/dim] {len(results)}")
    if elapsed:
        rate = len(results) / elapsed if elapsed > 0 else 0
        console.print(f" [dim]Time:[/dim] {elapsed:.1f}s ({rate:.1f}/s)")
    if durations:
        console.print(f" [dim]Avg duration:[/dim] {mean(durations):.2f}s")
    console.print(f" [dim]Mean reward:[/dim] [green]{mean_reward:.3f}[/green] ± {std_reward:.3f}")
    console.print(f" [dim]Success rate:[/dim] [yellow]{success_rate * 100:.1f}%[/yellow]")
    if errors:
        console.print(f" [dim]Errors:[/dim] [red]{len(errors)}[/red]")

    # Details table
    if show_details and len(results) <= 50:
        table = Table(title="Details", show_header=True, header_style="bold")
        table.add_column("#", style="dim", justify="right", width=4)

        # Check if we have variants (grouped parallel runs)
        has_variants = any(getattr(r, "variants", None) for r in results if r)
        has_prompts = any(getattr(r, "prompt", None) for r in results if r)
        has_answers = any(getattr(r, "answer", None) for r in results if r)

        if has_variants:
            table.add_column("Variants", style="cyan", max_width=30)
        elif tasks:
            table.add_column("Task", style="cyan", max_width=30)

        if has_prompts:
            table.add_column("Prompt", style="dim", max_width=35)

        if has_answers:
            table.add_column("Answer", style="dim", max_width=35)

        table.add_column("Reward", justify="right", style="green", width=8)
        if durations:
            table.add_column("Time", justify="right", width=8)
        table.add_column("", justify="center", width=3)  # Status icon

        for i, r in enumerate(results):
            if r is None:
                continue

            idx = getattr(r, "index", i)
            reward = getattr(r, "reward", None)
            error = getattr(r, "error", None)
            duration = getattr(r, "duration", 0) or 0
            variants = getattr(r, "variants", None)
            prompt = getattr(r, "prompt", None)
            answer = getattr(r, "answer", None)

            # Status icon
            if error:
                status = "[red]✗[/red]"
            elif reward is not None and reward > 0.7:
                status = "[green]✓[/green]"
            else:
                status = "[yellow]○[/yellow]"

            row = [str(idx)]

            # Variant or task column. Every declared column must get exactly
            # one cell per row; otherwise the remaining cells shift left.
            if has_variants:
                row.append(escape(_format_variants(variants)))
            elif tasks:
                # More results than tasks: fall back to the generic label
                # instead of leaving the Task cell out of the row.
                task = tasks[i] if i < len(tasks) else None
                row.append(escape(str(_get_task_label(task, i))[:30]))

            # Prompt column
            if has_prompts:
                row.append(escape(_truncate(prompt, 35)))

            # Answer column
            if has_answers:
                row.append(escape(_truncate(answer, 35)))

            # Reward
            row.append(f"{reward:.3f}" if reward is not None else "—")

            # Duration
            if durations:
                row.append(f"{duration:.1f}s" if duration > 0 else "—")

            row.append(status)
            table.add_row(*row)

        console.print(table)

    # Variance warning
    if std_reward > 0.3:
        console.print(f"\n[yellow]⚠️ High variance (std={std_reward:.3f})[/yellow]")

    console.print()
243
+
244
+
245
+ def _display_basic(results: list[Any], name: str, elapsed: float | None) -> None:
246
+ """Fallback display without rich."""
247
+ rewards = [getattr(r, "reward", 0) for r in results if r is not None]
248
+ title = f"'{name}' Results" if name else "Eval Results"
249
+ print(f"\n{title}") # noqa: T201
250
+ print(f" Evals: {len(results)}") # noqa: T201
251
+ if elapsed:
252
+ print(f" Time: {elapsed:.1f}s") # noqa: T201
253
+ if rewards:
254
+ print(f" Mean reward: {mean(rewards):.3f}") # noqa: T201
255
+ print() # noqa: T201
256
+
257
+
258
+ def _format_variants(variants: dict[str, Any] | None) -> str:
259
+ """Format variants dict for display."""
260
+ if not variants:
261
+ return "-"
262
+ parts = [f"{k}={v}" for k, v in variants.items()]
263
+ result = ", ".join(parts)
264
+ return result[:28] + ".." if len(result) > 30 else result
265
+
266
+
267
+ def _truncate(text: str | None, max_len: int) -> str:
268
+ """Truncate text to max length."""
269
+ if not text:
270
+ return "-"
271
+ text = text.replace("\n", " ").strip()
272
+ return text[: max_len - 2] + ".." if len(text) > max_len else text
273
+
274
+
275
+ def _get_task_label(task: Any, index: int) -> str:
276
+ """Get a display label for a task."""
277
+ if task is None:
278
+ return f"task_{index}"
279
+ if isinstance(task, dict):
280
+ return task.get("id") or task.get("prompt", "")[:25] or f"task_{index}"
281
+ task_id = getattr(task, "id", None)
282
+ if task_id:
283
+ return task_id
284
+ prompt = getattr(task, "prompt", None) or getattr(task, "scenario", None)
285
+ if prompt:
286
+ return prompt[:25]
287
+ return f"task_{index}"
288
+
289
+
290
# Older name for display_results, kept so existing callers keep working.
print_eval_stats = display_results

# Public names of this module.
__all__ = [
    "display_results",
    "print_complete",
    "print_eval_stats",
    "print_link",
    "print_single_result",
]
hud/eval/instrument.py ADDED
@@ -0,0 +1,187 @@
1
+ """Auto-instrumentation for httpx and aiohttp to inject trace headers.
2
+
3
+ This module patches HTTP clients to automatically add:
4
+ - Trace-Id headers when inside an eval context
5
+ - Authorization headers for HUD API calls
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from typing import TYPE_CHECKING, Any
12
+ from urllib.parse import urlparse
13
+
14
+ if TYPE_CHECKING:
15
+ from types import SimpleNamespace
16
+
17
+ from hud.settings import settings
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def _get_trace_headers() -> dict[str, str] | None:
    """Return whatever ``get_current_trace_headers()`` reports.

    The import is done at call time to avoid a circular dependency with
    ``hud.eval.context``.
    """
    from hud.eval.context import get_current_trace_headers

    trace_headers = get_current_trace_headers()
    return trace_headers
27
+
28
+
29
def _get_api_key() -> str | None:
    """Resolve the API key to use for HUD requests.

    The key from the eval context (set by hud.eval(api_key=...)) wins when it
    is set; otherwise the key from settings (env var HUD_API_KEY) is used.
    """
    from hud.eval.context import get_current_api_key

    context_key = get_current_api_key()
    if context_key:
        return context_key
    return settings.api_key
38
+
39
+
40
+ def _is_hud_url(url_str: str) -> bool:
41
+ """Check if URL is a HUD service (inference or MCP)."""
42
+ parsed = urlparse(url_str)
43
+ request_host = parsed.netloc or url_str.split("/")[0]
44
+
45
+ # Check for known HUD domains (works for any subdomain)
46
+ if request_host.endswith((".hud.ai", ".hud.so")):
47
+ return True
48
+
49
+ # Also check settings URLs
50
+ known_hosts = {
51
+ urlparse(settings.hud_gateway_url).netloc,
52
+ urlparse(settings.hud_mcp_url).netloc,
53
+ }
54
+ return request_host in known_hosts
55
+
56
+
57
def _httpx_request_hook(request: Any) -> None:
    """httpx request hook that decorates requests going to HUD services.

    Requests to any other host are left untouched. For HUD requests:
    - each trace header is added unless a header with the same name
      (compared case-insensitively) is already present
    - an ``Authorization: Bearer <key>`` header is set when an API key is
      available and the request has no usable Authorization header
    """
    target = str(request.url)
    if not _is_hud_url(target):
        return

    # Inject trace headers if in trace context
    trace_headers = _get_trace_headers()
    if trace_headers is not None:
        for header_name, header_value in trace_headers.items():
            wanted = header_name.lower()
            already_set = any(existing.lower() == wanted for existing in request.headers)
            if not already_set:
                request.headers[header_name] = header_value
        logger.debug("Added trace headers to request: %s", target)

    # Auto-inject API key (prefer contextvar, fallback to settings)
    api_key = _get_api_key()
    if not api_key:
        return

    current_auth = request.headers.get("Authorization", "")
    # Values left behind when a client was built without a real key.
    placeholder_values = ("Bearer None", "Bearer null", "Bearer ")
    if current_auth and current_auth not in placeholder_values:
        return

    request.headers["Authorization"] = f"Bearer {api_key}"
    logger.debug("Added API key auth to request: %s", target)
84
+
85
+
86
async def _async_httpx_request_hook(request: Any) -> None:
    """Coroutine wrapper around ``_httpx_request_hook``.

    ``_instrument_httpx_client`` installs this variant on clients that expose
    ``aclose``; it simply delegates to the synchronous hook.
    """
    _httpx_request_hook(request)
89
+
90
+
91
def _instrument_httpx_client(client: Any) -> None:
    """Register the HUD request hook on one httpx client instance.

    Clients exposing ``aclose`` get the async hook, all others the sync hook.
    A hook that is already registered is not added a second time.
    """
    if hasattr(client, "aclose"):
        hook = _async_httpx_request_hook
    else:
        hook = _httpx_request_hook

    request_hooks = client.event_hooks.get("request", [])
    if hook in request_hooks:
        return

    request_hooks.append(hook)
    client.event_hooks["request"] = request_hooks
100
+
101
+
102
def _patch_httpx() -> None:
    """Monkey-patch httpx to auto-instrument all clients.

    Wraps ``httpx.AsyncClient.__init__`` and ``httpx.Client.__init__`` so that
    every newly created client gets the HUD request hook. Does nothing when
    httpx is not installed. Calling this more than once is safe: a class whose
    ``__init__`` is already wrapped is left alone.
    """
    import functools

    try:
        import httpx
    except ImportError:
        logger.debug("httpx not installed, skipping auto-instrumentation")
        return

    def _wrap_init(client_cls: Any) -> None:
        """Replace ``client_cls.__init__`` with an instrumenting wrapper."""
        original_init = client_cls.__init__
        # The marker makes the patch idempotent, so repeated calls do not
        # stack wrapper upon wrapper around the same __init__.
        if getattr(original_init, "_hud_instrumented", False):
            return

        # wraps() keeps the original name, docstring and signature visible
        # to introspection tools.
        @functools.wraps(original_init)
        def _patched_init(self: Any, *args: Any, **kwargs: Any) -> None:
            original_init(self, *args, **kwargs)
            _instrument_httpx_client(self)

        _patched_init._hud_instrumented = True  # type: ignore[attr-defined]
        client_cls.__init__ = _patched_init  # type: ignore[method-assign]

    _wrap_init(httpx.AsyncClient)
    _wrap_init(httpx.Client)

    logger.debug("httpx auto-instrumentation enabled")
127
+
128
+
129
def _patch_aiohttp() -> None:
    """
    Monkey-patch aiohttp to auto-instrument all ClientSession instances.
    This is important for the Gemini client in particular, which uses aiohttp by default.

    Every new ``ClientSession`` gets a shared ``TraceConfig`` whose
    ``on_request_start`` callback adds trace and auth headers to HUD requests.
    Does nothing when aiohttp is not installed. Calling this more than once is
    safe: an already wrapped ``__init__`` is left alone.
    """
    import functools

    try:
        import aiohttp
    except ImportError:
        logger.debug("aiohttp not installed, skipping auto-instrumentation")
        return

    _original_init = aiohttp.ClientSession.__init__
    # Idempotence guard: without it a second call would add another
    # TraceConfig to every session and wrap __init__ once more.
    if getattr(_original_init, "_hud_instrumented", False):
        return

    async def on_request_start(
        _session: aiohttp.ClientSession,
        _trace_config_ctx: SimpleNamespace,
        params: aiohttp.TraceRequestStartParams,
    ) -> None:
        """aiohttp trace hook that adds trace headers and auth to HUD requests."""
        url_str = str(params.url)
        if not _is_hud_url(url_str):
            return

        trace_headers = _get_trace_headers()
        if trace_headers is not None:
            for key, value in trace_headers.items():
                # Do not overwrite a header the caller already set (any casing).
                if key.lower() not in {k.lower() for k in params.headers}:
                    params.headers[key] = value
            logger.debug("Added trace headers to aiohttp request: %s", url_str)

        api_key = _get_api_key()
        if api_key:
            existing_auth = params.headers.get("Authorization", "")
            # Override if no auth, empty auth, or invalid "Bearer None"
            if not existing_auth or existing_auth in ("Bearer None", "Bearer null", "Bearer "):
                params.headers["Authorization"] = f"Bearer {api_key}"
                logger.debug("Added API key auth to aiohttp request: %s", url_str)

    trace_config = aiohttp.TraceConfig()
    trace_config.on_request_start.append(on_request_start)

    # wraps() keeps the original name, docstring and signature visible to
    # introspection tools.
    @functools.wraps(_original_init)
    def _patched_init(self: aiohttp.ClientSession, *args: Any, **kwargs: Any) -> None:
        # Keep any trace configs the caller supplied and add ours once.
        existing_traces = kwargs.get("trace_configs") or []
        if trace_config not in existing_traces:
            existing_traces = [*existing_traces, trace_config]
        kwargs["trace_configs"] = existing_traces
        _original_init(self, *args, **kwargs)

    _patched_init._hud_instrumented = True  # type: ignore[attr-defined]
    aiohttp.ClientSession.__init__ = _patched_init  # type: ignore[method-assign]

    logger.debug("aiohttp auto-instrumentation enabled")
180
+
181
+
182
# Importing this module is what installs the patches: both calls run at import time.
_patch_httpx()
_patch_aiohttp()


# The patch functions are exported so callers can invoke them explicitly.
__all__ = ["_patch_aiohttp", "_patch_httpx"]