hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/eval/display.py ADDED
@@ -0,0 +1,299 @@
1
+ """Display helpers for eval links, job URLs, and result statistics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import webbrowser
7
+ from statistics import mean, pstdev
8
+ from typing import Any
9
+
10
+ from hud.settings import settings
11
+
12
+
13
def print_link(url: str, title: str, *, open_browser: bool = True) -> None:
    """Show ``url`` inside a titled panel and optionally open it in a browser.

    Nothing is printed or opened unless telemetry is enabled and an API key
    is configured.  When ``rich`` cannot be imported the link is written as a
    plain ``title: url`` line instead.

    Args:
        url: Address to display (and open).
        title: Panel title, also used as the plain-text prefix.
        open_browser: Whether to try opening ``url`` in a new browser tab.
    """
    if not settings.telemetry_enabled or not settings.api_key:
        return

    if open_browser:
        # Best effort only: a missing or broken browser must not abort the run.
        with contextlib.suppress(Exception):
            webbrowser.open(url, new=2)

    try:
        from rich.align import Align
        from rich.console import Console
        from rich.panel import Panel

        link_style = "bold underline rgb(108,113,196)"
        centered_link = Align.center(f"[{link_style}][link={url}]{url}[/link][/{link_style}]")
        Console().print(
            Panel(
                centered_link,
                title=title,
                border_style="rgb(192,150,12)",
                padding=(0, 2),
            )
        )
    except ImportError:
        print(f"{title}: {url}")  # noqa: T201
39
+
40
+
41
def print_complete(url: str, name: str, *, error: bool = False) -> None:
    """Announce that ``name`` finished, with a link to its results.

    Silent unless telemetry is enabled and an API key is configured.  Falls
    back to a plain ``print`` when ``rich`` cannot be imported.

    Args:
        url: Link shown after the status message.
        name: Display name of the finished run.
        error: When true, report a failure instead of a successful completion.
    """
    if not settings.telemetry_enabled or not settings.api_key:
        return

    try:
        from rich.console import Console

        if error:
            headline = f"\n[red]✗ '{name}' failed![/red] [dim]View details at:[/dim] "
        else:
            headline = f"\n[green]✓ '{name}' complete![/green] [dim]View results at:[/dim] "
        Console().print(headline + f"[bold link={url}]{url}[/bold link]\n")
    except ImportError:
        outcome = "failed" if error else "complete"
        print(f"\n{name} {outcome}: {url}\n")  # noqa: T201
63
+
64
+
65
def print_single_result(
    trace_id: str,
    name: str,
    *,
    reward: float | None = None,
    error: str | None = None,
) -> None:
    """Print a single eval result summary.

    Silent unless telemetry is enabled and an API key is configured.  The
    summary links to ``https://hud.ai/trace/<trace_id>``.  When ``rich``
    cannot be imported a plain one-line summary is printed instead.

    Args:
        trace_id: Identifier appended to the trace URL.
        name: Display name of the eval.
        reward: Final reward, if any; rendered green above 0.7, else yellow.
        error: Error message; when truthy the eval is reported as failed and
            at most the first 80 characters of the message are shown.
    """
    if not (settings.telemetry_enabled and settings.api_key):
        return

    url = f"https://hud.ai/trace/{trace_id}"

    try:
        from rich.console import Console
        from rich.markup import escape

        console = Console()
        # ``name`` and ``error`` are arbitrary text.  Escape them so square
        # brackets (e.g. "items[i]" or "[/x]") are printed literally instead
        # of being parsed as Rich markup tags.
        safe_name = escape(name)

        if error:
            # Truncate first, then escape, so an escape sequence is never cut.
            snippet = f"{error[:80]}{'...' if len(error) > 80 else ''}"
            console.print(
                f"\n[red]✗ '{safe_name}' failed![/red]\n"
                f" [dim]Error:[/dim] [red]{escape(snippet)}[/red]\n"
                f" [dim]View at:[/dim] [bold link={url}]{url}[/bold link]\n"
            )
        else:
            reward_str = f"{reward:.3f}" if reward is not None else "—"
            reward_color = "green" if reward is not None and reward > 0.7 else "yellow"
            console.print(
                f"\n[green]✓ '{safe_name}' complete![/green]\n"
                f" [dim]Reward:[/dim] [{reward_color}]{reward_str}[/{reward_color}]\n"
                f" [dim]View at:[/dim] [bold link={url}]{url}[/bold link]\n"
            )
    except ImportError:
        status = "failed" if error else "complete"
        reward_str = f", reward={reward:.3f}" if reward is not None else ""
        print(f"\n{name} {status}{reward_str}: {url}\n")  # noqa: T201
101
+
102
+
103
def display_results(
    results: list[Any],
    *,
    tasks: list[Any] | None = None,
    name: str = "",
    elapsed: float | None = None,
    show_details: bool = True,
) -> None:
    """Display evaluation results as a summary plus an optional details table.

    Result objects are read with ``getattr`` only (``reward``, ``error``,
    ``duration``, ``index``, ``variants``, ``prompt``, ``answer``), so any
    object exposing a subset of those attributes works; ``None`` entries are
    skipped.  A reward above 0.7 counts as a success.  When ``rich`` cannot
    be imported a plain-text fallback is used.

    Args:
        results: List of EvalContext objects from hud.eval()
        tasks: Optional list of Task objects (for task info in table)
        name: Optional name for the evaluation
        elapsed: Optional elapsed time in seconds
        show_details: Whether to show the per-eval details table (only
            rendered for at most 50 results)
    """
    if not results:
        print("No results to display")  # noqa: T201
        return

    try:
        from rich.console import Console
        from rich.table import Table

        console = Console()
    except ImportError:
        _display_basic(results, name, elapsed)
        return

    # Extract stats from results (EvalContext objects).
    # 'or 0' handles None rewards (scenario failed before returning a reward).
    rewards = [getattr(r, "reward", 0) or 0 for r in results if r is not None]
    errors = [r for r in results if r is not None and getattr(r, "error", None)]
    # A missing or None duration counts as "not measured" rather than crashing
    # the comparison below.
    durations = [d for d in (getattr(r, "duration", 0) or 0 for r in results) if d > 0]

    if not rewards:
        console.print("[yellow]No valid results[/yellow]")
        return

    mean_reward = mean(rewards)
    std_reward = pstdev(rewards) if len(rewards) > 1 else 0.0
    success_count = sum(1 for r in rewards if r > 0.7)
    # The denominator is every requested eval, so None results count as failures.
    success_rate = success_count / len(results)

    # Print summary
    title = f"📊 '{name}' Results" if name else "📊 Evaluation Complete"
    console.print(f"\n[bold]{title}[/bold]")
    console.print(f" [dim]Evals:[/dim] {len(results)}")
    if elapsed:
        rate = len(results) / elapsed if elapsed > 0 else 0
        console.print(f" [dim]Time:[/dim] {elapsed:.1f}s ({rate:.1f}/s)")
    if durations:
        console.print(f" [dim]Avg duration:[/dim] {mean(durations):.2f}s")
    console.print(f" [dim]Mean reward:[/dim] [green]{mean_reward:.3f}[/green] ± {std_reward:.3f}")
    console.print(f" [dim]Success rate:[/dim] [yellow]{success_rate * 100:.1f}%[/yellow]")
    if errors:
        console.print(f" [dim]Errors:[/dim] [red]{len(errors)}[/red]")

    # Details table
    if show_details and len(results) <= 50:
        table = Table(title="Details", show_header=True, header_style="bold")
        table.add_column("#", style="dim", justify="right", width=4)

        # Check if we have variants (grouped parallel runs)
        has_variants = any(getattr(r, "variants", None) for r in results if r)
        has_prompts = any(getattr(r, "prompt", None) for r in results if r)
        has_answers = any(getattr(r, "answer", None) for r in results if r)

        if has_variants:
            table.add_column("Variants", style="cyan", max_width=30)
        elif tasks:
            table.add_column("Task", style="cyan", max_width=30)

        if has_prompts:
            table.add_column("Prompt", style="dim", max_width=35)

        if has_answers:
            table.add_column("Answer", style="dim", max_width=35)

        table.add_column("Reward", justify="right", style="green", width=8)
        if durations:
            table.add_column("Time", justify="right", width=8)
        table.add_column("", justify="center", width=3)  # Status icon

        for i, r in enumerate(results):
            if r is None:
                continue

            idx = getattr(r, "index", i)
            reward = getattr(r, "reward", None)
            error = getattr(r, "error", None)
            duration = getattr(r, "duration", 0) or 0
            variants = getattr(r, "variants", None)
            prompt = getattr(r, "prompt", None)
            answer = getattr(r, "answer", None)

            # Status icon
            if error:
                status = "[red]✗[/red]"
            elif reward is not None and reward > 0.7:
                status = "[green]✓[/green]"
            else:
                status = "[yellow]○[/yellow]"

            row = [str(idx)]

            # Variant or task column.  A cell is appended whenever the column
            # exists, even if there are fewer tasks than results; otherwise
            # every following cell would shift one column to the left.
            if has_variants:
                row.append(_format_variants(variants))
            elif tasks:
                task = tasks[i] if i < len(tasks) else None
                row.append(str(_get_task_label(task, i))[:30])

            # Prompt column
            if has_prompts:
                row.append(_truncate(prompt, 35))

            # Answer column
            if has_answers:
                row.append(_truncate(answer, 35))

            # Reward
            row.append(f"{reward:.3f}" if reward is not None else "—")

            # Duration
            if durations:
                row.append(f"{duration:.1f}s" if duration > 0 else "—")

            row.append(status)
            table.add_row(*row)

        console.print(table)

    # Variance warning
    if std_reward > 0.3:
        console.print(f"\n[yellow]⚠️ High variance (std={std_reward:.3f})[/yellow]")

    console.print()
243
+
244
+
245
def _display_basic(results: list[Any], name: str, elapsed: float | None) -> None:
    """Fallback display without rich.

    Prints the title, the number of evals, the elapsed time (when truthy) and
    the mean reward using plain ``print``.

    Args:
        results: Result objects; ``None`` entries are ignored for the mean.
        name: Optional evaluation name used in the title.
        elapsed: Optional elapsed time in seconds.
    """
    # 'or 0' treats a None reward (no reward produced) as 0 so that mean()
    # does not raise TypeError on mixed None/float input.
    rewards = [getattr(r, "reward", 0) or 0 for r in results if r is not None]
    title = f"'{name}' Results" if name else "Eval Results"
    print(f"\n{title}")  # noqa: T201
    print(f" Evals: {len(results)}")  # noqa: T201
    if elapsed:
        print(f" Time: {elapsed:.1f}s")  # noqa: T201
    if rewards:
        print(f" Mean reward: {mean(rewards):.3f}")  # noqa: T201
    print()  # noqa: T201
256
+
257
+
258
def _format_variants(variants: dict[str, Any] | None) -> str:
    """Render a variants mapping as ``key=value`` pairs for a table cell.

    Returns ``"-"`` for a missing or empty mapping.  Text longer than 30
    characters is cut to its first 28 characters followed by ``".."``.
    """
    if not variants:
        return "-"
    joined = ", ".join(f"{key}={value}" for key, value in variants.items())
    if len(joined) <= 30:
        return joined
    return joined[:28] + ".."
265
+
266
+
267
def _truncate(text: str | None, max_len: int) -> str:
    """Flatten ``text`` to one line and clip it to ``max_len`` characters.

    Returns ``"-"`` for ``None`` or an empty string.  Newlines become spaces
    and surrounding whitespace is stripped before measuring; clipped text
    ends in ``".."`` and is exactly ``max_len`` characters long.
    """
    if not text:
        return "-"
    flattened = text.replace("\n", " ").strip()
    if len(flattened) > max_len:
        return flattened[: max_len - 2] + ".."
    return flattened
273
+
274
+
275
def _get_task_label(task: Any, index: int) -> str:
    """Get a display label for a task.

    Preference order: the task's ``id``, then the first 25 characters of its
    ``prompt`` (or, for non-dict tasks, its ``scenario``), then
    ``task_<index>``.  Dict tasks are read by key, other objects by attribute.

    Args:
        task: A task dict, a task-like object, or ``None``.
        index: Position of the task, used for the fallback label.

    Returns:
        Always a ``str``; ids of other types are converted with ``str()`` so
        callers can slice the label safely.
    """
    fallback = f"task_{index}"
    if task is None:
        return fallback

    if isinstance(task, dict):
        task_id = task.get("id")
        # The key may be present with a None value, so do not slice it blindly.
        text = task.get("prompt")
    else:
        task_id = getattr(task, "id", None)
        text = getattr(task, "prompt", None) or getattr(task, "scenario", None)

    if task_id:
        return str(task_id)
    if text:
        return str(text)[:25]
    return fallback
288
+
289
+
290
+ # Backwards compatibility alias
291
+ print_eval_stats = display_results
292
+
293
+ __all__ = [
294
+ "display_results",
295
+ "print_complete",
296
+ "print_eval_stats",
297
+ "print_link",
298
+ "print_single_result",
299
+ ]
hud/eval/instrument.py ADDED
@@ -0,0 +1,185 @@
1
+ """Auto-instrumentation for httpx and aiohttp to inject trace headers.
2
+
3
+ This module patches HTTP clients to automatically add:
4
+ - Trace-Id headers when inside an eval context
5
+ - Authorization headers for HUD API calls
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from typing import TYPE_CHECKING, Any
12
+ from urllib.parse import urlparse
13
+
14
+ if TYPE_CHECKING:
15
+ from types import SimpleNamespace
16
+
17
+ from hud.settings import settings
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def _get_trace_headers() -> dict[str, str] | None:
    """Return whatever ``hud.eval.context`` reports as current trace headers.

    The import is deferred to call time to avoid a circular dependency with
    ``hud.eval.context``.
    """
    from hud.eval import context as eval_context

    return eval_context.get_current_trace_headers()
27
+
28
+
29
def _get_api_key() -> str | None:
    """Resolve the API key to use for HUD requests.

    The key from the eval context (set by ``hud.eval(api_key=...)``) wins
    when it is truthy; otherwise ``settings.api_key`` (env var
    ``HUD_API_KEY``) is returned.
    """
    from hud.eval.context import get_current_api_key

    context_key = get_current_api_key()
    if context_key:
        return context_key
    return settings.api_key
38
+
39
+
40
def _is_hud_url(url_str: str) -> bool:
    """Check if URL is a HUD service (inference or MCP).

    A URL qualifies when its host is a subdomain of ``hud.ai`` / ``hud.so``,
    or when its network location equals that of ``settings.hud_gateway_url``
    or ``settings.hud_mcp_url``.

    Args:
        url_str: Request URL; a scheme-less ``host[:port]/path`` string is
            also accepted.

    Returns:
        True if the request targets a HUD host, False otherwise.
    """
    parsed = urlparse(url_str)
    request_host = parsed.netloc or url_str.split("/")[0]

    # Compare the bare host name: the raw netloc may carry credentials or an
    # explicit port ("mcp.hud.ai:443"), and host names are case-insensitive.
    hostname = parsed.hostname
    if hostname is None:
        hostname = request_host.rsplit("@", 1)[-1].split(":", 1)[0]
    hostname = hostname.lower()

    # Check for known HUD domains (works for any subdomain)
    if hostname.endswith((".hud.ai", ".hud.so")):
        return True

    # Also check settings URLs
    known_hosts = {
        urlparse(settings.hud_gateway_url).netloc,
        urlparse(settings.hud_mcp_url).netloc,
    }
    # An unset or scheme-less settings URL parses to an empty netloc, which
    # must never match a request.
    known_hosts.discard("")
    return request_host in known_hosts
55
+
56
+
57
def _httpx_request_hook(request: Any) -> None:
    """Decorate outgoing httpx requests that target a HUD service.

    Requests to any other host are left untouched.  For HUD requests:
    - every current trace header (e.g. Trace-Id) is copied onto the request
      when a trace context is active
    - an ``Authorization: Bearer <key>`` header is set when an API key is
      available and the request carries no usable credentials
    """
    url_str = str(request.url)
    if not _is_hud_url(url_str):
        return

    # Trace headers are only available inside a trace context.
    trace_headers = _get_trace_headers()
    if trace_headers is not None:
        for header_name, header_value in trace_headers.items():
            request.headers[header_name] = header_value
        logger.debug("Added trace headers to request: %s", url_str)

    # The key comes from the contextvar first, then from settings.
    api_key = _get_api_key()
    if not api_key:
        return

    current_auth = request.headers.get("Authorization", "")
    # Placeholder values produced by clients built without a real key.
    placeholders = ("Bearer None", "Bearer null", "Bearer ")
    if current_auth and current_auth not in placeholders:
        return

    request.headers["Authorization"] = f"Bearer {api_key}"
    logger.debug("Added API key auth to request: %s", url_str)
83
+
84
+
85
async def _async_httpx_request_hook(request: Any) -> None:
    """Coroutine wrapper around ``_httpx_request_hook`` for async clients.

    It performs no awaiting of its own; it only delegates to the sync hook.
    """
    _httpx_request_hook(request)
88
+
89
+
90
def _instrument_httpx_client(client: Any) -> None:
    """Register the HUD request hook on one httpx client instance.

    Clients exposing ``aclose`` get the async hook, all others the sync one.
    The hook is added at most once per client.
    """
    if hasattr(client, "aclose"):
        hook = _async_httpx_request_hook
    else:
        hook = _httpx_request_hook

    request_hooks = client.event_hooks.get("request", [])
    if hook in request_hooks:
        return

    request_hooks.append(hook)
    client.event_hooks["request"] = request_hooks
99
+
100
+
101
def _patch_httpx() -> None:
    """Monkey-patch httpx to auto-instrument all clients.

    Wraps ``httpx.AsyncClient.__init__`` and ``httpx.Client.__init__`` so
    that every client constructed afterwards gets the HUD request hook
    installed.  Does nothing when httpx is not installed.  Calling this
    function again is harmless: an ``__init__`` that is already wrapped is
    left as it is instead of being wrapped a second time.
    """
    try:
        import httpx
    except ImportError:
        logger.debug("httpx not installed, skipping auto-instrumentation")
        return

    import functools

    def _wrap_init(original_init: Any) -> Any:
        """Return ``original_init`` wrapped to instrument the new client."""
        # The marker is set below; it keeps repeated patching from stacking
        # wrapper upon wrapper.
        if getattr(original_init, "_hud_instrumented", False):
            return original_init

        @functools.wraps(original_init)
        def _patched_init(self: Any, *args: Any, **kwargs: Any) -> None:
            original_init(self, *args, **kwargs)
            _instrument_httpx_client(self)

        _patched_init._hud_instrumented = True  # type: ignore[attr-defined]
        return _patched_init

    httpx.AsyncClient.__init__ = _wrap_init(httpx.AsyncClient.__init__)  # type: ignore[method-assign]
    httpx.Client.__init__ = _wrap_init(httpx.Client.__init__)  # type: ignore[method-assign]

    logger.debug("httpx auto-instrumentation enabled")
126
+
127
+
128
def _patch_aiohttp() -> None:
    """Monkey-patch aiohttp to auto-instrument all ClientSession instances.

    This is important for the Gemini client in particular, which uses aiohttp
    by default.

    A single ``TraceConfig`` carrying the HUD request hook is appended to the
    ``trace_configs`` of every ``ClientSession`` created afterwards.  Does
    nothing when aiohttp is not installed.  Calling this function again is
    harmless: if ``ClientSession.__init__`` is already wrapped, no second
    wrapper (and no second trace config) is installed.
    """
    try:
        import aiohttp
    except ImportError:
        logger.debug("aiohttp not installed, skipping auto-instrumentation")
        return

    _original_init = aiohttp.ClientSession.__init__
    # The marker is set at the end of this function.  Without this guard each
    # call would register another trace config and run the hook once more
    # per request.
    if getattr(_original_init, "_hud_instrumented", False):
        logger.debug("aiohttp auto-instrumentation already enabled")
        return

    async def on_request_start(
        _session: aiohttp.ClientSession,
        _trace_config_ctx: SimpleNamespace,
        params: aiohttp.TraceRequestStartParams,
    ) -> None:
        """aiohttp trace hook that adds trace headers and auth to HUD requests."""
        url_str = str(params.url)
        if not _is_hud_url(url_str):
            return

        trace_headers = _get_trace_headers()
        if trace_headers is not None:
            for key, value in trace_headers.items():
                params.headers[key] = value
            logger.debug("Added trace headers to aiohttp request: %s", url_str)

        api_key = _get_api_key()
        if api_key:
            existing_auth = params.headers.get("Authorization", "")
            # Override if no auth, empty auth, or invalid "Bearer None"
            if not existing_auth or existing_auth in ("Bearer None", "Bearer null", "Bearer "):
                params.headers["Authorization"] = f"Bearer {api_key}"
                logger.debug("Added API key auth to aiohttp request: %s", url_str)

    trace_config = aiohttp.TraceConfig()
    trace_config.on_request_start.append(on_request_start)

    def _patched_init(self: aiohttp.ClientSession, *args: Any, **kwargs: Any) -> None:
        # Build a new list so a caller-owned ``trace_configs`` list is never
        # mutated.
        existing_traces = kwargs.get("trace_configs") or []
        if trace_config not in existing_traces:
            existing_traces = [*list(existing_traces), trace_config]
        kwargs["trace_configs"] = existing_traces
        _original_init(self, *args, **kwargs)

    _patched_init._hud_instrumented = True  # type: ignore[attr-defined]
    aiohttp.ClientSession.__init__ = _patched_init  # type: ignore[method-assign]

    logger.debug("aiohttp auto-instrumentation enabled")
178
+
179
+
180
+ # Auto-patch on module import
181
+ _patch_httpx()
182
+ _patch_aiohttp()
183
+
184
+
185
+ __all__ = ["_patch_aiohttp", "_patch_httpx"]