hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/eval/display.py
ADDED
@@ -0,0 +1,299 @@

```python
"""Display helpers for eval links, job URLs, and result statistics."""

from __future__ import annotations

import contextlib
import webbrowser
from statistics import mean, pstdev
from typing import Any

from hud.settings import settings


def print_link(url: str, title: str, *, open_browser: bool = True) -> None:
    """Print a nicely formatted link with optional browser opening."""
    if not (settings.telemetry_enabled and settings.api_key):
        return

    if open_browser:
        with contextlib.suppress(Exception):
            webbrowser.open(url, new=2)

    try:
        from rich.align import Align
        from rich.console import Console
        from rich.panel import Panel

        console = Console()
        style = "bold underline rgb(108,113,196)"
        link_markup = f"[{style}][link={url}]{url}[/link][/{style}]"
        panel = Panel(
            Align.center(link_markup),
            title=title,
            border_style="rgb(192,150,12)",
            padding=(0, 2),
        )
        console.print(panel)
    except ImportError:
        print(f"{title}: {url}")  # noqa: T201


def print_complete(url: str, name: str, *, error: bool = False) -> None:
    """Print a completion message with link."""
    if not (settings.telemetry_enabled and settings.api_key):
        return

    try:
        from rich.console import Console

        console = Console()
        if error:
            console.print(
                f"\n[red]✗ '{name}' failed![/red] [dim]View details at:[/dim] "
                f"[bold link={url}]{url}[/bold link]\n"
            )
        else:
            console.print(
                f"\n[green]✓ '{name}' complete![/green] [dim]View results at:[/dim] "
                f"[bold link={url}]{url}[/bold link]\n"
            )
    except ImportError:
        status = "failed" if error else "complete"
        print(f"\n{name} {status}: {url}\n")  # noqa: T201


def print_single_result(
    trace_id: str,
    name: str,
    *,
    reward: float | None = None,
    error: str | None = None,
) -> None:
    """Print a single eval result summary."""
    if not (settings.telemetry_enabled and settings.api_key):
        return

    url = f"https://hud.ai/trace/{trace_id}"

    try:
        from rich.console import Console

        console = Console()

        if error:
            console.print(
                f"\n[red]✗ '{name}' failed![/red]\n"
                f"  [dim]Error:[/dim] [red]{error[:80]}{'...' if len(error) > 80 else ''}[/red]\n"
                f"  [dim]View at:[/dim] [bold link={url}]{url}[/bold link]\n"
            )
        else:
            reward_str = f"{reward:.3f}" if reward is not None else "—"
            reward_color = "green" if reward is not None and reward > 0.7 else "yellow"
            console.print(
                f"\n[green]✓ '{name}' complete![/green]\n"
                f"  [dim]Reward:[/dim] [{reward_color}]{reward_str}[/{reward_color}]\n"
                f"  [dim]View at:[/dim] [bold link={url}]{url}[/bold link]\n"
            )
    except ImportError:
        status = "failed" if error else "complete"
        reward_str = f", reward={reward:.3f}" if reward is not None else ""
        print(f"\n{name} {status}{reward_str}: {url}\n")  # noqa: T201


def display_results(
    results: list[Any],
    *,
    tasks: list[Any] | None = None,
    name: str = "",
    elapsed: float | None = None,
    show_details: bool = True,
) -> None:
    """Display evaluation results in a formatted table.

    Args:
        results: List of EvalContext objects from hud.eval()
        tasks: Optional list of Task objects (for task info in table)
        name: Optional name for the evaluation
        elapsed: Optional elapsed time in seconds
        show_details: Whether to show per-eval details table
    """
    if not results:
        print("No results to display")  # noqa: T201
        return

    try:
        from rich.console import Console
        from rich.table import Table

        console = Console()
    except ImportError:
        _display_basic(results, name, elapsed)
        return

    # Extract stats from results (EvalContext objects)
    # Use 'or 0' to handle None rewards (scenario failed before returning a reward)
    rewards = [getattr(r, "reward", 0) or 0 for r in results if r is not None]
    errors = [r for r in results if r is not None and getattr(r, "error", None)]
    durations = [getattr(r, "duration", 0) for r in results if getattr(r, "duration", 0) > 0]

    if not rewards:
        console.print("[yellow]No valid results[/yellow]")
        return

    mean_reward = mean(rewards) if rewards else 0.0
    std_reward = pstdev(rewards) if len(rewards) > 1 else 0.0
    success_count = sum(1 for r in rewards if r > 0.7)
    success_rate = success_count / len(results) if results else 0.0

    # Print summary
    title = f"📊 '{name}' Results" if name else "📊 Evaluation Complete"
    console.print(f"\n[bold]{title}[/bold]")
    console.print(f"  [dim]Evals:[/dim] {len(results)}")
    if elapsed:
        rate = len(results) / elapsed if elapsed > 0 else 0
        console.print(f"  [dim]Time:[/dim] {elapsed:.1f}s ({rate:.1f}/s)")
    if durations:
        console.print(f"  [dim]Avg duration:[/dim] {mean(durations):.2f}s")
    console.print(f"  [dim]Mean reward:[/dim] [green]{mean_reward:.3f}[/green] ± {std_reward:.3f}")
    console.print(f"  [dim]Success rate:[/dim] [yellow]{success_rate * 100:.1f}%[/yellow]")
    if errors:
        console.print(f"  [dim]Errors:[/dim] [red]{len(errors)}[/red]")

    # Details table
    if show_details and len(results) <= 50:
        table = Table(title="Details", show_header=True, header_style="bold")
        table.add_column("#", style="dim", justify="right", width=4)

        # Check if we have variants (grouped parallel runs)
        has_variants = any(getattr(r, "variants", None) for r in results if r)
        has_prompts = any(getattr(r, "prompt", None) for r in results if r)
        has_answers = any(getattr(r, "answer", None) for r in results if r)

        if has_variants:
            table.add_column("Variants", style="cyan", max_width=30)
        elif tasks:
            table.add_column("Task", style="cyan", max_width=30)

        if has_prompts:
            table.add_column("Prompt", style="dim", max_width=35)

        if has_answers:
            table.add_column("Answer", style="dim", max_width=35)

        table.add_column("Reward", justify="right", style="green", width=8)
        if durations:
            table.add_column("Time", justify="right", width=8)
        table.add_column("", justify="center", width=3)  # Status icon

        for i, r in enumerate(results):
            if r is None:
                continue

            idx = getattr(r, "index", i)
            reward = getattr(r, "reward", None)
            error = getattr(r, "error", None)
            duration = getattr(r, "duration", 0)
            variants = getattr(r, "variants", None)
            prompt = getattr(r, "prompt", None)
            answer = getattr(r, "answer", None)

            # Status icon
            if error:
                status = "[red]✗[/red]"
            elif reward is not None and reward > 0.7:
                status = "[green]✓[/green]"
            else:
                status = "[yellow]○[/yellow]"

            row = [str(idx)]

            # Variant or task column
            if has_variants:
                row.append(_format_variants(variants))
            elif tasks and i < len(tasks):
                task = tasks[i]
                task_label = _get_task_label(task, i)
                row.append(task_label[:30])

            # Prompt column
            if has_prompts:
                row.append(_truncate(prompt, 35))

            # Answer column
            if has_answers:
                row.append(_truncate(answer, 35))

            # Reward
            row.append(f"{reward:.3f}" if reward is not None else "—")

            # Duration
            if durations:
                row.append(f"{duration:.1f}s" if duration > 0 else "—")

            row.append(status)
            table.add_row(*row)

        console.print(table)

    # Variance warning
    if std_reward > 0.3:
        console.print(f"\n[yellow]⚠️ High variance (std={std_reward:.3f})[/yellow]")

    console.print()


def _display_basic(results: list[Any], name: str, elapsed: float | None) -> None:
    """Fallback display without rich."""
    rewards = [getattr(r, "reward", 0) for r in results if r is not None]
    title = f"'{name}' Results" if name else "Eval Results"
    print(f"\n{title}")  # noqa: T201
    print(f"  Evals: {len(results)}")  # noqa: T201
    if elapsed:
        print(f"  Time: {elapsed:.1f}s")  # noqa: T201
    if rewards:
        print(f"  Mean reward: {mean(rewards):.3f}")  # noqa: T201
    print()  # noqa: T201


def _format_variants(variants: dict[str, Any] | None) -> str:
    """Format variants dict for display."""
    if not variants:
        return "-"
    parts = [f"{k}={v}" for k, v in variants.items()]
    result = ", ".join(parts)
    return result[:28] + ".." if len(result) > 30 else result


def _truncate(text: str | None, max_len: int) -> str:
    """Truncate text to max length."""
    if not text:
        return "-"
    text = text.replace("\n", " ").strip()
    return text[: max_len - 2] + ".." if len(text) > max_len else text


def _get_task_label(task: Any, index: int) -> str:
    """Get a display label for a task."""
    if task is None:
        return f"task_{index}"
    if isinstance(task, dict):
        return task.get("id") or task.get("prompt", "")[:25] or f"task_{index}"
    task_id = getattr(task, "id", None)
    if task_id:
        return task_id
    prompt = getattr(task, "prompt", None) or getattr(task, "scenario", None)
    if prompt:
        return prompt[:25]
    return f"task_{index}"


# Backwards compatibility alias
print_eval_stats = display_results

__all__ = [
    "display_results",
    "print_complete",
    "print_eval_stats",
    "print_link",
    "print_single_result",
]
```
hud/eval/instrument.py
ADDED
@@ -0,0 +1,185 @@

```python
"""Auto-instrumentation for httpx and aiohttp to inject trace headers.

This module patches HTTP clients to automatically add:
- Trace-Id headers when inside an eval context
- Authorization headers for HUD API calls
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse

if TYPE_CHECKING:
    from types import SimpleNamespace

from hud.settings import settings

logger = logging.getLogger(__name__)


def _get_trace_headers() -> dict[str, str] | None:
    """Lazy import to avoid circular dependency."""
    from hud.eval.context import get_current_trace_headers

    return get_current_trace_headers()


def _get_api_key() -> str | None:
    """Get API key from context or settings.

    Prefers the contextvar (set by hud.eval(api_key=...)),
    falls back to settings (env var HUD_API_KEY).
    """
    from hud.eval.context import get_current_api_key

    return get_current_api_key() or settings.api_key


def _is_hud_url(url_str: str) -> bool:
    """Check if URL is a HUD service (inference or MCP)."""
    parsed = urlparse(url_str)
    request_host = parsed.netloc or url_str.split("/")[0]

    # Check for known HUD domains (works for any subdomain)
    if request_host.endswith((".hud.ai", ".hud.so")):
        return True

    # Also check settings URLs
    known_hosts = {
        urlparse(settings.hud_gateway_url).netloc,
        urlparse(settings.hud_mcp_url).netloc,
    }
    return request_host in known_hosts


def _httpx_request_hook(request: Any) -> None:
    """httpx event hook that adds trace headers and auth to HUD requests.

    For inference.hud.ai and mcp.hud.ai:
    - Injects trace headers (Trace-Id) if in trace context
    - Injects Authorization header if API key is set and no auth present
    """
    url_str = str(request.url)
    if not _is_hud_url(url_str):
        return

    # Inject trace headers if in trace context
    headers = _get_trace_headers()
    if headers is not None:
        for key, value in headers.items():
            request.headers[key] = value
        logger.debug("Added trace headers to request: %s", url_str)

    # Auto-inject API key if not present or invalid (prefer contextvar, fallback to settings)
    api_key = _get_api_key()
    if api_key:
        existing_auth = request.headers.get("Authorization", "")
        # Override if no auth, empty auth, or invalid "Bearer None"
        if not existing_auth or existing_auth in ("Bearer None", "Bearer null", "Bearer "):
            request.headers["Authorization"] = f"Bearer {api_key}"
            logger.debug("Added API key auth to request: %s", url_str)


async def _async_httpx_request_hook(request: Any) -> None:
    """Async version of the httpx event hook."""
    _httpx_request_hook(request)


def _instrument_httpx_client(client: Any) -> None:
    """Add trace hook to an httpx client instance."""
    is_async = hasattr(client, "aclose")
    hook = _async_httpx_request_hook if is_async else _httpx_request_hook

    existing_hooks = client.event_hooks.get("request", [])
    if hook not in existing_hooks:
        existing_hooks.append(hook)
        client.event_hooks["request"] = existing_hooks


def _patch_httpx() -> None:
    """Monkey-patch httpx to auto-instrument all clients."""
    try:
        import httpx
    except ImportError:
        logger.debug("httpx not installed, skipping auto-instrumentation")
        return

    _original_async_init = httpx.AsyncClient.__init__

    def _patched_async_init(self: Any, *args: Any, **kwargs: Any) -> None:
        _original_async_init(self, *args, **kwargs)
        _instrument_httpx_client(self)

    httpx.AsyncClient.__init__ = _patched_async_init  # type: ignore[method-assign]

    _original_sync_init = httpx.Client.__init__

    def _patched_sync_init(self: Any, *args: Any, **kwargs: Any) -> None:
        _original_sync_init(self, *args, **kwargs)
        _instrument_httpx_client(self)

    httpx.Client.__init__ = _patched_sync_init  # type: ignore[method-assign]

    logger.debug("httpx auto-instrumentation enabled")


def _patch_aiohttp() -> None:
    """Monkey-patch aiohttp to auto-instrument all ClientSession instances.

    This is important for the Gemini client in particular, which uses aiohttp by default.
    """
    try:
        import aiohttp
    except ImportError:
        logger.debug("aiohttp not installed, skipping auto-instrumentation")
        return

    async def on_request_start(
        _session: aiohttp.ClientSession,
        _trace_config_ctx: SimpleNamespace,
        params: aiohttp.TraceRequestStartParams,
    ) -> None:
        """aiohttp trace hook that adds trace headers and auth to HUD requests."""
        url_str = str(params.url)
        if not _is_hud_url(url_str):
            return

        trace_headers = _get_trace_headers()
        if trace_headers is not None:
            for key, value in trace_headers.items():
                params.headers[key] = value
            logger.debug("Added trace headers to aiohttp request: %s", url_str)

        api_key = _get_api_key()
        if api_key:
            existing_auth = params.headers.get("Authorization", "")
            # Override if no auth, empty auth, or invalid "Bearer None"
            if not existing_auth or existing_auth in ("Bearer None", "Bearer null", "Bearer "):
                params.headers["Authorization"] = f"Bearer {api_key}"
                logger.debug("Added API key auth to aiohttp request: %s", url_str)

    trace_config = aiohttp.TraceConfig()
    trace_config.on_request_start.append(on_request_start)

    _original_init = aiohttp.ClientSession.__init__

    def _patched_init(self: aiohttp.ClientSession, *args: Any, **kwargs: Any) -> None:
        existing_traces = kwargs.get("trace_configs") or []
        if trace_config not in existing_traces:
            existing_traces = [*list(existing_traces), trace_config]
        kwargs["trace_configs"] = existing_traces
        _original_init(self, *args, **kwargs)

    aiohttp.ClientSession.__init__ = _patched_init  # type: ignore[method-assign]

    logger.debug("aiohttp auto-instrumentation enabled")


# Auto-patch on module import
_patch_httpx()
_patch_aiohttp()


__all__ = ["_patch_aiohttp", "_patch_httpx"]
```