hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff reflects the changes between two publicly released versions of this package, as they appear in their respective public registries, and is provided for informational purposes only.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/eval/context.py
ADDED
@@ -0,0 +1,674 @@
+"""EvalContext - Environment with evaluation tracking.
+
+EvalContext IS an Environment, with additional evaluation tracking
+capabilities (trace_id, reward, backend reporting).
+
+This makes `async with env.eval("task") as env` natural - you get
+a full Environment that you can call tools on directly.
+"""
+
+from __future__ import annotations
+
+import contextvars
+import logging
+import uuid
+from typing import TYPE_CHECKING, Any, Self
+
+from hud.environment import Environment
+from hud.settings import settings
+from hud.shared import make_request
+from hud.telemetry import flush, instrument
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+    from hud.eval.task import Task
+    from hud.types import MCPToolResult
+
+
+from hud.eval.types import EvalExitPayload, EvalPayload, ParallelEvalComplete
+
+logger = logging.getLogger(__name__)
+
+# Contextvar to store current trace headers (for httpx auto-instrumentation)
+_current_trace_headers: contextvars.ContextVar[dict[str, str] | None] = contextvars.ContextVar(
+    "current_trace_headers", default=None
+)
+
+# Contextvar to store current api_key override (for telemetry exporter)
+_current_api_key: contextvars.ContextVar[str | None] = contextvars.ContextVar(
+    "current_api_key", default=None
+)
+
+
+def get_current_trace_headers() -> dict[str, str] | None:
+    """Get the current trace headers from context."""
+    return _current_trace_headers.get()
+
+
+def get_current_trace_id() -> str | None:
+    """Get the current trace ID (task_run_id) from context.
+
+    Returns the Trace-Id if inside an eval context, None otherwise.
+    Used by @instrument decorator to know where to send telemetry.
+    """
+    headers = _current_trace_headers.get()
+    if headers:
+        return headers.get("Trace-Id")
+    return None
+
+
+def get_current_api_key() -> str | None:
+    """Get the current API key override from context.
+
+    Returns the api_key if one was passed to hud.eval(), otherwise None.
+    Falls back to settings.api_key if not in an eval context.
+    Used by telemetry exporter for uploads.
+    """
+    return _current_api_key.get()
+
+
+# =============================================================================
+# EvalContext
+# =============================================================================
+
+
+class EvalContext(Environment):
+    """Environment with evaluation tracking capabilities.
+
+    Attributes:
+        trace_id: Unique identifier for this evaluation
+        eval_name: Task/evaluation name (separate from env name)
+        job_id: Links to parent job (auto-detected from hud.job() context)
+        group_id: Links parallel evaluations together
+        variants: Variant assignment dict (for A/B testing)
+        reward: Reward value (user-settable)
+        error: Exception if failed
+        results: All eval results (populated for parallel execution, empty for single)
+        task: Task definition (if loaded from slug)
+
+    Example:
+        ```python
+        # With task (scenario sets reward automatically)
+        tasks = load_tasks("my-org/task:1")
+        async with hud.eval(tasks) as ctx:
+            await agent.run(ctx)
+        # reward set by scenario evaluate phase in __aexit__
+
+        # Blank eval (manual reward)
+        async with hud.eval() as ctx:
+            ctx.reward = compute_reward()
+        ```
+    """
+
+    def __init__(
+        self,
+        name: str = "eval",
+        *,
+        trace_id: str | None = None,
+        api_key: str | None = None,
+        job_id: str | None = None,
+        group_id: str | None = None,
+        index: int = 0,
+        variants: dict[str, Any] | None = None,
+        code_snippet: str | None = None,
+        trace: bool = True,
+        quiet: bool = False,
+        **env_kwargs: Any,
+    ) -> None:
+        """Initialize EvalContext.
+
+        Args:
+            name: Environment/evaluation name
+            trace_id: Unique trace ID (auto-generated if not provided)
+            api_key: API key for backend calls
+            job_id: Job ID to link to (auto-detected if not provided)
+            group_id: Group ID for parallel evaluations
+            index: Index in parallel execution
+            variants: Variant assignment for A/B testing
+            code_snippet: Code being evaluated (for reproducibility)
+            trace: Whether to send trace data to backend (default True)
+            quiet: Whether to suppress printing links (default False)
+            **env_kwargs: Additional kwargs passed to Environment.__init__
+        """
+        # Initialize Environment
+        super().__init__(name=name, **env_kwargs)
+
+        # === Evaluation tracking (not in Environment) ===
+
+        # Identity
+        self.trace_id: str = trace_id or str(uuid.uuid4())
+        self.eval_name: str = name  # Separate from self.name for clarity
+
+        # Job linkage
+        self.job_id: str | None = job_id
+
+        self.group_id: str | None = group_id
+        self.index: int = index
+
+        # Variant assignment
+        self.variants: dict[str, Any] = variants or {}
+
+        # User-settable (per-run values, override Environment defaults)
+        self.prompt: str | None = None  # From scenario setup or task
+        self.reward: float | None = None
+        self.answer: str | None = None  # Agent's submitted answer
+        self.system_prompt: str | None = None  # From task.agent_config, passed to agent
+
+        # Error tracking
+        self.error: BaseException | None = None
+
+        # User metadata (arbitrary key-value pairs)
+        self.metadata: dict[str, Any] = {}
+
+        # Parallel results (empty list for single evals, populated for parallel)
+        self.results: list[EvalContext] = []
+
+        # Code snippet for reproducibility
+        self.code_snippet: str | None = code_snippet
+
+        # Private state for eval tracking
+        self._eval_api_key = api_key
+        self._token: contextvars.Token[dict[str, str] | None] | None = None
+        self._api_key_token: contextvars.Token[str | None] | None = None
+        self._is_summary: bool = False  # True for summary contexts (skip trace)
+        self._suppress_link: bool = quiet  # True to suppress printing eval link
+        self._trace_enabled: bool = trace  # Whether to send trace data to backend
+        self._source_env_name: str | None = None  # Source env name for remote lookups
+        self._task: Task | None = None  # Task config (set by from_task)
+
+    @classmethod
+    def from_environment(
+        cls,
+        env: Environment,
+        name: str,
+        *,
+        trace_id: str | None = None,
+        api_key: str | None = None,
+        job_id: str | None = None,
+        group_id: str | None = None,
+        index: int = 0,
+        variants: dict[str, Any] | None = None,
+        code_snippet: str | None = None,
+        trace: bool = True,
+        quiet: bool = False,
+    ) -> EvalContext:
+        """Create an EvalContext that copies configuration from an existing Environment.
+
+        This creates a new EvalContext with the same connections as the parent.
+        Used by env.eval() to create evaluation contexts.
+
+        Args:
+            env: Parent environment to copy from
+            name: Evaluation name
+            trace_id: Unique trace ID
+            api_key: API key for backend calls
+            job_id: Job ID to link to
+            group_id: Group ID for parallel evaluations
+            index: Index in parallel execution
+            variants: Variant assignment
+            code_snippet: Code being evaluated
+        """
+        ctx = cls(
+            name=name,
+            trace_id=trace_id,
+            api_key=api_key,
+            job_id=job_id,
+            group_id=group_id,
+            index=index,
+            variants=variants,
+            code_snippet=code_snippet,
+            trace=trace,
+            quiet=quiet,
+        )
+
+        # Copy connections from parent - each connector is copied so parallel
+        # execution gets fresh client instances
+        ctx._connections = {name: connector.copy() for name, connector in env._connections.items()}
+
+        # Note: Auth is injected at request time by httpx/aiohttp hooks in hud.eval.instrument
+        # using the contextvar set in __aenter__ (supports api_key passed to hud.eval())
+        ctx._setup_calls = env._setup_calls.copy()
+        ctx._evaluate_calls = env._evaluate_calls.copy()
+
+        # Copy scenarios (definitions) by reference - they don't change
+        ctx._scenarios = getattr(env, "_scenarios", {})
+        # Create fresh session state for this eval (parallel evals each need their own)
+        ctx._scenario_sessions = {}
+        ctx._scenario_latest = {}
+        ctx._scenario_answers = {}
+
+        # Store source env name for remote scenario lookups
+        ctx._source_env_name = env.name
+
+        # Copy managers by reference (they hold local tools, prompts, resources)
+        # This allows ctx.call_tool(), ctx.get_prompt(), ctx.read_resource() to work
+        # for locally defined tools/scenarios
+        ctx._tool_manager = env._tool_manager
+        ctx._prompt_manager = env._prompt_manager
+        ctx._resource_manager = env._resource_manager
+
+        # Copy prompt
+        if env.prompt:
+            ctx.prompt = env.prompt
+
+        # Copy agent-level tool filters (allowed_tools/disallowed_tools)
+        ctx._agent_include = getattr(env, "_agent_include", None)
+        ctx._agent_exclude = getattr(env, "_agent_exclude", None)
+
+        # Copy router's conflict resolution strategy
+        ctx._router.conflict_resolution = env._router.conflict_resolution
+
+        # Copy mock mode settings (for testing)
+        ctx._mock_mode = getattr(env, "_mock_mode", False)
+        ctx._mock_outputs = getattr(env, "_mock_outputs", {}).copy()
+        ctx._mock_tool_schemas = getattr(env, "_mock_tool_schemas", {}).copy()
+
+        # Copy hub config (needed to detect remote hub for telemetry)
+        ctx._hub_config = getattr(env, "_hub_config", None)
+
+        # Copy mcp config (needed to detect remote HUD MCP for telemetry)
+        ctx._mcp_config = getattr(env, "_mcp_config", None)
+
+        return ctx
+
+    @classmethod
+    def from_task(
+        cls,
+        task: Task,
+        *,
+        name: str | None = None,
+        trace_id: str | None = None,
+        api_key: str | None = None,
+        job_id: str | None = None,
+        group_id: str | None = None,
+        index: int = 0,
+        variants: dict[str, Any] | None = None,
+        code_snippet: str | None = None,
+        trace: bool = True,
+        quiet: bool = False,
+    ) -> EvalContext:
+        """Create an EvalContext from a Task config.
+
+        Args:
+            task: Task config (env, scenario, args)
+            name: Override for eval/trace name (defaults to task scenario/args)
+            trace_id: Unique trace ID
+            api_key: API key for backend calls
+            job_id: Job ID to link to
+            group_id: Group ID for parallel evaluations
+            index: Index in parallel execution
+            variants: Variant assignment
+            code_snippet: Code being evaluated
+            trace: Whether to send traces to backend
+            quiet: Whether to suppress output
+        """
+        from hud.environment import Environment
+        from hud.eval.task import build_eval_name
+
+        eval_name = name or build_eval_name(task.scenario, task.args)
+
+        # task.env is guaranteed to be Environment after Task.__post_init__
+        assert isinstance(task.env, Environment), "Task.env should be Environment"
+
+        ctx = cls.from_environment(
+            env=task.env,
+            name=eval_name,
+            trace_id=trace_id,
+            api_key=api_key,
+            job_id=job_id,
+            group_id=group_id,
+            index=index,
+            variants=variants,
+            code_snippet=code_snippet,
+            trace=trace,
+            quiet=quiet,
+        )
+
+        # Store task info for scenario execution
+        ctx._task = task
+
+        # Set system_prompt from task.agent_config
+        if task.agent_config:
+            if isinstance(task.agent_config, dict):
+                if task.agent_config.get("system_prompt"):
+                    ctx.system_prompt = task.agent_config["system_prompt"]
+            elif task.agent_config.system_prompt:
+                ctx.system_prompt = task.agent_config.system_prompt
+
+        return ctx
+
+    async def _run_task_scenario_setup(self) -> None:
+        """Run the task's scenario setup phase (if scenario provided)."""
+        if self._task is None or self._task.scenario is None:
+            return
+
+        prompt = await self.run_scenario_setup(self._task.scenario, self._task.args)
+        if prompt:
+            self.prompt = prompt
+
+    async def _run_task_scenario_evaluate(self) -> None:
+        """Run the task's scenario evaluate phase (if scenario provided)."""
+        if self._task is None or self._task.scenario is None:
+            return
+
+        reward = await self.run_scenario_evaluate(self._task.scenario)
+        if reward is not None:
+            self.reward = reward
+
+    # =========================================================================
+    # Summary Context - Attribute Access Control
+    # =========================================================================
+
+    # Attributes accessible on summary context (everything else raises ParallelEvalComplete)
+    _SUMMARY_ALLOWED = frozenset(
+        {
+            # Results and metadata
+            "results",
+            "reward",
+            "error",
+            "success",
+            # IDs
+            "trace_id",
+            "job_id",
+            "group_id",
+            "index",
+            # Private attrs
+            "_is_summary",
+            "_suppress_link",
+            "__class__",
+            "__dict__",
+        }
+    )
+
+    def __getattribute__(self, name: str) -> Any:
+        """Block most attribute access on summary contexts."""
+        # Always allow private/dunder and whitelisted attrs
+        if name.startswith("_") or name in EvalContext._SUMMARY_ALLOWED:
+            return super().__getattribute__(name)
+
+        # Check if this is a summary context
+        try:
+            is_summary = super().__getattribute__("_is_summary")
+        except AttributeError:
+            is_summary = False
+
+        if is_summary:
+            raise ParallelEvalComplete
+
+        return super().__getattribute__(name)
+
+    # =========================================================================
+    # Computed Properties (eval-specific)
+    # =========================================================================
+
+    @property
+    def headers(self) -> dict[str, str]:
+        """Headers for gateway integration."""
+        return {"Trace-Id": self.trace_id}
+
+    @property
+    def success(self) -> bool:
+        """True if no error occurred."""
+        return self.error is None
+
+    @property
+    def has_scenario(self) -> bool:
+        """True if a scenario is running and can accept submissions."""
+        return self._task is not None and self._task.scenario is not None
+
+    # =========================================================================
+    # Backend Integration
+    # =========================================================================
+
+    def _get_eval_api_key(self) -> str | None:
+        return self._eval_api_key or settings.api_key
+
+    def _build_base_payload(self) -> EvalPayload:
+        """Build the base payload for enter/exit."""
+        return EvalPayload(
+            prompt=self.prompt,
+            code_snippet=self.code_snippet,
+            job_id=self.job_id,
+            group_id=self.group_id,
+            variants=self.variants if self.variants else None,
+            # Only send task_version_id for v5 tasks (those with scenarios).
+            # v4 tasks have client-side IDs that shouldn't be sent to backend.
+            task_version_id=self._task.id if self._task and self._task.scenario else None,
+            metadata=self.metadata if self.metadata else None,
+        )
+
+    async def log(self, metrics: dict[str, Any]) -> None:
+        """Log metrics to the backend."""
+        api_key = self._get_eval_api_key()
+        if not settings.telemetry_enabled or not api_key:
+            return
+
+        try:
+            await make_request(
+                method="POST",
+                url=f"{settings.hud_telemetry_url}/traces/{self.trace_id}/log",
+                json={"metrics": metrics},
+                api_key=api_key,
+            )
+        except Exception as e:
+            logger.warning("Failed to log metrics: %s", e)
+
+    async def submit(self, answer: str) -> None:
+        """Submit the agent's answer for scenario evaluation.
+
+        Delegates to Environment.submit() with the current scenario name.
+        The answer will be passed to the scenario's evaluate phase via
+        `yield`, e.g.: `answer = yield "Do the task"`
+
+        Args:
+            answer: The agent's final answer/result to submit
+
+        Example:
+            async with env("checkout", product="laptop") as ctx:
+                response = await agent.run(ctx.prompt)
+                await ctx.submit(response)
+            # On exit, scenario's evaluate phase receives the answer
+        """
+        if not self._task or not self._task.scenario:
+            return
+
+        # Store answer on context for display
+        self.answer = answer
+
+        # Delegate to Environment.submit() which handles storage + broadcast
+        await super().submit(self._task.scenario, answer)
+
+    async def _eval_enter(self) -> None:
+        """Notify backend that eval has started."""
+        if not self._trace_enabled:
+            return
+        api_key = self._get_eval_api_key()
+        if not settings.telemetry_enabled or not api_key:
+            return
+
+        try:
+            payload = self._build_base_payload()
+            await make_request(
+                method="POST",
+                url=f"{settings.hud_api_url}/trace/{self.trace_id}/enter",
+                json=payload.model_dump(exclude_none=True),
+                api_key=api_key,
+            )
+        except Exception as e:
+            logger.warning("Failed to send eval enter: %s", e)
+
+    async def _eval_exit(self, error_message: str | None = None) -> None:
+        """Notify backend that eval has completed."""
+        if not self._trace_enabled:
+            return
+        api_key = self._get_eval_api_key()
+        if not settings.telemetry_enabled or not api_key:
+            return
+
+        try:
+            payload = EvalExitPayload(
+                **self._build_base_payload().model_dump(),
+                reward=self.reward,
+                success=self.success,
+                error_message=error_message,
+            )
+            await make_request(
+                method="POST",
+                url=f"{settings.hud_api_url}/trace/{self.trace_id}/exit",
+                json=payload.model_dump(exclude_none=True),
+                api_key=api_key,
+            )
+        except Exception as e:
+            logger.warning("Failed to send eval exit: %s", e)
+
+    # =========================================================================
+    # Context Manager (override Environment)
+    # =========================================================================
+
+    async def __aenter__(self) -> Self:
+        """Enter eval context - connect environment and set trace headers."""
+        if self._is_summary:
+            return self
+
+        # Start tracking
+        self._token = _current_trace_headers.set(self.headers)
+        self._api_key_token = _current_api_key.set(self._eval_api_key)
+
+        # Register trace first (environment connection can fail)
+        await self._eval_enter()
+
+        try:
+            # Connect environment (MCP servers, tools)
+            await super().__aenter__()
+
+            # Run task scenario setup (if created from_task with scenario)
+            await self._run_task_scenario_setup()
+            self._print_eval_link()
+        except BaseException as e:
+            # Cleanup if setup fails - __aexit__ won't be called automatically
+            await self.__aexit__(type(e), e, e.__traceback__)
+            raise
+
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: TracebackType | None,
+    ) -> bool:
+        """Exit eval context - disconnect and report."""
+        # Summary contexts skip trace tracking (parallel results already tracked)
+        # Suppress ParallelEvalComplete - it's expected for skipping body re-execution
+        if self._is_summary:
+            return exc_type is ParallelEvalComplete
+
+        # Run task scenario evaluate (if no error and has scenario)
+        if exc_type is None:
+            await self._run_task_scenario_evaluate()
+
+        # Track error
+        error_msg: str | None = None
+        if exc_type is not None:
+            self.error = exc_val
+            error_msg = str(exc_val) if exc_val else "Unknown error"
+
+        # Flush any pending telemetry spans for this trace
+        flush(self.trace_id)
+
+        # Disconnect environment (parent class) - also runs evaluate tools
+        await super().__aexit__(exc_type, exc_val, exc_tb)
+
+        # Set reward from evaluate tools if not already set
+        if self.reward is None and hasattr(self, "_evaluate_reward"):
+            self.reward = self._evaluate_reward
+
+        # Reset context vars
+        if self._token is not None:
+            _current_trace_headers.reset(self._token)
+            self._token = None
+        if self._api_key_token is not None:
+            _current_api_key.reset(self._api_key_token)
+            self._api_key_token = None
+
+        # Notify backend
+        await self._eval_exit(error_msg)
+
+        # Print single eval result summary (unless suppressed for parallel evals)
+        self._print_single_result(error_msg)
+
+        return False
+
+    # =========================================================================
+    # Tool Call Instrumentation
+    # =========================================================================
+
+    async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
+        """Execute a tool with automatic telemetry recording.
+
+        Overrides Environment._execute_tool to record MCP spans for the eval context.
+        Instrumentation is disabled when connected to a remote HUD server (telemetry is
+        recorded server-side in that case).
+        """
+        # Skip instrumentation when connected to a remote hub - telemetry is handled server-side
+        if self._hub_config is not None:
+            return await super()._execute_tool(name, arguments)
+
+        # Skip instrumentation for v4 tasks with HUD MCP config (remote server)
+        if self._mcp_config is not None:
+            from hud.utils.mcp import _is_hud_server
+
+            for server_cfg in self._mcp_config.values():
+                if isinstance(server_cfg, dict):
+                    url = server_cfg.get("url", "")
+                    if url and _is_hud_server(url):
+                        return await super()._execute_tool(name, arguments)
+
+        # For local environments, record MCP spans
+        return await self._execute_tool_instrumented(name, arguments)
+
+    @instrument(category="mcp")
+    async def _execute_tool_instrumented(
+        self, name: str, arguments: dict[str, Any]
+    ) -> MCPToolResult:
+        """Instrumented version of _execute_tool for local environments."""
+        return await super()._execute_tool(name, arguments)
+
+    def __repr__(self) -> str:
+        return f"EvalContext({self.trace_id[:8]}..., name={self.eval_name!r}, reward={self.reward})"
+
+    def _print_eval_link(self) -> None:
+        """Print a nicely formatted eval link."""
+        # Skip if link printing is suppressed (e.g., parallel child traces)
+        if self._suppress_link:
+            return
+
+        from hud.eval.display import print_link
+
+        trace_url = f"https://hud.ai/trace/{self.trace_id}"
+        print_link(trace_url, "🔗 Eval Started")
+
+    def _print_single_result(self, error_msg: str | None) -> None:
+        """Print a single eval result summary."""
+        # Skip if link printing is suppressed (e.g., parallel child traces)
+        if self._suppress_link:
+            return
+
+        from hud.eval.display import print_single_result
+
+        print_single_result(
+            trace_id=self.trace_id,
+            name=self.eval_name,
+            reward=self.reward,
+            error=error_msg,
+        )
+
+
+# Re-export for backwards compatibility with trace module
+__all__ = [
+    "EvalContext",
+    "get_current_api_key",
+    "get_current_trace_headers",
+    "get_current_trace_id",
+]
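
For orientation, here is a minimal usage sketch of the new eval API, assembled from the docstrings in the file above. The load_tasks import path, the agent object, and compute_reward() are assumptions for illustration, not guarantees from this diff:

    import hud
    from hud.datasets import load_tasks  # assumed import path for the task loader


    async def run_evals(agent) -> None:
        # Task-based eval: the scenario's setup phase runs in __aenter__ and
        # fills ctx.prompt; its evaluate phase runs in __aexit__ and sets
        # ctx.reward, so nothing is scored by hand.
        tasks = load_tasks("my-org/task:1")
        async with hud.eval(tasks) as ctx:
            response = await agent.run(ctx.prompt)
            await ctx.submit(response)  # forwarded to the scenario's evaluate phase

        # Blank eval: no scenario attached, so the caller sets the reward.
        async with hud.eval() as ctx:
            ctx.reward = compute_reward()  # user-defined scoring, per the class docstring

On exit, __aexit__ hands the submitted answer to the scenario's evaluate phase (the `answer = yield "Do the task"` pattern noted in submit()'s docstring), flushes pending telemetry for the trace, and reports reward/success to the backend via _eval_exit().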