hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- hud/__init__.py +1 -1
- hud/agents/__init__.py +65 -6
- hud/agents/base.py +33 -15
- hud/agents/claude.py +60 -31
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +15 -26
- hud/agents/gemini_cua.py +6 -17
- hud/agents/misc/response_agent.py +7 -0
- hud/agents/openai.py +16 -29
- hud/agents/openai_chat.py +3 -19
- hud/agents/operator.py +5 -17
- hud/agents/resolver.py +70 -0
- hud/agents/tests/test_claude.py +2 -4
- hud/agents/tests/test_openai.py +2 -1
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +34 -3
- hud/cli/build.py +37 -5
- hud/cli/dev.py +11 -2
- hud/cli/eval.py +51 -39
- hud/cli/flows/init.py +1 -1
- hud/cli/pull.py +1 -1
- hud/cli/push.py +9 -2
- hud/cli/tests/test_build.py +2 -2
- hud/cli/tests/test_push.py +1 -1
- hud/cli/utils/metadata.py +1 -1
- hud/cli/utils/tests/test_metadata.py +1 -1
- hud/clients/mcp_use.py +6 -1
- hud/datasets/loader.py +17 -18
- hud/datasets/runner.py +16 -10
- hud/datasets/tests/test_loader.py +15 -15
- hud/environment/__init__.py +5 -3
- hud/environment/connection.py +58 -6
- hud/environment/connectors/mcp_config.py +29 -1
- hud/environment/environment.py +218 -77
- hud/environment/router.py +175 -24
- hud/environment/scenarios.py +313 -186
- hud/environment/tests/test_connectors.py +10 -23
- hud/environment/tests/test_environment.py +432 -0
- hud/environment/tests/test_local_connectors.py +81 -40
- hud/environment/tests/test_scenarios.py +820 -14
- hud/eval/context.py +63 -10
- hud/eval/instrument.py +4 -2
- hud/eval/manager.py +79 -12
- hud/eval/task.py +36 -4
- hud/eval/tests/test_eval.py +1 -1
- hud/eval/tests/test_task.py +147 -1
- hud/eval/types.py +2 -0
- hud/eval/utils.py +14 -3
- hud/patches/mcp_patches.py +178 -21
- hud/telemetry/instrument.py +8 -1
- hud/telemetry/tests/test_eval_telemetry.py +8 -8
- hud/tools/__init__.py +2 -0
- hud/tools/agent.py +223 -0
- hud/tools/computer/__init__.py +34 -5
- hud/tools/shell.py +3 -3
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/types.py +62 -34
- hud/utils/hud_console.py +30 -17
- hud/utils/strict_schema.py +1 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/eval/context.py
CHANGED
@@ -155,6 +155,9 @@ class EvalContext(Environment):
         self.answer: str | None = None  # Agent's submitted answer
         self.system_prompt: str | None = None  # From task.agent_config, passed to agent
 
+        # Agent config overrides from task (applied by agent when running)
+        self.append_setup_output: bool = False  # Whether to append setup tool output to prompt
+
         # Error tracking
         self.error: BaseException | None = None
 
@@ -230,13 +233,13 @@ class EvalContext(Environment):
         # using the contextvar set in __aenter__ (supports api_key passed to hud.eval())
         ctx._setup_calls = env._setup_calls.copy()
         ctx._evaluate_calls = env._evaluate_calls.copy()
+        ctx._integration_test_calls = getattr(env, "_integration_test_calls", []).copy()
+        ctx._setup_results = getattr(env, "_setup_results", []).copy()
 
         # Copy scenarios (definitions) by reference - they don't change
         ctx._scenarios = getattr(env, "_scenarios", {})
         # Create fresh session state for this eval (parallel evals each need their own)
-        ctx.
-        ctx._scenario_latest = {}
-        ctx._scenario_answers = {}
+        ctx._active_session = None
 
         # Store source env name for remote scenario lookups
         ctx._source_env_name = env.name
@@ -302,10 +305,20 @@ class EvalContext(Environment):
             code_snippet: Code being evaluated
             trace: Whether to send traces to backend
            quiet: Whether to suppress output
+
+        Raises:
+            ValueError: If task.args is None (template tasks cannot be run directly)
         """
         from hud.environment import Environment
         from hud.eval.task import build_eval_name
 
+        # Validate that task has args (not a template)
+        if task.args is None:
+            raise ValueError(
+                f"Cannot run task with args=None (this is a template). "
+                f"Provide args when creating the task: env('{task.scenario}', **args)"
+            )
+
         eval_name = name or build_eval_name(task.scenario, task.args)
 
         # task.env is guaranteed to be Environment after Task.__post_init__
@@ -328,13 +341,26 @@ class EvalContext(Environment):
         # Store task info for scenario execution
         ctx._task = task
 
-        #
+        # Copy agent_config fields from task to ctx (these override agent defaults)
         if task.agent_config:
-
-
-
-
-
+            agent_config = task.agent_config
+            if isinstance(agent_config, dict):
+                if agent_config.get("system_prompt"):
+                    ctx.system_prompt = agent_config["system_prompt"]
+                if agent_config.get("append_setup_output"):
+                    ctx.append_setup_output = agent_config["append_setup_output"]
+                # Also check append_setup_tool alias
+                if agent_config.get("append_setup_tool"):
+                    ctx.append_setup_output = agent_config["append_setup_tool"]
+            else:
+                # It's a BaseAgentConfig or TaskAgentConfig object
+                if getattr(agent_config, "system_prompt", None):
+                    ctx.system_prompt = agent_config.system_prompt
+                if getattr(agent_config, "append_setup_output", False):
+                    ctx.append_setup_output = agent_config.append_setup_output
+                # Also check append_setup_tool alias
+                if getattr(agent_config, "append_setup_tool", False):
+                    ctx.append_setup_output = True
 
         return ctx
 
@@ -343,7 +369,7 @@ class EvalContext(Environment):
         if self._task is None or self._task.scenario is None:
            return
 
-        prompt = await self.run_scenario_setup(self._task.scenario, self._task.args)
+        prompt = await self.run_scenario_setup(self._task.scenario, self._task.args or {})
        if prompt:
             self.prompt = prompt
 
@@ -417,6 +443,33 @@ class EvalContext(Environment):
         """True if a scenario is running and can accept submissions."""
         return self._task is not None and self._task.scenario is not None
 
+    @property
+    def setup_output(self) -> str | None:
+        """Get setup tool output as formatted string for prepending to agent context.
+
+        Returns None if no setup tools were executed or all results were empty.
+        Used by agents when append_setup_output is enabled.
+        """
+        import mcp.types as mcp_types
+
+        setup_results = getattr(self, "_setup_results", [])
+        if not setup_results:
+            return None
+
+        output_parts: list[str] = []
+        for result in setup_results:
+            if result.content:
+                output_parts.extend(
+                    block.text
+                    for block in result.content
+                    if isinstance(block, mcp_types.TextContent)
+                )
+
+        if not output_parts:
+            return None
+
+        return "\n".join(output_parts)
+
     # =========================================================================
     # Backend Integration
     # =========================================================================
hud/eval/instrument.py
CHANGED
@@ -69,7 +69,8 @@ def _httpx_request_hook(request: Any) -> None:
     headers = _get_trace_headers()
     if headers is not None:
         for key, value in headers.items():
-            request.headers
+            if key.lower() not in {k.lower() for k in request.headers}:
+                request.headers[key] = value
         logger.debug("Added trace headers to request: %s", url_str)
 
     # Auto-inject API key if not present or invalid (prefer contextvar, fallback to settings)
@@ -149,7 +150,8 @@ def _patch_aiohttp() -> None:
     trace_headers = _get_trace_headers()
     if trace_headers is not None:
         for key, value in trace_headers.items():
-            params.headers
+            if key.lower() not in {k.lower() for k in params.headers}:
+                params.headers[key] = value
         logger.debug("Added trace headers to aiohttp request: %s", url_str)
 
     api_key = _get_api_key()
hud/eval/manager.py
CHANGED
@@ -56,14 +56,16 @@ def _get_eval_name(tasks: list[Task] | None = None) -> str:
     return "eval"
 
 
-def _send_job_enter(
+async def _send_job_enter(
     job_id: str,
     name: str,
     variants: dict[str, Any] | None,
     group: int,
     api_key: str | None,
-
-
+    taskset: str | None = None,
+    tasks: list[dict[str, Any]] | None = None,
+) -> list[str] | None:
+    """Send job enter payload (async request before traces start)."""
     import httpx
 
     from hud.eval.types import JobEnterPayload
@@ -71,23 +73,35 @@ def _send_job_enter(
 
     api_key = api_key or settings.api_key
     if not settings.telemetry_enabled or not api_key:
-        return
+        return None
 
     payload = JobEnterPayload(
         name=name,
         variants=variants,
         group=group,
+        taskset=taskset,
+        tasks=tasks if taskset else None,  # only send tasks if taskset specified
     )
 
     try:
-        httpx.
-
-
-
-
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(
+                f"{settings.hud_api_url}/trace/job/{job_id}/enter",
+                json=payload.model_dump(exclude_none=True),
+                headers={"Authorization": f"Bearer {api_key}"},
+            )
+        if resp.is_success:
+            try:
+                data = resp.json()
+            except Exception:
+                return None
+            if isinstance(data, dict):
+                ids = data.get("task_version_ids")
+                if isinstance(ids, list) and all(isinstance(x, str) for x in ids):
+                    return ids
     except Exception as e:
         logger.warning("Failed to send job enter: %s", e)
+    return None
 
 
 @asynccontextmanager
@@ -105,6 +119,7 @@ async def run_eval(
     max_concurrent: int | None = None,
     trace: bool = True,
     quiet: bool = False,
+    taskset: str | None = None,
 ) -> AsyncGenerator[EvalContext, None]:
     """Standalone eval context manager.
 
@@ -235,13 +250,37 @@ async def run_eval(
 
     if total_evals == 1:
         if tasks:
+            # Even for single-task evals, --taskset requires a job_enter call so the run
+            # and task are linked to the taskset (via job_id + task_version_id).
+            job_id_for_run = job_id
+            if taskset:
+                eval_name = _get_eval_name(tasks=tasks)
+                if job_id_for_run is None:
+                    job_id_for_run = str(uuid.uuid4())
+
+                task_data = None
+                if not tasks[0].id:
+                    task_data = [tasks[0].model_dump(mode="json", exclude_none=True)]
+
+                created_task_version_ids = await _send_job_enter(
+                    job_id=job_id_for_run,
+                    name=eval_name,
+                    variants=variants,
+                    group=group,
+                    api_key=api_key,
+                    taskset=taskset,
+                    tasks=task_data,
+                )
+                if created_task_version_ids and not tasks[0].id:
+                    tasks[0].id = created_task_version_ids[0]
+
             # Single task - use EvalContext.from_task()
             ctx = EvalContext.from_task(
                 tasks[0],
                 name=name,
                 trace_id=trace_id,
                 api_key=api_key,
-                job_id=
+                job_id=job_id_for_run,
                 group_id=group_id,
                 variants=variant_combos[0],
                 code_snippet=code_snippet,
@@ -273,13 +312,41 @@ async def run_eval(
         job_url = f"https://hud.ai/jobs/{implicit_job_id}"
 
         # Send job enter (sync request before traces start)
-
+        # Serialize tasks for auto-add to taskset (only tasks without existing backend id).
+        # For v5 scenario tasks, the backend task_version_id is carried in Task.id.
+        tasks_data = None
+        tasks_to_create: list[Task] = []
+        if taskset and tasks:
+            tasks_to_create = [t for t in tasks if not t.id]
+            tasks_data = (
+                [t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
+                if tasks_to_create
+                else None
+            )
+        created_task_version_ids = await _send_job_enter(
             job_id=implicit_job_id,
             name=eval_name,
             variants=variants,
             group=group,
             api_key=api_key,
+            taskset=taskset,
+            tasks=tasks_data,
         )
+        if created_task_version_ids and tasks_to_create:
+            # Assign backend IDs back onto the in-memory tasks so trace enter includes
+            # task_version_id.
+            # Platform guarantees ordered one-to-one mapping, but warn if counts differ.
+            if len(created_task_version_ids) != len(tasks_to_create):
+                logger.warning(
+                    "Task count mismatch: sent %d tasks, received %d IDs. "
+                    "Some tasks may not be linked to the taskset.",
+                    len(tasks_to_create),
+                    len(created_task_version_ids),
+                )
+            for task_obj, task_version_id in zip(
+                tasks_to_create, created_task_version_ids, strict=False
+            ):
+                task_obj.id = task_version_id
 
         # Print job URL (not individual trace URLs)
         if not quiet:
hud/eval/task.py
CHANGED
@@ -53,6 +53,9 @@ class TaskAgentConfig(BaseModel):
     """Agent configuration for a Task.
 
     Contains settings that should be passed to the agent when running this task.
+
+    Note: allowed_tools/disallowed_tools are handled at the Environment level
+    (via env.include()/env.exclude() for v5, or extracted by build_env_from_v4() for v4).
     """
 
     model_config = ConfigDict(extra="ignore")
@@ -62,12 +65,26 @@ class TaskAgentConfig(BaseModel):
         description="Custom system prompt to pass to the agent",
     )
 
+    # Agent behavior settings (from v4 agent_config, applied by EvalContext)
+    append_setup_output: bool = Field(
+        default=False,
+        description="Append setup tool output to the agent's initial prompt",
+    )
+    append_setup_tool: bool = Field(
+        default=False,
+        description="Alias for append_setup_output (backwards compat)",
+    )
+
     @model_validator(mode="before")
     @classmethod
     def warn_extra_fields(cls, data: Any) -> Any:
         """Warn about extra fields that will be ignored."""
         if isinstance(data, dict):
-            known_fields = {
+            known_fields = {
+                "system_prompt",
+                "append_setup_output",
+                "append_setup_tool",
+            }
             extra = set(data.keys()) - known_fields
             if extra:
                 logger.warning(
@@ -148,7 +165,10 @@ class Task(BaseModel):
     env: Any = Field(default=None)  # Typed as Any for input flexibility, validated below
     scenario: str | None = None
     id: str | None = None
-    args: dict[str, Any] = Field(
+    args: dict[str, Any] | None = Field(
+        default=None,
+        description="Scenario arguments. None indicates a template (args filled in later).",
+    )
     validation: list[MCPToolCall] | None = None
 
     # Agent config - settings passed to agent (system_prompt, etc.)
@@ -284,8 +304,20 @@ class Task(BaseModel):
         ]
 
         # Preserve agent_config
+        agent_config: dict[str, Any] = {}
         if data.get("agent_config"):
-
+            agent_config.update(data["agent_config"])
+        # Restore tool filters from Environment (they were extracted during v4 conversion)
+        if self.env is not None:
+            if getattr(self.env, "_agent_include", None) is not None:
+                agent_config["allowed_tools"] = self.env._agent_include
+            elif "allowed_tools" not in agent_config:
+                # ["*"] was converted to None, restore it for serialization
+                agent_config["allowed_tools"] = ["*"]
+            if getattr(self.env, "_agent_exclude", None) is not None:
+                agent_config["disallowed_tools"] = self.env._agent_exclude
+        if agent_config:
+            result["agent_config"] = agent_config
 
         # Preserve metadata
         if data.get("metadata"):
@@ -335,6 +367,6 @@ class Task(BaseModel):
             id=self.id,
             env=self.env,  # Share reference
             scenario=self.scenario,
-            args=self.args.copy() if self.args else
+            args=self.args.copy() if self.args is not None else None,
             validation=self.validation.copy() if self.validation else None,
         )
hud/eval/tests/test_eval.py
CHANGED
@@ -16,7 +16,7 @@ class TestTaskDataclass:
 
         assert task.env is None
         assert task.scenario is None
-        assert task.args
+        assert task.args is None  # None = template, {} = runnable with no args
 
     def test_init_with_env_dict(self) -> None:
         """Task auto-converts env dict to Environment via validator."""
hud/eval/tests/test_task.py
CHANGED
@@ -85,13 +85,24 @@ class TestTaskSerialization:
         task = Task.from_v4(v4_dict)
         data = task.model_dump(mode="json")
 
-
+        # agent_config should preserve system_prompt and restore tool filters
+        agent_config = data.get("agent_config")
+        assert agent_config is not None
+        assert agent_config["system_prompt"] == "Custom system prompt"
+        # allowed_tools defaults to ["*"] when not specified (restored during serialization)
+        assert agent_config["allowed_tools"] == ["*"]
+        # These have default False values from TaskAgentConfig
+        assert agent_config["append_setup_output"] is False
+        assert agent_config["append_setup_tool"] is False
 
         # Roundtrip
         task2 = Task(**data)
         assert task2.agent_config is not None
         assert isinstance(task2.agent_config, TaskAgentConfig)
         assert task2.agent_config.system_prompt == "Custom system prompt"
+        # Tool filters should be on Environment after roundtrip
+        assert task2.env is not None
+        assert task2.env._agent_include is None  # ["*"] → None
 
     def test_v4_preserves_metadata(self) -> None:
         """v4 Task preserves metadata through roundtrip."""
@@ -143,3 +154,138 @@ class TestTaskValidation:
 
         assert isinstance(task.agent_config, TaskAgentConfig)
         assert task.agent_config.system_prompt == "Hello"
+
+
+class TestV4AgentConfigToolFilters:
+    """Tests for v4 agent_config.allowed_tools and disallowed_tools processing."""
+
+    def test_v4_extracts_allowed_tools(self) -> None:
+        """v4 allowed_tools is extracted and stored on Environment."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["browser_*", "file_read"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        assert task.env._agent_include == ["browser_*", "file_read"]
+
+    def test_v4_extracts_disallowed_tools(self) -> None:
+        """v4 disallowed_tools is extracted and stored on Environment."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "disallowed_tools": ["*setup*", "*evaluate*", "checkout_branch"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        assert task.env._agent_exclude == ["*setup*", "*evaluate*", "checkout_branch"]
+
+    def test_v4_wildcard_star_allowed_converts_to_none(self) -> None:
+        """v4 allowed_tools=['*'] converts to None (meaning include all)."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        # ["*"] should be converted to None
+        assert task.env._agent_include is None
+
+    def test_v4_both_allowed_and_disallowed(self) -> None:
+        """v4 supports both allowed_tools and disallowed_tools together."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+                "disallowed_tools": ["*setup*", "*evaluate*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        assert task.env._agent_include is None  # ["*"] → None
+        assert task.env._agent_exclude == ["*setup*", "*evaluate*"]
+
+    @pytest.mark.asyncio
+    async def test_v4_tool_filters_applied_in_as_tools(self) -> None:
+        """v4 tool filters are applied when calling env.as_tools()."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+                "disallowed_tools": ["*setup*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+        env = task.env
+        assert env is not None
+
+        # Add local tools to test filtering
+        @env.tool()
+        def my_setup_tool() -> str:
+            """Should be filtered out."""
+            return "setup"
+
+        @env.tool()
+        def run_query() -> str:
+            """Should be visible."""
+            return "query"
+
+        await env._build_routing()
+
+        tools = env.as_tools()
+        tool_names = [t.name for t in tools]
+
+        assert "my_setup_tool" not in tool_names
+        assert "run_query" in tool_names
+
+    def test_v4_tool_filters_preserved_in_serialization(self) -> None:
+        """v4 tool filters are preserved when serializing for remote execution."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+                "disallowed_tools": ["*setup*", "*evaluate*", "*grade*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        # Serialize (this is what gets sent to remote execution)
+        data = task.model_dump(mode="json")
+
+        # agent_config must include the tool filters for remote execution
+        assert "agent_config" in data
+        assert data["agent_config"]["allowed_tools"] == ["*"]
+        assert data["agent_config"]["disallowed_tools"] == ["*setup*", "*evaluate*", "*grade*"]
+
+        # Verify roundtrip works (remote worker will deserialize this)
+        task2 = Task(**data)
+        assert task2.env is not None
+        assert task2.env._agent_include is None  # ["*"] → None
+        assert task2.env._agent_exclude == ["*setup*", "*evaluate*", "*grade*"]
hud/eval/types.py
CHANGED
@@ -53,6 +53,8 @@ class JobEnterPayload(BaseModel):
     name: str | None = None
     variants: dict[str, Any] | None = None  # Full variant config
     group: int | None = None
+    taskset: str | None = None  # taskset slug to associate job with
+    tasks: list[dict[str, Any]] | None = None  # task definitions to add to taskset
 
 
 __all__ = [
hud/eval/utils.py
CHANGED
@@ -138,6 +138,7 @@ def build_env_from_v4(source: dict[str, Any] | Any) -> dict[str, Any]:
     }
 
     # Map integration_test_tool → validation (same concept: tool calls to verify)
+    # Also populate _integration_test_calls for IntegrationTestRunner compatibility
     if legacy.integration_test_tool:
         int_test = legacy.integration_test_tool
         if not isinstance(int_test, list):
@@ -147,10 +148,20 @@ def build_env_from_v4(source: dict[str, Any] | Any) -> dict[str, Any]:
             call if isinstance(call, MCPToolCall) else MCPToolCall(**call.model_dump())
             for call in int_test
         ]
+        # Populate _integration_test_calls on env for IntegrationTestRunner
+        env._integration_test_calls = [(call.name, call.arguments or {}) for call in int_test]
 
-    # Extract agent_config
-    if legacy.agent_config
-
+    # Extract agent_config fields that need to be passed through
+    if legacy.agent_config:
+        agent_config_dict: dict[str, Any] = {}
+        if legacy.agent_config.system_prompt:
+            agent_config_dict["system_prompt"] = legacy.agent_config.system_prompt
+        if legacy.agent_config.append_setup_output:
+            agent_config_dict["append_setup_output"] = legacy.agent_config.append_setup_output
+        if legacy.agent_config.append_setup_tool:
+            agent_config_dict["append_setup_tool"] = legacy.agent_config.append_setup_tool
+        if agent_config_dict:
+            result["agent_config"] = agent_config_dict
 
     # Preserve metadata
     if legacy.metadata: