hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. hud/__init__.py +1 -1
  2. hud/agents/__init__.py +65 -6
  3. hud/agents/base.py +33 -15
  4. hud/agents/claude.py +60 -31
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +15 -26
  7. hud/agents/gemini_cua.py +6 -17
  8. hud/agents/misc/response_agent.py +7 -0
  9. hud/agents/openai.py +16 -29
  10. hud/agents/openai_chat.py +3 -19
  11. hud/agents/operator.py +5 -17
  12. hud/agents/resolver.py +70 -0
  13. hud/agents/tests/test_claude.py +2 -4
  14. hud/agents/tests/test_openai.py +2 -1
  15. hud/agents/tests/test_resolver.py +192 -0
  16. hud/agents/types.py +148 -0
  17. hud/cli/__init__.py +34 -3
  18. hud/cli/build.py +37 -5
  19. hud/cli/dev.py +11 -2
  20. hud/cli/eval.py +51 -39
  21. hud/cli/flows/init.py +1 -1
  22. hud/cli/pull.py +1 -1
  23. hud/cli/push.py +9 -2
  24. hud/cli/tests/test_build.py +2 -2
  25. hud/cli/tests/test_push.py +1 -1
  26. hud/cli/utils/metadata.py +1 -1
  27. hud/cli/utils/tests/test_metadata.py +1 -1
  28. hud/clients/mcp_use.py +6 -1
  29. hud/datasets/loader.py +17 -18
  30. hud/datasets/runner.py +16 -10
  31. hud/datasets/tests/test_loader.py +15 -15
  32. hud/environment/__init__.py +5 -3
  33. hud/environment/connection.py +58 -6
  34. hud/environment/connectors/mcp_config.py +29 -1
  35. hud/environment/environment.py +218 -77
  36. hud/environment/router.py +175 -24
  37. hud/environment/scenarios.py +313 -186
  38. hud/environment/tests/test_connectors.py +10 -23
  39. hud/environment/tests/test_environment.py +432 -0
  40. hud/environment/tests/test_local_connectors.py +81 -40
  41. hud/environment/tests/test_scenarios.py +820 -14
  42. hud/eval/context.py +63 -10
  43. hud/eval/instrument.py +4 -2
  44. hud/eval/manager.py +79 -12
  45. hud/eval/task.py +36 -4
  46. hud/eval/tests/test_eval.py +1 -1
  47. hud/eval/tests/test_task.py +147 -1
  48. hud/eval/types.py +2 -0
  49. hud/eval/utils.py +14 -3
  50. hud/patches/mcp_patches.py +178 -21
  51. hud/telemetry/instrument.py +8 -1
  52. hud/telemetry/tests/test_eval_telemetry.py +8 -8
  53. hud/tools/__init__.py +2 -0
  54. hud/tools/agent.py +223 -0
  55. hud/tools/computer/__init__.py +34 -5
  56. hud/tools/shell.py +3 -3
  57. hud/tools/tests/test_agent_tool.py +355 -0
  58. hud/types.py +62 -34
  59. hud/utils/hud_console.py +30 -17
  60. hud/utils/strict_schema.py +1 -1
  61. hud/utils/tests/test_version.py +1 -1
  62. hud/version.py +1 -1
  63. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
  64. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
  65. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
  66. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  67. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/eval/context.py CHANGED
@@ -155,6 +155,9 @@ class EvalContext(Environment):
         self.answer: str | None = None  # Agent's submitted answer
         self.system_prompt: str | None = None  # From task.agent_config, passed to agent

+        # Agent config overrides from task (applied by agent when running)
+        self.append_setup_output: bool = False  # Whether to append setup tool output to prompt
+
         # Error tracking
         self.error: BaseException | None = None

@@ -230,13 +233,13 @@ class EvalContext(Environment):
         # using the contextvar set in __aenter__ (supports api_key passed to hud.eval())
         ctx._setup_calls = env._setup_calls.copy()
         ctx._evaluate_calls = env._evaluate_calls.copy()
+        ctx._integration_test_calls = getattr(env, "_integration_test_calls", []).copy()
+        ctx._setup_results = getattr(env, "_setup_results", []).copy()

         # Copy scenarios (definitions) by reference - they don't change
         ctx._scenarios = getattr(env, "_scenarios", {})
         # Create fresh session state for this eval (parallel evals each need their own)
-        ctx._scenario_sessions = {}
-        ctx._scenario_latest = {}
-        ctx._scenario_answers = {}
+        ctx._active_session = None

         # Store source env name for remote scenario lookups
         ctx._source_env_name = env.name
@@ -302,10 +305,20 @@ class EvalContext(Environment):
             code_snippet: Code being evaluated
             trace: Whether to send traces to backend
             quiet: Whether to suppress output
+
+        Raises:
+            ValueError: If task.args is None (template tasks cannot be run directly)
         """
         from hud.environment import Environment
         from hud.eval.task import build_eval_name

+        # Validate that task has args (not a template)
+        if task.args is None:
+            raise ValueError(
+                f"Cannot run task with args=None (this is a template). "
+                f"Provide args when creating the task: env('{task.scenario}', **args)"
+            )
+
         eval_name = name or build_eval_name(task.scenario, task.args)

         # task.env is guaranteed to be Environment after Task.__post_init__
@@ -328,13 +341,26 @@ class EvalContext(Environment):
         # Store task info for scenario execution
         ctx._task = task

-        # Set system_prompt from task.agent_config
+        # Copy agent_config fields from task to ctx (these override agent defaults)
         if task.agent_config:
-            if isinstance(task.agent_config, dict):
-                if task.agent_config.get("system_prompt"):
-                    ctx.system_prompt = task.agent_config["system_prompt"]
-            elif task.agent_config.system_prompt:
-                ctx.system_prompt = task.agent_config.system_prompt
+            agent_config = task.agent_config
+            if isinstance(agent_config, dict):
+                if agent_config.get("system_prompt"):
+                    ctx.system_prompt = agent_config["system_prompt"]
+                if agent_config.get("append_setup_output"):
+                    ctx.append_setup_output = agent_config["append_setup_output"]
+                # Also check append_setup_tool alias
+                if agent_config.get("append_setup_tool"):
+                    ctx.append_setup_output = agent_config["append_setup_tool"]
+            else:
+                # It's a BaseAgentConfig or TaskAgentConfig object
+                if getattr(agent_config, "system_prompt", None):
+                    ctx.system_prompt = agent_config.system_prompt
+                if getattr(agent_config, "append_setup_output", False):
+                    ctx.append_setup_output = agent_config.append_setup_output
+                # Also check append_setup_tool alias
+                if getattr(agent_config, "append_setup_tool", False):
+                    ctx.append_setup_output = True

         return ctx

@@ -343,7 +369,7 @@ class EvalContext(Environment):
         if self._task is None or self._task.scenario is None:
             return

-        prompt = await self.run_scenario_setup(self._task.scenario, self._task.args)
+        prompt = await self.run_scenario_setup(self._task.scenario, self._task.args or {})
        if prompt:
            self.prompt = prompt

@@ -417,6 +443,33 @@ class EvalContext(Environment):
         """True if a scenario is running and can accept submissions."""
         return self._task is not None and self._task.scenario is not None

+    @property
+    def setup_output(self) -> str | None:
+        """Get setup tool output as formatted string for prepending to agent context.
+
+        Returns None if no setup tools were executed or all results were empty.
+        Used by agents when append_setup_output is enabled.
+        """
+        import mcp.types as mcp_types
+
+        setup_results = getattr(self, "_setup_results", [])
+        if not setup_results:
+            return None
+
+        output_parts: list[str] = []
+        for result in setup_results:
+            if result.content:
+                output_parts.extend(
+                    block.text
+                    for block in result.content
+                    if isinstance(block, mcp_types.TextContent)
+                )
+
+        if not output_parts:
+            return None
+
+        return "\n".join(output_parts)
+
     # =========================================================================
     # Backend Integration
     # =========================================================================
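Note on usage: append_setup_output and the new setup_output property are read by agents, not by EvalContext itself. A minimal sketch of how an agent might combine them when building its first prompt; the helper name and formatting below are illustrative, not part of the package:

    def build_initial_prompt(ctx) -> str:
        # ctx is an EvalContext; ctx.prompt was set by run_scenario_setup()
        prompt = ctx.prompt or ""
        # append_setup_output is copied from task.agent_config in from_task()
        if ctx.append_setup_output and ctx.setup_output:
            prompt = f"{prompt}\n\nSetup output:\n{ctx.setup_output}"
        return prompt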
hud/eval/instrument.py CHANGED
@@ -69,7 +69,8 @@ def _httpx_request_hook(request: Any) -> None:
     headers = _get_trace_headers()
     if headers is not None:
         for key, value in headers.items():
-            request.headers[key] = value
+            if key.lower() not in {k.lower() for k in request.headers}:
+                request.headers[key] = value
         logger.debug("Added trace headers to request: %s", url_str)

     # Auto-inject API key if not present or invalid (prefer contextvar, fallback to settings)
@@ -149,7 +150,8 @@ def _patch_aiohttp() -> None:
        trace_headers = _get_trace_headers()
        if trace_headers is not None:
            for key, value in trace_headers.items():
-                params.headers[key] = value
+                if key.lower() not in {k.lower() for k in params.headers}:
+                    params.headers[key] = value
            logger.debug("Added trace headers to aiohttp request: %s", url_str)

        api_key = _get_api_key()
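Note on the header guard above: trace headers are now only added when the request does not already carry a header with the same name under any capitalization, so caller-set headers (for example an existing Authorization header) are never overwritten. A standalone sketch of the same check, using plain dicts for illustration (the real hooks operate on httpx and aiohttp header objects):

    def merge_trace_headers(existing: dict[str, str], trace: dict[str, str]) -> dict[str, str]:
        # Preserve whatever the caller already set, compared case-insensitively
        merged = dict(existing)
        present = {k.lower() for k in existing}
        for key, value in trace.items():
            if key.lower() not in present:
                merged[key] = value
        return merged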
hud/eval/manager.py CHANGED
@@ -56,14 +56,16 @@ def _get_eval_name(tasks: list[Task] | None = None) -> str:
     return "eval"


-def _send_job_enter(
+async def _send_job_enter(
     job_id: str,
     name: str,
     variants: dict[str, Any] | None,
     group: int,
     api_key: str | None,
-) -> None:
-    """Send job enter payload (sync request before traces start)."""
+    taskset: str | None = None,
+    tasks: list[dict[str, Any]] | None = None,
+) -> list[str] | None:
+    """Send job enter payload (async request before traces start)."""
     import httpx

     from hud.eval.types import JobEnterPayload
@@ -71,23 +73,35 @@ def _send_job_enter(

     api_key = api_key or settings.api_key
     if not settings.telemetry_enabled or not api_key:
-        return
+        return None

     payload = JobEnterPayload(
         name=name,
         variants=variants,
         group=group,
+        taskset=taskset,
+        tasks=tasks if taskset else None,  # only send tasks if taskset specified
     )

     try:
-        httpx.post(
-            f"{settings.hud_api_url}/trace/job/{job_id}/enter",
-            json=payload.model_dump(exclude_none=True),
-            headers={"Authorization": f"Bearer {api_key}"},
-            timeout=10.0,
-        )
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(
+                f"{settings.hud_api_url}/trace/job/{job_id}/enter",
+                json=payload.model_dump(exclude_none=True),
+                headers={"Authorization": f"Bearer {api_key}"},
+            )
+            if resp.is_success:
+                try:
+                    data = resp.json()
+                except Exception:
+                    return None
+                if isinstance(data, dict):
+                    ids = data.get("task_version_ids")
+                    if isinstance(ids, list) and all(isinstance(x, str) for x in ids):
+                        return ids
     except Exception as e:
         logger.warning("Failed to send job enter: %s", e)
+    return None


 @asynccontextmanager
@@ -105,6 +119,7 @@ async def run_eval(
     max_concurrent: int | None = None,
     trace: bool = True,
     quiet: bool = False,
+    taskset: str | None = None,
 ) -> AsyncGenerator[EvalContext, None]:
     """Standalone eval context manager.

@@ -235,13 +250,37 @@ async def run_eval(

     if total_evals == 1:
         if tasks:
+            # Even for single-task evals, --taskset requires a job_enter call so the run
+            # and task are linked to the taskset (via job_id + task_version_id).
+            job_id_for_run = job_id
+            if taskset:
+                eval_name = _get_eval_name(tasks=tasks)
+                if job_id_for_run is None:
+                    job_id_for_run = str(uuid.uuid4())
+
+                task_data = None
+                if not tasks[0].id:
+                    task_data = [tasks[0].model_dump(mode="json", exclude_none=True)]
+
+                created_task_version_ids = await _send_job_enter(
+                    job_id=job_id_for_run,
+                    name=eval_name,
+                    variants=variants,
+                    group=group,
+                    api_key=api_key,
+                    taskset=taskset,
+                    tasks=task_data,
+                )
+                if created_task_version_ids and not tasks[0].id:
+                    tasks[0].id = created_task_version_ids[0]
+
             # Single task - use EvalContext.from_task()
             ctx = EvalContext.from_task(
                 tasks[0],
                 name=name,
                 trace_id=trace_id,
                 api_key=api_key,
-                job_id=job_id,
+                job_id=job_id_for_run,
                 group_id=group_id,
                 variants=variant_combos[0],
                 code_snippet=code_snippet,
@@ -273,13 +312,41 @@ async def run_eval(
         job_url = f"https://hud.ai/jobs/{implicit_job_id}"

         # Send job enter (sync request before traces start)
-        _send_job_enter(
+        # Serialize tasks for auto-add to taskset (only tasks without existing backend id).
+        # For v5 scenario tasks, the backend task_version_id is carried in Task.id.
+        tasks_data = None
+        tasks_to_create: list[Task] = []
+        if taskset and tasks:
+            tasks_to_create = [t for t in tasks if not t.id]
+            tasks_data = (
+                [t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
+                if tasks_to_create
+                else None
+            )
+        created_task_version_ids = await _send_job_enter(
             job_id=implicit_job_id,
             name=eval_name,
             variants=variants,
             group=group,
             api_key=api_key,
+            taskset=taskset,
+            tasks=tasks_data,
         )
+        if created_task_version_ids and tasks_to_create:
+            # Assign backend IDs back onto the in-memory tasks so trace enter includes
+            # task_version_id.
+            # Platform guarantees ordered one-to-one mapping, but warn if counts differ.
+            if len(created_task_version_ids) != len(tasks_to_create):
+                logger.warning(
+                    "Task count mismatch: sent %d tasks, received %d IDs. "
+                    "Some tasks may not be linked to the taskset.",
+                    len(tasks_to_create),
+                    len(created_task_version_ids),
+                )
+            for task_obj, task_version_id in zip(
+                tasks_to_create, created_task_version_ids, strict=False
+            ):
+                task_obj.id = task_version_id

         # Print job URL (not individual trace URLs)
         if not quiet:
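Note on the backend contract: _send_job_enter is now async and, when a taskset is supplied, returns the task_version_ids the backend created. The request/response shape implied by the code above is roughly the following; the field values are invented for illustration, and only task_version_ids is read from the response:

    # POST {settings.hud_api_url}/trace/job/{job_id}/enter
    request_body = {
        "name": "my-eval",
        "group": 1,
        "taskset": "my-taskset-slug",  # new field
        "tasks": [{"scenario": "checkout", "args": {"user": "alice"}}],  # new field; only tasks without an id
    }
    response_body = {
        "task_version_ids": ["tv_abc123"],  # one id per task sent, in the same order
    }

The returned ids are written back onto the in-memory Task objects (task.id) so later trace-enter calls can reference the taskset entries.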
hud/eval/task.py CHANGED
@@ -53,6 +53,9 @@ class TaskAgentConfig(BaseModel):
     """Agent configuration for a Task.

     Contains settings that should be passed to the agent when running this task.
+
+    Note: allowed_tools/disallowed_tools are handled at the Environment level
+    (via env.include()/env.exclude() for v5, or extracted by build_env_from_v4() for v4).
     """

     model_config = ConfigDict(extra="ignore")
@@ -62,12 +65,26 @@ class TaskAgentConfig(BaseModel):
         description="Custom system prompt to pass to the agent",
     )

+    # Agent behavior settings (from v4 agent_config, applied by EvalContext)
+    append_setup_output: bool = Field(
+        default=False,
+        description="Append setup tool output to the agent's initial prompt",
+    )
+    append_setup_tool: bool = Field(
+        default=False,
+        description="Alias for append_setup_output (backwards compat)",
+    )
+
     @model_validator(mode="before")
     @classmethod
     def warn_extra_fields(cls, data: Any) -> Any:
         """Warn about extra fields that will be ignored."""
         if isinstance(data, dict):
-            known_fields = {"system_prompt"}
+            known_fields = {
+                "system_prompt",
+                "append_setup_output",
+                "append_setup_tool",
+            }
             extra = set(data.keys()) - known_fields
             if extra:
                 logger.warning(
@@ -148,7 +165,10 @@ class Task(BaseModel):
     env: Any = Field(default=None)  # Typed as Any for input flexibility, validated below
     scenario: str | None = None
     id: str | None = None
-    args: dict[str, Any] = Field(default_factory=dict)
+    args: dict[str, Any] | None = Field(
+        default=None,
+        description="Scenario arguments. None indicates a template (args filled in later).",
+    )
     validation: list[MCPToolCall] | None = None

     # Agent config - settings passed to agent (system_prompt, etc.)
@@ -284,8 +304,20 @@ class Task(BaseModel):
         ]

         # Preserve agent_config
+        agent_config: dict[str, Any] = {}
         if data.get("agent_config"):
-            result["agent_config"] = data["agent_config"]
+            agent_config.update(data["agent_config"])
+        # Restore tool filters from Environment (they were extracted during v4 conversion)
+        if self.env is not None:
+            if getattr(self.env, "_agent_include", None) is not None:
+                agent_config["allowed_tools"] = self.env._agent_include
+            elif "allowed_tools" not in agent_config:
+                # ["*"] was converted to None, restore it for serialization
+                agent_config["allowed_tools"] = ["*"]
+            if getattr(self.env, "_agent_exclude", None) is not None:
+                agent_config["disallowed_tools"] = self.env._agent_exclude
+        if agent_config:
+            result["agent_config"] = agent_config

         # Preserve metadata
         if data.get("metadata"):
@@ -335,6 +367,6 @@ class Task(BaseModel):
             id=self.id,
             env=self.env,  # Share reference
             scenario=self.scenario,
-            args=self.args.copy() if self.args else {},
+            args=self.args.copy() if self.args is not None else None,
             validation=self.validation.copy() if self.validation else None,
         )
hud/eval/tests/test_task.py CHANGED
@@ -16,7 +16,7 @@ class TestTaskDataclass:

         assert task.env is None
         assert task.scenario is None
-        assert task.args == {}
+        assert task.args is None  # None = template, {} = runnable with no args

     def test_init_with_env_dict(self) -> None:
         """Task auto-converts env dict to Environment via validator."""
@@ -85,13 +85,24 @@ class TestTaskSerialization:
         task = Task.from_v4(v4_dict)
         data = task.model_dump(mode="json")

-        assert data.get("agent_config") == {"system_prompt": "Custom system prompt"}
+        # agent_config should preserve system_prompt and restore tool filters
+        agent_config = data.get("agent_config")
+        assert agent_config is not None
+        assert agent_config["system_prompt"] == "Custom system prompt"
+        # allowed_tools defaults to ["*"] when not specified (restored during serialization)
+        assert agent_config["allowed_tools"] == ["*"]
+        # These have default False values from TaskAgentConfig
+        assert agent_config["append_setup_output"] is False
+        assert agent_config["append_setup_tool"] is False

         # Roundtrip
         task2 = Task(**data)
         assert task2.agent_config is not None
         assert isinstance(task2.agent_config, TaskAgentConfig)
         assert task2.agent_config.system_prompt == "Custom system prompt"
+        # Tool filters should be on Environment after roundtrip
+        assert task2.env is not None
+        assert task2.env._agent_include is None  # ["*"] → None

     def test_v4_preserves_metadata(self) -> None:
         """v4 Task preserves metadata through roundtrip."""
@@ -143,3 +154,138 @@

         assert isinstance(task.agent_config, TaskAgentConfig)
         assert task.agent_config.system_prompt == "Hello"
+
+
+class TestV4AgentConfigToolFilters:
+    """Tests for v4 agent_config.allowed_tools and disallowed_tools processing."""
+
+    def test_v4_extracts_allowed_tools(self) -> None:
+        """v4 allowed_tools is extracted and stored on Environment."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["browser_*", "file_read"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        assert task.env._agent_include == ["browser_*", "file_read"]
+
+    def test_v4_extracts_disallowed_tools(self) -> None:
+        """v4 disallowed_tools is extracted and stored on Environment."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "disallowed_tools": ["*setup*", "*evaluate*", "checkout_branch"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        assert task.env._agent_exclude == ["*setup*", "*evaluate*", "checkout_branch"]
+
+    def test_v4_wildcard_star_allowed_converts_to_none(self) -> None:
+        """v4 allowed_tools=['*'] converts to None (meaning include all)."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        # ["*"] should be converted to None
+        assert task.env._agent_include is None
+
+    def test_v4_both_allowed_and_disallowed(self) -> None:
+        """v4 supports both allowed_tools and disallowed_tools together."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+                "disallowed_tools": ["*setup*", "*evaluate*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        assert task.env is not None
+        assert task.env._agent_include is None  # ["*"] → None
+        assert task.env._agent_exclude == ["*setup*", "*evaluate*"]
+
+    @pytest.mark.asyncio
+    async def test_v4_tool_filters_applied_in_as_tools(self) -> None:
+        """v4 tool filters are applied when calling env.as_tools()."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+                "disallowed_tools": ["*setup*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+        env = task.env
+        assert env is not None
+
+        # Add local tools to test filtering
+        @env.tool()
+        def my_setup_tool() -> str:
+            """Should be filtered out."""
+            return "setup"
+
+        @env.tool()
+        def run_query() -> str:
+            """Should be visible."""
+            return "query"
+
+        await env._build_routing()
+
+        tools = env.as_tools()
+        tool_names = [t.name for t in tools]
+
+        assert "my_setup_tool" not in tool_names
+        assert "run_query" in tool_names
+
+    def test_v4_tool_filters_preserved_in_serialization(self) -> None:
+        """v4 tool filters are preserved when serializing for remote execution."""
+        v4_dict = {
+            "prompt": "Test prompt",
+            "mcp_config": {"server": {"url": "http://localhost"}},
+            "evaluate_tool": {"name": "check", "arguments": {}},
+            "agent_config": {
+                "allowed_tools": ["*"],
+                "disallowed_tools": ["*setup*", "*evaluate*", "*grade*"],
+            },
+        }
+
+        task = Task.from_v4(v4_dict)
+
+        # Serialize (this is what gets sent to remote execution)
+        data = task.model_dump(mode="json")
+
+        # agent_config must include the tool filters for remote execution
+        assert "agent_config" in data
+        assert data["agent_config"]["allowed_tools"] == ["*"]
+        assert data["agent_config"]["disallowed_tools"] == ["*setup*", "*evaluate*", "*grade*"]
+
+        # Verify roundtrip works (remote worker will deserialize this)
+        task2 = Task(**data)
+        assert task2.env is not None
+        assert task2.env._agent_include is None  # ["*"] → None
+        assert task2.env._agent_exclude == ["*setup*", "*evaluate*", "*grade*"]
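Note on the args change: args=None now marks a template (not directly runnable), while args={} is a runnable task with no arguments. A hedged sketch of the distinction, with constructor usage assumed from the tests above:

    from hud.eval.task import Task

    template = Task(scenario="checkout")                          # args defaults to None -> template
    runnable = Task(scenario="checkout", args={"user": "alice"})  # concrete, runnable

    # EvalContext.from_task() raises ValueError for the template and points the
    # caller at env('checkout', **args) to produce a runnable task.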
hud/eval/types.py CHANGED
@@ -53,6 +53,8 @@ class JobEnterPayload(BaseModel):
     name: str | None = None
     variants: dict[str, Any] | None = None  # Full variant config
     group: int | None = None
+    taskset: str | None = None  # taskset slug to associate job with
+    tasks: list[dict[str, Any]] | None = None  # task definitions to add to taskset


 __all__ = [
hud/eval/utils.py CHANGED
@@ -138,6 +138,7 @@ def build_env_from_v4(source: dict[str, Any] | Any) -> dict[str, Any]:
     }

     # Map integration_test_tool → validation (same concept: tool calls to verify)
+    # Also populate _integration_test_calls for IntegrationTestRunner compatibility
     if legacy.integration_test_tool:
         int_test = legacy.integration_test_tool
         if not isinstance(int_test, list):
@@ -147,10 +148,20 @@ def build_env_from_v4(source: dict[str, Any] | Any) -> dict[str, Any]:
            call if isinstance(call, MCPToolCall) else MCPToolCall(**call.model_dump())
            for call in int_test
        ]
+        # Populate _integration_test_calls on env for IntegrationTestRunner
+        env._integration_test_calls = [(call.name, call.arguments or {}) for call in int_test]

-    # Extract agent_config (just system_prompt for now)
-    if legacy.agent_config and legacy.agent_config.system_prompt:
-        result["agent_config"] = {"system_prompt": legacy.agent_config.system_prompt}
+    # Extract agent_config fields that need to be passed through
+    if legacy.agent_config:
+        agent_config_dict: dict[str, Any] = {}
+        if legacy.agent_config.system_prompt:
+            agent_config_dict["system_prompt"] = legacy.agent_config.system_prompt
+        if legacy.agent_config.append_setup_output:
+            agent_config_dict["append_setup_output"] = legacy.agent_config.append_setup_output
+        if legacy.agent_config.append_setup_tool:
+            agent_config_dict["append_setup_tool"] = legacy.agent_config.append_setup_tool
+        if agent_config_dict:
+            result["agent_config"] = agent_config_dict

     # Preserve metadata
     if legacy.metadata:
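Note on the overall v4 mapping: combined with the task.py changes above, a v4 agent_config now fans out to several places during conversion. A rough illustration; the v4 dict is invented and the mapping is inferred from this diff and the accompanying tests:

    v4 = {
        "prompt": "Do the thing",
        "mcp_config": {"server": {"url": "http://localhost"}},
        "evaluate_tool": {"name": "check", "arguments": {}},
        "agent_config": {
            "system_prompt": "Be careful",    # -> Task.agent_config.system_prompt
            "append_setup_output": True,      # -> Task.agent_config / EvalContext.append_setup_output
            "allowed_tools": ["*"],           # -> env._agent_include (["*"] becomes None)
            "disallowed_tools": ["*setup*"],  # -> env._agent_exclude
        },
    }
    task = Task.from_v4(v4)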