hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. hud/__init__.py +1 -1
  2. hud/agents/__init__.py +65 -6
  3. hud/agents/base.py +33 -15
  4. hud/agents/claude.py +60 -31
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +15 -26
  7. hud/agents/gemini_cua.py +6 -17
  8. hud/agents/misc/response_agent.py +7 -0
  9. hud/agents/openai.py +16 -29
  10. hud/agents/openai_chat.py +3 -19
  11. hud/agents/operator.py +5 -17
  12. hud/agents/resolver.py +70 -0
  13. hud/agents/tests/test_claude.py +2 -4
  14. hud/agents/tests/test_openai.py +2 -1
  15. hud/agents/tests/test_resolver.py +192 -0
  16. hud/agents/types.py +148 -0
  17. hud/cli/__init__.py +34 -3
  18. hud/cli/build.py +37 -5
  19. hud/cli/dev.py +11 -2
  20. hud/cli/eval.py +51 -39
  21. hud/cli/flows/init.py +1 -1
  22. hud/cli/pull.py +1 -1
  23. hud/cli/push.py +9 -2
  24. hud/cli/tests/test_build.py +2 -2
  25. hud/cli/tests/test_push.py +1 -1
  26. hud/cli/utils/metadata.py +1 -1
  27. hud/cli/utils/tests/test_metadata.py +1 -1
  28. hud/clients/mcp_use.py +6 -1
  29. hud/datasets/loader.py +17 -18
  30. hud/datasets/runner.py +16 -10
  31. hud/datasets/tests/test_loader.py +15 -15
  32. hud/environment/__init__.py +5 -3
  33. hud/environment/connection.py +58 -6
  34. hud/environment/connectors/mcp_config.py +29 -1
  35. hud/environment/environment.py +218 -77
  36. hud/environment/router.py +175 -24
  37. hud/environment/scenarios.py +313 -186
  38. hud/environment/tests/test_connectors.py +10 -23
  39. hud/environment/tests/test_environment.py +432 -0
  40. hud/environment/tests/test_local_connectors.py +81 -40
  41. hud/environment/tests/test_scenarios.py +820 -14
  42. hud/eval/context.py +63 -10
  43. hud/eval/instrument.py +4 -2
  44. hud/eval/manager.py +79 -12
  45. hud/eval/task.py +36 -4
  46. hud/eval/tests/test_eval.py +1 -1
  47. hud/eval/tests/test_task.py +147 -1
  48. hud/eval/types.py +2 -0
  49. hud/eval/utils.py +14 -3
  50. hud/patches/mcp_patches.py +178 -21
  51. hud/telemetry/instrument.py +8 -1
  52. hud/telemetry/tests/test_eval_telemetry.py +8 -8
  53. hud/tools/__init__.py +2 -0
  54. hud/tools/agent.py +223 -0
  55. hud/tools/computer/__init__.py +34 -5
  56. hud/tools/shell.py +3 -3
  57. hud/tools/tests/test_agent_tool.py +355 -0
  58. hud/types.py +62 -34
  59. hud/utils/hud_console.py +30 -17
  60. hud/utils/strict_schema.py +1 -1
  61. hud/utils/tests/test_version.py +1 -1
  62. hud/version.py +1 -1
  63. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
  64. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
  65. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
  66. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  67. {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/cli/dev.py CHANGED
@@ -250,6 +250,15 @@ async def run_mcp_module(
     elif hasattr(module, "__dict__") and attr_name in module.__dict__:
         mcp_server = module.__dict__[attr_name]
 
+    # If default 'mcp' not found, try 'env' as fallback
+    if mcp_server is None and attr_name == "mcp":
+        for fallback in ["env", "environment", "server"]:
+            if hasattr(module, fallback):
+                mcp_server = getattr(module, fallback)
+                if verbose:
+                    hud_console.info(f"Found '{fallback}' instead of 'mcp'")
+                break
+
     if mcp_server is None:
         hud_console.error(f"Module '{module_name}' does not have '{attr_name}' defined")
         hud_console.info("")
@@ -258,8 +267,8 @@ async def run_mcp_module(
         hud_console.info("")
         hud_console.info("[bold cyan]Expected structure:[/bold cyan]")
         hud_console.info("    from hud.environment import Environment")
-        hud_console.info(f"    {attr_name} = Environment('my-env')")
-        raise AttributeError(f"Module '{module_name}' must define '{attr_name}'")
+        hud_console.info("    env = Environment('my-env')  # or mcp = ...")
+        raise AttributeError(f"Module '{module_name}' must define 'mcp', 'env', or 'environment'")
 
     # Only show full header on first run, brief message on reload
     if is_reload:
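With this fallback, `hud dev <module>` now discovers a server exposed under `env`, `environment`, or `server` when no `mcp` attribute exists. A minimal sketch of such a module, assuming only the `Environment('my-env')` constructor shown in the CLI hint above:

```python
# my_env.py -- minimal module that `hud dev` can now pick up (sketch, not taken from this diff).
from hud.environment import Environment

# Lookup order after this change: `mcp` first, then `env`, `environment`, `server`.
env = Environment("my-env")
```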
hud/cli/eval.py CHANGED
@@ -164,6 +164,7 @@ class EvalConfig(BaseModel):
         "auto_respond",
         "quiet",
         "gateway",
+        "taskset",
     }
     # Fields loaded from [agent] section
     _AGENT_FIELDS: ClassVar[set[str]] = {"allowed_tools", "disallowed_tools"}
@@ -184,6 +185,7 @@ class EvalConfig(BaseModel):
     remote: bool = False
     quiet: bool = False  # Suppress opening browser for eval links
     gateway: bool = False  # Use HUD Gateway for LLM API calls
+    taskset: str | None = None  # Taskset slug to associate job with
 
     # Base agent config (these merge with task's agent_config)
     allowed_tools: list[str] | None = None
@@ -338,47 +340,27 @@ class EvalConfig(BaseModel):
 
         # Configure gateway mode - route LLM API calls through HUD gateway
         if self.gateway:
-            hud_api_key = settings.api_key
-            if not hud_api_key:
+            if not settings.api_key:
                 raise typer.Exit(1)  # Already validated in validate_api_keys()
 
-            if self.agent_type == AgentType.CLAUDE:
-                from anthropic import AsyncAnthropic
+            from hud.agents.gateway import build_gateway_client
 
-                kwargs["model_client"] = AsyncAnthropic(
-                    api_key=hud_api_key,
-                    base_url=settings.hud_gateway_url,
-                )
-                hud_console.info("🌐 Using HUD Gateway for Claude API")
-            elif self.agent_type in (AgentType.OPENAI, AgentType.OPERATOR):
-                from openai import AsyncOpenAI
+            # Map AgentType to provider
+            agent_to_provider = {
+                AgentType.CLAUDE: "anthropic",
+                AgentType.OPENAI: "openai",
+                AgentType.OPERATOR: "openai",
+                AgentType.GEMINI: "gemini",
+                AgentType.GEMINI_CUA: "gemini",
+                AgentType.OPENAI_COMPATIBLE: "openai",
+            }
+            provider = agent_to_provider.get(self.agent_type, "openai")
+            client = build_gateway_client(provider)
 
-                kwargs["model_client"] = AsyncOpenAI(
-                    api_key=hud_api_key,
-                    base_url=settings.hud_gateway_url,
-                )
-                hud_console.info("🌐 Using HUD Gateway for OpenAI API")
-            elif self.agent_type == AgentType.OPENAI_COMPATIBLE:
-                from openai import AsyncOpenAI
-
-                kwargs["openai_client"] = AsyncOpenAI(
-                    api_key=hud_api_key,
-                    base_url=settings.hud_gateway_url,
-                )
-                hud_console.info("🌐 Using HUD Gateway for OpenAI-compatible API")
-            elif self.agent_type in (AgentType.GEMINI, AgentType.GEMINI_CUA):
-                from google import genai
-                from google.genai.types import HttpOptions
-
-                kwargs["model_client"] = genai.Client(
-                    api_key="PLACEHOLDER",
-                    http_options=HttpOptions(
-                        api_version="v1beta",
-                        base_url=settings.hud_gateway_url,
-                        headers={"Authorization": f"Bearer {hud_api_key}"},
-                    ),
-                )
-                hud_console.info("🌐 Using HUD Gateway for Gemini API")
+            # OpenAI-compatible uses openai_client key
+            is_oai_compat = self.agent_type == AgentType.OPENAI_COMPATIBLE
+            kwargs["openai_client" if is_oai_compat else "model_client"] = client
+            hud_console.info(f"🌐 Using HUD Gateway for {provider} API")
 
         return kwargs
 
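The per-provider client setup now lives in the new `hud/agents/gateway.py`, which is not shown in this diff. Judging only from the inline code removed above, `build_gateway_client` could plausibly look like the sketch below; the exact signature and internals are assumptions:

```python
# Hypothetical sketch of build_gateway_client (hud/agents/gateway.py is new and not shown here).
# It reuses the clients and settings that appeared in the removed inline branches above.
from hud.settings import settings


def build_gateway_client(provider: str):
    """Return an SDK client whose API calls are routed through the HUD Gateway."""
    api_key = settings.api_key
    if provider == "anthropic":
        from anthropic import AsyncAnthropic

        return AsyncAnthropic(api_key=api_key, base_url=settings.hud_gateway_url)
    if provider == "gemini":
        from google import genai
        from google.genai.types import HttpOptions

        return genai.Client(
            api_key="PLACEHOLDER",  # real auth is the Bearer header below
            http_options=HttpOptions(
                api_version="v1beta",
                base_url=settings.hud_gateway_url,
                headers={"Authorization": f"Bearer {api_key}"},
            ),
        )
    # "openai" covers both OpenAI and OpenAI-compatible providers
    from openai import AsyncOpenAI

    return AsyncOpenAI(api_key=api_key, base_url=settings.hud_gateway_url)
```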
@@ -584,7 +566,7 @@ class EvalConfig(BaseModel):
         table.add_row("", "")
         table.add_row(f"[dim]{self.agent_type.value} config[/dim]", "")
 
-        config_cls = self.agent_type.cls.config_cls
+        config_cls = self.agent_type.config_cls
         defaults = config_cls()
         overrides = self.agent_config.get(self.agent_type.value, {})
         skip = {
@@ -675,16 +657,41 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
         agent_kwargs = {
             k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
         }
-        # Create a job ID for tracking
         import uuid
 
         from hud.datasets.utils import submit_rollouts
+        from hud.eval.manager import _send_job_enter
 
         job_id = str(uuid.uuid4())
        hud_console.info(
             f"Submitting {len(tasks)} task(s) for remote execution (job_id: {job_id})…"
         )
 
+        if cfg.taskset:
+            tasks_to_create = [t for t in tasks if not t.id]
+            tasks_data = (
+                [t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
+                if tasks_to_create
+                else None
+            )
+            ids = await _send_job_enter(
+                job_id=job_id,
+                name=f"eval ({cfg.source})" if cfg.source else "eval",
+                variants=None,
+                group=cfg.group_size,
+                api_key=None,
+                taskset=cfg.taskset,
+                tasks=tasks_data,
+            )
+            if ids:
+                if len(ids) != len(tasks_to_create):
+                    hud_console.warning(
+                        f"Task count mismatch: sent {len(tasks_to_create)} tasks, "
+                        f"received {len(ids)} IDs. Some tasks may not be linked."
+                    )
+                for task_obj, task_version_id in zip(tasks_to_create, ids, strict=False):
+                    task_obj.id = task_version_id
+
         await submit_rollouts(
             tasks=tasks,
             job_id=job_id,
@@ -721,6 +728,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             max_concurrent=cfg.max_concurrent,
             group_size=cfg.group_size,
             quiet=cfg.quiet,
+            taskset=cfg.taskset,
         )
 
         # Show reward for single task
@@ -787,6 +795,9 @@ def eval_command(
     gateway: bool = typer.Option(
         False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
     ),
+    taskset: str | None = typer.Option(
+        None, "--taskset", "-t", help="Taskset slug to associate job with"
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
 
@@ -821,6 +832,7 @@ def eval_command(
         byok=byok,
         quiet=quiet,
         gateway=gateway,
+        taskset=taskset,
     )
 
     # Find source if not provided
hud/cli/flows/init.py CHANGED
@@ -102,7 +102,7 @@ def smart_init(
         hud_console.info("  hud set HUD_API_KEY=your-key-here")
         hud_console.info("  Or: export HUD_API_KEY=your-key")
         hud_console.info("")
-        hud_console.info("Get your key at: https://hud.ai/settings/api-keys")
+        hud_console.info("Get your key at: https://hud.ai/project/api-keys")
         return
 
     target = Path(directory).resolve()
hud/cli/pull.py CHANGED
@@ -63,7 +63,7 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
 
     # URL-encode the path segments to handle special characters in tags
     url_safe_path = "/".join(quote(part, safe="") for part in reference.split("/"))
-    registry_url = f"{settings.hud_telemetry_url.rstrip('/')}/registry/envs/{url_safe_path}"
+    registry_url = f"{settings.hud_api_url.rstrip('/')}/registry/envs/{url_safe_path}"
 
     headers = {}
     if settings.api_key:
hud/cli/push.py CHANGED
@@ -420,13 +420,20 @@ def push_environment(
 
     # URL-encode the path segments to handle special characters in tags
     url_safe_path = "/".join(quote(part, safe="") for part in name_with_tag.split("/"))
-    registry_url = f"{settings.hud_telemetry_url.rstrip('/')}/registry/envs/{url_safe_path}"
+    registry_url = f"{settings.hud_api_url.rstrip('/')}/registry/envs/{url_safe_path}"
+
+    # Detect git remote URL for matching existing GitHub-connected registries
+    from hud.cli.utils.git import get_git_remote_url
+
+    github_url = get_git_remote_url(Path(directory))
 
     # Prepare the payload
-    payload = {
+    payload: dict[str, str | None] = {
         "lock": yaml.dump(lock_data, default_flow_style=False, sort_keys=False),
         "digest": pushed_digest.split("@")[-1] if "@" in pushed_digest else None,
     }
+    if github_url:
+        payload["github_url"] = github_url
 
     headers = {"Authorization": f"Bearer {settings.api_key}"}
 
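`get_git_remote_url` is imported from `hud/cli/utils/git.py`, which is not part of this diff. A plausible sketch of such a helper, assumed rather than taken from the package, is just a `git remote get-url origin` call in the environment directory:

```python
# Hypothetical sketch of get_git_remote_url (the real hud/cli/utils/git.py is not shown in this diff).
import subprocess
from pathlib import Path


def get_git_remote_url(directory: Path) -> str | None:
    """Return the 'origin' remote URL of the repo at `directory`, or None if unavailable."""
    try:
        result = subprocess.run(
            ["git", "remote", "get-url", "origin"],
            cwd=directory,
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError, OSError):
        return None
    url = result.stdout.strip()
    return url or None
```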
hud/cli/tests/test_build.py CHANGED
@@ -60,12 +60,12 @@ class TestIncrementVersion:
     def test_increment_minor(self):
         """Test incrementing minor version."""
         assert increment_version("1.2.3", "minor") == "1.3.0"
-        assert increment_version("0.5.10", "minor") == "0.6.0"
+        assert increment_version("0.5.13", "minor") == "0.6.0"
 
     def test_increment_major(self):
         """Test incrementing major version."""
         assert increment_version("1.2.3", "major") == "2.0.0"
-        assert increment_version("0.5.10", "major") == "1.0.0"
+        assert increment_version("0.5.13", "major") == "1.0.0"
 
     def test_increment_with_v_prefix(self):
         """Test incrementing version with v prefix."""
hud/cli/tests/test_push.py CHANGED
@@ -160,7 +160,7 @@ class TestPushEnvironment:
         mock_hud_console = mock.Mock()
         mock_hud_console_class.return_value = mock_hud_console
         mock_settings.api_key = "test-key"
-        mock_settings.hud_telemetry_url = "https://api.hud.test"
+        mock_settings.hud_api_url = "https://api.hud.test"
         mock_get_username.return_value = "testuser"
 
         # Create lock file
hud/cli/utils/metadata.py CHANGED
@@ -32,7 +32,7 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
 
     # URL-encode the path segments to handle special characters in tags
     url_safe_path = "/".join(quote(part, safe="") for part in reference.split("/"))
-    registry_url = f"{settings.hud_telemetry_url.rstrip('/')}/registry/envs/{url_safe_path}"
+    registry_url = f"{settings.hud_api_url.rstrip('/')}/registry/envs/{url_safe_path}"
 
     headers = {}
     if settings.api_key:
hud/cli/utils/tests/test_metadata.py CHANGED
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
 @patch("hud.cli.utils.metadata.settings")
 @patch("requests.get")
 def test_fetch_lock_from_registry_success(mock_get, mock_settings):
-    mock_settings.hud_telemetry_url = "https://api.example.com"
+    mock_settings.hud_api_url = "https://api.example.com"
     mock_settings.api_key = None
     resp = MagicMock(status_code=200)
     resp.json.return_value = {"lock": "image: img\n"}
hud/clients/mcp_use.py CHANGED
@@ -64,9 +64,14 @@ class MCPUseHUDClient(BaseHUDClient):
             return
 
         # Use configurable timeout for SSE read operations to support long-running tool calls.
+        max_request_timeout = 840
         for server_cfg in mcp_config.values():
             if "sse_read_timeout" not in server_cfg:
-                server_cfg["sse_read_timeout"] = settings.client_timeout
+                server_cfg["sse_read_timeout"] = (
+                    min(settings.client_timeout, max_request_timeout)
+                    if settings.client_timeout > 0
+                    else max_request_timeout
+                )
 
         # If a server target matches HUD's MCP host and no auth is provided,
         # inject the HUD API key as a Bearer token to avoid OAuth browser flow.
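Net effect: the default `sse_read_timeout` is now capped at 840 seconds, and the cap also applies when `client_timeout` is unset or non-positive. A standalone illustration of the same selection logic (not library code):

```python
# Illustration of the timeout selection above; mirrors the expression in the diff.
MAX_REQUEST_TIMEOUT = 840  # seconds


def effective_sse_read_timeout(client_timeout: float) -> float:
    """Clamp a configured client timeout to the per-request ceiling."""
    if client_timeout > 0:
        return min(client_timeout, MAX_REQUEST_TIMEOUT)
    return MAX_REQUEST_TIMEOUT


assert effective_sse_read_timeout(0) == 840      # unset -> ceiling
assert effective_sse_read_timeout(300) == 300    # small value kept
assert effective_sse_read_timeout(3600) == 840   # large value capped
```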
hud/datasets/loader.py CHANGED
@@ -14,6 +14,10 @@ import warnings
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, overload
 
+import httpx
+
+from hud.settings import settings
+
 if TYPE_CHECKING:
     from hud.eval.task import Task
 
@@ -63,7 +67,8 @@ def _load_from_file(path: Path) -> list[Task]:
     from hud.eval.task import Task
 
     raw_items = _load_raw_from_file(path)
-    return [Task(**item) for item in raw_items]
+    # Default args to {} for runnable tasks (None = template)
+    return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
 
 
 def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]:
@@ -99,15 +104,12 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]:
     raw_items = _load_raw_from_huggingface(dataset_name)
     from hud.eval.task import Task
 
-    return [Task(**item) for item in raw_items]
+    # Default args to {} for runnable tasks (None = template)
+    return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
 
 
 def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
     """Load raw task dicts from HUD API."""
-    import httpx
-
-    from hud.settings import settings
-
     headers = {}
     if settings.api_key:
         headers["Authorization"] = f"Bearer {settings.api_key}"
@@ -138,7 +140,8 @@ def _load_from_api(dataset_name: str) -> list[Task]:
     from hud.eval.task import Task
 
     raw_items = _load_raw_from_api(dataset_name)
-    return [Task(**item) for item in raw_items]
+    # Default args to {} for runnable tasks (None = template)
+    return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
 
 
 @overload
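The same normalization appears in all three loaders: an absent or `None` `args` field is replaced with `{}` so the resulting `Task` is runnable, while `None` stays reserved for templates. A small illustration of the expression used above:

```python
# Illustration of the args normalization used by the three loaders above.
item = {"scenario": "checkout", "args": None}
normalized = {**item, "args": item.get("args") or {}}
assert normalized == {"scenario": "checkout", "args": {}}

# A task that already carries args keeps them unchanged.
item = {"scenario": "checkout", "args": {"user": "alice"}}
assert {**item, "args": item.get("args") or {}} == item
```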
@@ -234,15 +237,15 @@
 ) -> str:
     """Save tasks to the HUD API.
 
-    Creates or updates an evalset with the given tasks.
+    Creates or updates a taskset with the given tasks.
 
     Args:
-        name: Evalset name/slug (e.g., "my-evals/benchmark-v1").
+        name: Taskset name/slug (e.g., "my-evals/benchmark-v1").
             If no org prefix, uses user's default org.
         tasks: List of Task objects (v5 format) to save.
 
     Returns:
-        The evalset ID of the created/updated evalset.
+        The taskset ID of the created/updated taskset.
 
     Example:
         ```python
@@ -258,7 +261,7 @@ def save_tasks(
         ]
 
         # Save to HUD API
-        evalset_id = save_tasks("my-evals/benchmark-v1", tasks)
+        taskset_id = save_tasks("my-evals/benchmark-v1", tasks)
 
         # Later, load them back
         loaded = load_tasks("my-evals/benchmark-v1")
@@ -268,10 +271,6 @@ def save_tasks(
         TypeError: If any task is not a v5 Task object (must have 'scenario')
         ValueError: If API key is not set or save fails
     """
-    import httpx
-
-    from hud.settings import settings
-
     if not settings.api_key:
         raise ValueError("HUD_API_KEY is required to save tasks")
 
@@ -303,9 +302,9 @@ def save_tasks(
         )
         response.raise_for_status()
         data = response.json()
-        evalset_id = data.get("evalset_id") or data.get("id") or name
-        logger.info("Saved %d tasks to evalset: %s", len(tasks), evalset_id)
-        return evalset_id
+        taskset_id = data.get("evalset_id") or data.get("id") or name
+        logger.info("Saved %d tasks to taskset: %s", len(tasks), taskset_id)
+        return taskset_id
     except httpx.HTTPStatusError as e:
         raise ValueError(f"Failed to save tasks: {e.response.text}") from e
     except Exception as e:
hud/datasets/runner.py CHANGED
@@ -29,6 +29,7 @@ async def run_dataset(
     max_concurrent: int = 30,
     group_size: int = 1,
     quiet: bool = True,
+    taskset: str | None = None,
 ) -> list[EvalContext]:
     """Run an agent on a dataset of tasks.
 
@@ -40,7 +41,7 @@ async def run_dataset(
             - A source string (file path, API slug) - loaded via load_tasks()
             - A single TaskInput (Task, LegacyTask, or dict)
             - A list of TaskInput objects
-        agent_type: Type of agent to create (e.g., "claude", "openai", AgentType.CLAUDE).
+        agent_type: Agent type (e.g., "claude", "openai", AgentType.CLAUDE).
         agent_params: Parameters to pass to agent.create().
         max_steps: Maximum steps per task.
         max_concurrent: Maximum concurrent tasks (for parallel execution).
@@ -70,6 +71,10 @@ async def run_dataset(
     from hud.datasets.loader import load_tasks
     from hud.eval.task import Task
 
+    # Normalize agent_type to AgentType enum
+    if isinstance(agent_type, str):
+        agent_type = AgentType(agent_type)
+
     # Normalize tasks to list[Task]
     task_list: list[Task]
     if isinstance(tasks, str):
@@ -86,19 +91,21 @@ async def run_dataset(
     if not task_list:
         raise ValueError("No tasks to run")
 
-    # Resolve agent class
-    agent_type_enum = agent_type if isinstance(agent_type, AgentType) else AgentType(agent_type)
-    agent_cls = agent_type_enum.cls
-
     # Use hud.eval() for both single and parallel execution
     async with hud.eval(
         task_list,
         group=group_size,
         max_concurrent=max_concurrent,
         quiet=quiet,
+        taskset=taskset,
     ) as ctx:
-        # Create agent fresh for each context (ensures correct tool initialization)
-        agent = agent_cls.create(**(agent_params or {}))
+        # Build agent params - use system_prompt from ctx (set from task.agent_config)
+        final_agent_params = dict(agent_params or {})
+        if ctx.system_prompt and "system_prompt" not in final_agent_params:
+            final_agent_params["system_prompt"] = ctx.system_prompt
+
+        # Create agent using AgentType.cls.create()
+        agent = agent_type.cls.create(**final_agent_params)
         await agent.run(ctx, max_steps=max_steps)
         # Reward is computed by EvalContext.__aexit__ from evaluate tools
 
@@ -198,9 +205,8 @@ async def run_single_task(
     if ctx.system_prompt and "system_prompt" not in final_agent_params:
         final_agent_params["system_prompt"] = ctx.system_prompt
 
-    # Create agent inside ctx so it has access to context-derived values
-    agent_cls = agent_type.cls
-    agent = agent_cls.create(**final_agent_params)
+    # Create agent using AgentType.cls.create()
+    agent = agent_type.cls.create(**final_agent_params)
 
     # Store metadata if provided
     if metadata:
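With the string normalization and the new `taskset` parameter, `run_dataset` can be called with a plain agent name. A usage sketch under the signature shown in these hunks; the dataset slug, taskset slug, and agent params are placeholders:

```python
# Usage sketch for run_dataset with the new taskset parameter (slugs and params are placeholders).
import asyncio

from hud.datasets.runner import run_dataset


async def main() -> None:
    contexts = await run_dataset(
        "my-org/benchmark-v1",          # source string, loaded via load_tasks()
        agent_type="claude",            # plain string is normalized to AgentType
        agent_params=None,              # or kwargs for <Agent>.create()
        max_steps=20,
        max_concurrent=10,
        group_size=1,
        taskset="my-org/benchmark-v1",  # taskset slug to associate the job with
    )
    print(f"Completed {len(contexts)} evaluation contexts")


asyncio.run(main())
```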
hud/datasets/tests/test_loader.py CHANGED
@@ -12,8 +12,8 @@ from hud.datasets.loader import load_tasks
 class TestLoadTasks:
     """Tests for load_tasks() function."""
 
-    @patch("httpx.Client")
-    @patch("hud.settings.settings")
+    @patch("hud.datasets.loader.httpx.Client")
+    @patch("hud.datasets.loader.settings")
     def test_load_tasks_success(
         self, mock_settings: MagicMock, mock_client_class: MagicMock
     ) -> None:
@@ -22,7 +22,7 @@ class TestLoadTasks:
         mock_settings.api_key = "test_key"
 
         mock_response = MagicMock()
-        # New EvalsetTasksResponse format: tasks keyed by task ID
+        # EvalsetTasksResponse format: tasks keyed by task ID
         mock_response.json.return_value = {
             "evalset_id": "evalset-123",
             "evalset_name": "test-dataset",
@@ -62,8 +62,8 @@ class TestLoadTasks:
             params={"all": "true"},
         )
 
-    @patch("httpx.Client")
-    @patch("hud.settings.settings")
+    @patch("hud.datasets.loader.httpx.Client")
+    @patch("hud.datasets.loader.settings")
     def test_load_tasks_single_task(
         self, mock_settings: MagicMock, mock_client_class: MagicMock
     ) -> None:
@@ -97,8 +97,8 @@ class TestLoadTasks:
         assert tasks[0].scenario == "checkout"
         assert tasks[0].id == "task-1"
 
-    @patch("httpx.Client")
-    @patch("hud.settings.settings")
+    @patch("hud.datasets.loader.httpx.Client")
+    @patch("hud.datasets.loader.settings")
     def test_load_tasks_no_api_key(
         self, mock_settings: MagicMock, mock_client_class: MagicMock
     ) -> None:
@@ -129,8 +129,8 @@ class TestLoadTasks:
             params={"all": "true"},
         )
 
-    @patch("httpx.Client")
-    @patch("hud.settings.settings")
+    @patch("hud.datasets.loader.httpx.Client")
+    @patch("hud.datasets.loader.settings")
     def test_load_tasks_http_error(
         self, mock_settings: MagicMock, mock_client_class: MagicMock
     ) -> None:
@@ -149,8 +149,8 @@ class TestLoadTasks:
         with pytest.raises(ValueError, match="Failed to load tasks"):
             load_tasks("test-org/test-dataset")
 
-    @patch("httpx.Client")
-    @patch("hud.settings.settings")
+    @patch("hud.datasets.loader.httpx.Client")
+    @patch("hud.datasets.loader.settings")
     def test_load_tasks_json_error(
         self, mock_settings: MagicMock, mock_client_class: MagicMock
     ) -> None:
@@ -171,8 +171,8 @@ class TestLoadTasks:
         with pytest.raises(ValueError, match="Failed to load tasks"):
             load_tasks("test-org/test-dataset")
 
-    @patch("httpx.Client")
-    @patch("hud.settings.settings")
+    @patch("hud.datasets.loader.httpx.Client")
+    @patch("hud.datasets.loader.settings")
     def test_load_tasks_empty(self, mock_settings: MagicMock, mock_client_class: MagicMock) -> None:
         """load_tasks() handles empty dataset."""
         mock_settings.hud_api_url = "https://api.hud.ai"
@@ -192,8 +192,8 @@
 
         assert len(tasks) == 0
 
-    @patch("httpx.Client")
-    @patch("hud.settings.settings")
+    @patch("hud.datasets.loader.httpx.Client")
+    @patch("hud.datasets.loader.settings")
     def test_load_tasks_missing_fields(
         self, mock_settings: MagicMock, mock_client_class: MagicMock
     ) -> None:
hud/environment/__init__.py CHANGED
@@ -27,8 +27,8 @@ Usage:
 from hud.environment.connection import ConnectionConfig, ConnectionType, Connector
 from hud.environment.environment import Environment
 from hud.environment.mock import MockMixin, generate_mock_value
-from hud.environment.router import ConflictResolution, ToolRouter
-from hud.environment.scenarios import ScenarioMixin
+from hud.environment.router import ConflictResolution, MCPRouter, ToolRouter
+from hud.environment.scenarios import ScenarioMixin, ScenarioSession
 from hud.environment.types import EnvConfig
 from hud.environment.utils import ToolFormat, format_result, parse_tool_call, parse_tool_calls
 
@@ -39,10 +39,12 @@ __all__ = [
     "Connector",
     "EnvConfig",
     "Environment",
+    "MCPRouter",
     "MockMixin",
     "ScenarioMixin",
+    "ScenarioSession",
     "ToolFormat",
-    "ToolRouter",
+    "ToolRouter",  # Backwards compat alias for MCPRouter
     "format_result",
     "generate_mock_value",
     "parse_tool_call",