hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +1 -1
- hud/agents/__init__.py +65 -6
- hud/agents/base.py +33 -15
- hud/agents/claude.py +60 -31
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +15 -26
- hud/agents/gemini_cua.py +6 -17
- hud/agents/misc/response_agent.py +7 -0
- hud/agents/openai.py +16 -29
- hud/agents/openai_chat.py +3 -19
- hud/agents/operator.py +5 -17
- hud/agents/resolver.py +70 -0
- hud/agents/tests/test_claude.py +2 -4
- hud/agents/tests/test_openai.py +2 -1
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +34 -3
- hud/cli/build.py +37 -5
- hud/cli/dev.py +11 -2
- hud/cli/eval.py +51 -39
- hud/cli/flows/init.py +1 -1
- hud/cli/pull.py +1 -1
- hud/cli/push.py +9 -2
- hud/cli/tests/test_build.py +2 -2
- hud/cli/tests/test_push.py +1 -1
- hud/cli/utils/metadata.py +1 -1
- hud/cli/utils/tests/test_metadata.py +1 -1
- hud/clients/mcp_use.py +6 -1
- hud/datasets/loader.py +17 -18
- hud/datasets/runner.py +16 -10
- hud/datasets/tests/test_loader.py +15 -15
- hud/environment/__init__.py +5 -3
- hud/environment/connection.py +58 -6
- hud/environment/connectors/mcp_config.py +29 -1
- hud/environment/environment.py +218 -77
- hud/environment/router.py +175 -24
- hud/environment/scenarios.py +313 -186
- hud/environment/tests/test_connectors.py +10 -23
- hud/environment/tests/test_environment.py +432 -0
- hud/environment/tests/test_local_connectors.py +81 -40
- hud/environment/tests/test_scenarios.py +820 -14
- hud/eval/context.py +63 -10
- hud/eval/instrument.py +4 -2
- hud/eval/manager.py +79 -12
- hud/eval/task.py +36 -4
- hud/eval/tests/test_eval.py +1 -1
- hud/eval/tests/test_task.py +147 -1
- hud/eval/types.py +2 -0
- hud/eval/utils.py +14 -3
- hud/patches/mcp_patches.py +178 -21
- hud/telemetry/instrument.py +8 -1
- hud/telemetry/tests/test_eval_telemetry.py +8 -8
- hud/tools/__init__.py +2 -0
- hud/tools/agent.py +223 -0
- hud/tools/computer/__init__.py +34 -5
- hud/tools/shell.py +3 -3
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/types.py +62 -34
- hud/utils/hud_console.py +30 -17
- hud/utils/strict_schema.py +1 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/cli/dev.py
CHANGED
|
@@ -250,6 +250,15 @@ async def run_mcp_module(
|
|
|
250
250
|
elif hasattr(module, "__dict__") and attr_name in module.__dict__:
|
|
251
251
|
mcp_server = module.__dict__[attr_name]
|
|
252
252
|
|
|
253
|
+
# If default 'mcp' not found, try 'env' as fallback
|
|
254
|
+
if mcp_server is None and attr_name == "mcp":
|
|
255
|
+
for fallback in ["env", "environment", "server"]:
|
|
256
|
+
if hasattr(module, fallback):
|
|
257
|
+
mcp_server = getattr(module, fallback)
|
|
258
|
+
if verbose:
|
|
259
|
+
hud_console.info(f"Found '{fallback}' instead of 'mcp'")
|
|
260
|
+
break
|
|
261
|
+
|
|
253
262
|
if mcp_server is None:
|
|
254
263
|
hud_console.error(f"Module '{module_name}' does not have '{attr_name}' defined")
|
|
255
264
|
hud_console.info("")
|
|
@@ -258,8 +267,8 @@ async def run_mcp_module(
|
|
|
258
267
|
hud_console.info("")
|
|
259
268
|
hud_console.info("[bold cyan]Expected structure:[/bold cyan]")
|
|
260
269
|
hud_console.info(" from hud.environment import Environment")
|
|
261
|
-
hud_console.info(
|
|
262
|
-
raise AttributeError(f"Module '{module_name}' must define '
|
|
270
|
+
hud_console.info(" env = Environment('my-env') # or mcp = ...")
|
|
271
|
+
raise AttributeError(f"Module '{module_name}' must define 'mcp', 'env', or 'environment'")
|
|
263
272
|
|
|
264
273
|
# Only show full header on first run, brief message on reload
|
|
265
274
|
if is_reload:
|
hud/cli/eval.py
CHANGED
|
@@ -164,6 +164,7 @@ class EvalConfig(BaseModel):
|
|
|
164
164
|
"auto_respond",
|
|
165
165
|
"quiet",
|
|
166
166
|
"gateway",
|
|
167
|
+
"taskset",
|
|
167
168
|
}
|
|
168
169
|
# Fields loaded from [agent] section
|
|
169
170
|
_AGENT_FIELDS: ClassVar[set[str]] = {"allowed_tools", "disallowed_tools"}
|
|
@@ -184,6 +185,7 @@ class EvalConfig(BaseModel):
|
|
|
184
185
|
remote: bool = False
|
|
185
186
|
quiet: bool = False # Suppress opening browser for eval links
|
|
186
187
|
gateway: bool = False # Use HUD Gateway for LLM API calls
|
|
188
|
+
taskset: str | None = None # Taskset slug to associate job with
|
|
187
189
|
|
|
188
190
|
# Base agent config (these merge with task's agent_config)
|
|
189
191
|
allowed_tools: list[str] | None = None
|
|
@@ -338,47 +340,27 @@ class EvalConfig(BaseModel):
|
|
|
338
340
|
|
|
339
341
|
# Configure gateway mode - route LLM API calls through HUD gateway
|
|
340
342
|
if self.gateway:
|
|
341
|
-
|
|
342
|
-
if not hud_api_key:
|
|
343
|
+
if not settings.api_key:
|
|
343
344
|
raise typer.Exit(1) # Already validated in validate_api_keys()
|
|
344
345
|
|
|
345
|
-
|
|
346
|
-
from anthropic import AsyncAnthropic
|
|
346
|
+
from hud.agents.gateway import build_gateway_client
|
|
347
347
|
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
348
|
+
# Map AgentType to provider
|
|
349
|
+
agent_to_provider = {
|
|
350
|
+
AgentType.CLAUDE: "anthropic",
|
|
351
|
+
AgentType.OPENAI: "openai",
|
|
352
|
+
AgentType.OPERATOR: "openai",
|
|
353
|
+
AgentType.GEMINI: "gemini",
|
|
354
|
+
AgentType.GEMINI_CUA: "gemini",
|
|
355
|
+
AgentType.OPENAI_COMPATIBLE: "openai",
|
|
356
|
+
}
|
|
357
|
+
provider = agent_to_provider.get(self.agent_type, "openai")
|
|
358
|
+
client = build_gateway_client(provider)
|
|
355
359
|
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
hud_console.info("🌐 Using HUD Gateway for OpenAI API")
|
|
361
|
-
elif self.agent_type == AgentType.OPENAI_COMPATIBLE:
|
|
362
|
-
from openai import AsyncOpenAI
|
|
363
|
-
|
|
364
|
-
kwargs["openai_client"] = AsyncOpenAI(
|
|
365
|
-
api_key=hud_api_key,
|
|
366
|
-
base_url=settings.hud_gateway_url,
|
|
367
|
-
)
|
|
368
|
-
hud_console.info("🌐 Using HUD Gateway for OpenAI-compatible API")
|
|
369
|
-
elif self.agent_type in (AgentType.GEMINI, AgentType.GEMINI_CUA):
|
|
370
|
-
from google import genai
|
|
371
|
-
from google.genai.types import HttpOptions
|
|
372
|
-
|
|
373
|
-
kwargs["model_client"] = genai.Client(
|
|
374
|
-
api_key="PLACEHOLDER",
|
|
375
|
-
http_options=HttpOptions(
|
|
376
|
-
api_version="v1beta",
|
|
377
|
-
base_url=settings.hud_gateway_url,
|
|
378
|
-
headers={"Authorization": f"Bearer {hud_api_key}"},
|
|
379
|
-
),
|
|
380
|
-
)
|
|
381
|
-
hud_console.info("🌐 Using HUD Gateway for Gemini API")
|
|
360
|
+
# OpenAI-compatible uses openai_client key
|
|
361
|
+
is_oai_compat = self.agent_type == AgentType.OPENAI_COMPATIBLE
|
|
362
|
+
kwargs["openai_client" if is_oai_compat else "model_client"] = client
|
|
363
|
+
hud_console.info(f"🌐 Using HUD Gateway for {provider} API")
|
|
382
364
|
|
|
383
365
|
return kwargs
|
|
384
366
|
|
|
@@ -584,7 +566,7 @@ class EvalConfig(BaseModel):
|
|
|
584
566
|
table.add_row("", "")
|
|
585
567
|
table.add_row(f"[dim]{self.agent_type.value} config[/dim]", "")
|
|
586
568
|
|
|
587
|
-
config_cls = self.agent_type.
|
|
569
|
+
config_cls = self.agent_type.config_cls
|
|
588
570
|
defaults = config_cls()
|
|
589
571
|
overrides = self.agent_config.get(self.agent_type.value, {})
|
|
590
572
|
skip = {
|
|
@@ -675,16 +657,41 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
|
|
|
675
657
|
agent_kwargs = {
|
|
676
658
|
k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
|
|
677
659
|
}
|
|
678
|
-
# Create a job ID for tracking
|
|
679
660
|
import uuid
|
|
680
661
|
|
|
681
662
|
from hud.datasets.utils import submit_rollouts
|
|
663
|
+
from hud.eval.manager import _send_job_enter
|
|
682
664
|
|
|
683
665
|
job_id = str(uuid.uuid4())
|
|
684
666
|
hud_console.info(
|
|
685
667
|
f"Submitting {len(tasks)} task(s) for remote execution (job_id: {job_id})…"
|
|
686
668
|
)
|
|
687
669
|
|
|
670
|
+
if cfg.taskset:
|
|
671
|
+
tasks_to_create = [t for t in tasks if not t.id]
|
|
672
|
+
tasks_data = (
|
|
673
|
+
[t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
|
|
674
|
+
if tasks_to_create
|
|
675
|
+
else None
|
|
676
|
+
)
|
|
677
|
+
ids = await _send_job_enter(
|
|
678
|
+
job_id=job_id,
|
|
679
|
+
name=f"eval ({cfg.source})" if cfg.source else "eval",
|
|
680
|
+
variants=None,
|
|
681
|
+
group=cfg.group_size,
|
|
682
|
+
api_key=None,
|
|
683
|
+
taskset=cfg.taskset,
|
|
684
|
+
tasks=tasks_data,
|
|
685
|
+
)
|
|
686
|
+
if ids:
|
|
687
|
+
if len(ids) != len(tasks_to_create):
|
|
688
|
+
hud_console.warning(
|
|
689
|
+
f"Task count mismatch: sent {len(tasks_to_create)} tasks, "
|
|
690
|
+
f"received {len(ids)} IDs. Some tasks may not be linked."
|
|
691
|
+
)
|
|
692
|
+
for task_obj, task_version_id in zip(tasks_to_create, ids, strict=False):
|
|
693
|
+
task_obj.id = task_version_id
|
|
694
|
+
|
|
688
695
|
await submit_rollouts(
|
|
689
696
|
tasks=tasks,
|
|
690
697
|
job_id=job_id,
|
|
@@ -721,6 +728,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
|
|
|
721
728
|
max_concurrent=cfg.max_concurrent,
|
|
722
729
|
group_size=cfg.group_size,
|
|
723
730
|
quiet=cfg.quiet,
|
|
731
|
+
taskset=cfg.taskset,
|
|
724
732
|
)
|
|
725
733
|
|
|
726
734
|
# Show reward for single task
|
|
@@ -787,6 +795,9 @@ def eval_command(
|
|
|
787
795
|
gateway: bool = typer.Option(
|
|
788
796
|
False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
|
|
789
797
|
),
|
|
798
|
+
taskset: str | None = typer.Option(
|
|
799
|
+
None, "--taskset", "-t", help="Taskset slug to associate job with"
|
|
800
|
+
),
|
|
790
801
|
) -> None:
|
|
791
802
|
"""🚀 Run evaluation on datasets or individual tasks with agents.
|
|
792
803
|
|
|
@@ -821,6 +832,7 @@ def eval_command(
|
|
|
821
832
|
byok=byok,
|
|
822
833
|
quiet=quiet,
|
|
823
834
|
gateway=gateway,
|
|
835
|
+
taskset=taskset,
|
|
824
836
|
)
|
|
825
837
|
|
|
826
838
|
# Find source if not provided
|
hud/cli/flows/init.py
CHANGED
|
@@ -102,7 +102,7 @@ def smart_init(
|
|
|
102
102
|
hud_console.info(" hud set HUD_API_KEY=your-key-here")
|
|
103
103
|
hud_console.info(" Or: export HUD_API_KEY=your-key")
|
|
104
104
|
hud_console.info("")
|
|
105
|
-
hud_console.info("Get your key at: https://hud.ai/
|
|
105
|
+
hud_console.info("Get your key at: https://hud.ai/project/api-keys")
|
|
106
106
|
return
|
|
107
107
|
|
|
108
108
|
target = Path(directory).resolve()
|
hud/cli/pull.py
CHANGED
|
@@ -63,7 +63,7 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
|
|
|
63
63
|
|
|
64
64
|
# URL-encode the path segments to handle special characters in tags
|
|
65
65
|
url_safe_path = "/".join(quote(part, safe="") for part in reference.split("/"))
|
|
66
|
-
registry_url = f"{settings.
|
|
66
|
+
registry_url = f"{settings.hud_api_url.rstrip('/')}/registry/envs/{url_safe_path}"
|
|
67
67
|
|
|
68
68
|
headers = {}
|
|
69
69
|
if settings.api_key:
|
hud/cli/push.py
CHANGED
|
@@ -420,13 +420,20 @@ def push_environment(
|
|
|
420
420
|
|
|
421
421
|
# URL-encode the path segments to handle special characters in tags
|
|
422
422
|
url_safe_path = "/".join(quote(part, safe="") for part in name_with_tag.split("/"))
|
|
423
|
-
registry_url = f"{settings.
|
|
423
|
+
registry_url = f"{settings.hud_api_url.rstrip('/')}/registry/envs/{url_safe_path}"
|
|
424
|
+
|
|
425
|
+
# Detect git remote URL for matching existing GitHub-connected registries
|
|
426
|
+
from hud.cli.utils.git import get_git_remote_url
|
|
427
|
+
|
|
428
|
+
github_url = get_git_remote_url(Path(directory))
|
|
424
429
|
|
|
425
430
|
# Prepare the payload
|
|
426
|
-
payload = {
|
|
431
|
+
payload: dict[str, str | None] = {
|
|
427
432
|
"lock": yaml.dump(lock_data, default_flow_style=False, sort_keys=False),
|
|
428
433
|
"digest": pushed_digest.split("@")[-1] if "@" in pushed_digest else None,
|
|
429
434
|
}
|
|
435
|
+
if github_url:
|
|
436
|
+
payload["github_url"] = github_url
|
|
430
437
|
|
|
431
438
|
headers = {"Authorization": f"Bearer {settings.api_key}"}
|
|
432
439
|
|
hud/cli/tests/test_build.py
CHANGED
|
@@ -60,12 +60,12 @@ class TestIncrementVersion:
|
|
|
60
60
|
def test_increment_minor(self):
|
|
61
61
|
"""Test incrementing minor version."""
|
|
62
62
|
assert increment_version("1.2.3", "minor") == "1.3.0"
|
|
63
|
-
assert increment_version("0.5.
|
|
63
|
+
assert increment_version("0.5.13", "minor") == "0.6.0"
|
|
64
64
|
|
|
65
65
|
def test_increment_major(self):
|
|
66
66
|
"""Test incrementing major version."""
|
|
67
67
|
assert increment_version("1.2.3", "major") == "2.0.0"
|
|
68
|
-
assert increment_version("0.5.
|
|
68
|
+
assert increment_version("0.5.13", "major") == "1.0.0"
|
|
69
69
|
|
|
70
70
|
def test_increment_with_v_prefix(self):
|
|
71
71
|
"""Test incrementing version with v prefix."""
|
hud/cli/tests/test_push.py
CHANGED
|
@@ -160,7 +160,7 @@ class TestPushEnvironment:
|
|
|
160
160
|
mock_hud_console = mock.Mock()
|
|
161
161
|
mock_hud_console_class.return_value = mock_hud_console
|
|
162
162
|
mock_settings.api_key = "test-key"
|
|
163
|
-
mock_settings.
|
|
163
|
+
mock_settings.hud_api_url = "https://api.hud.test"
|
|
164
164
|
mock_get_username.return_value = "testuser"
|
|
165
165
|
|
|
166
166
|
# Create lock file
|
hud/cli/utils/metadata.py
CHANGED
|
@@ -32,7 +32,7 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
|
|
|
32
32
|
|
|
33
33
|
# URL-encode the path segments to handle special characters in tags
|
|
34
34
|
url_safe_path = "/".join(quote(part, safe="") for part in reference.split("/"))
|
|
35
|
-
registry_url = f"{settings.
|
|
35
|
+
registry_url = f"{settings.hud_api_url.rstrip('/')}/registry/envs/{url_safe_path}"
|
|
36
36
|
|
|
37
37
|
headers = {}
|
|
38
38
|
if settings.api_key:
|
|
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
|
|
|
18
18
|
@patch("hud.cli.utils.metadata.settings")
|
|
19
19
|
@patch("requests.get")
|
|
20
20
|
def test_fetch_lock_from_registry_success(mock_get, mock_settings):
|
|
21
|
-
mock_settings.
|
|
21
|
+
mock_settings.hud_api_url = "https://api.example.com"
|
|
22
22
|
mock_settings.api_key = None
|
|
23
23
|
resp = MagicMock(status_code=200)
|
|
24
24
|
resp.json.return_value = {"lock": "image: img\n"}
|
hud/clients/mcp_use.py
CHANGED
|
@@ -64,9 +64,14 @@ class MCPUseHUDClient(BaseHUDClient):
|
|
|
64
64
|
return
|
|
65
65
|
|
|
66
66
|
# Use configurable timeout for SSE read operations to support long-running tool calls.
|
|
67
|
+
max_request_timeout = 840
|
|
67
68
|
for server_cfg in mcp_config.values():
|
|
68
69
|
if "sse_read_timeout" not in server_cfg:
|
|
69
|
-
server_cfg["sse_read_timeout"] =
|
|
70
|
+
server_cfg["sse_read_timeout"] = (
|
|
71
|
+
min(settings.client_timeout, max_request_timeout)
|
|
72
|
+
if settings.client_timeout > 0
|
|
73
|
+
else max_request_timeout
|
|
74
|
+
)
|
|
70
75
|
|
|
71
76
|
# If a server target matches HUD's MCP host and no auth is provided,
|
|
72
77
|
# inject the HUD API key as a Bearer token to avoid OAuth browser flow.
|
hud/datasets/loader.py
CHANGED
|
@@ -14,6 +14,10 @@ import warnings
|
|
|
14
14
|
from pathlib import Path
|
|
15
15
|
from typing import TYPE_CHECKING, Any, overload
|
|
16
16
|
|
|
17
|
+
import httpx
|
|
18
|
+
|
|
19
|
+
from hud.settings import settings
|
|
20
|
+
|
|
17
21
|
if TYPE_CHECKING:
|
|
18
22
|
from hud.eval.task import Task
|
|
19
23
|
|
|
@@ -63,7 +67,8 @@ def _load_from_file(path: Path) -> list[Task]:
|
|
|
63
67
|
from hud.eval.task import Task
|
|
64
68
|
|
|
65
69
|
raw_items = _load_raw_from_file(path)
|
|
66
|
-
|
|
70
|
+
# Default args to {} for runnable tasks (None = template)
|
|
71
|
+
return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
|
|
67
72
|
|
|
68
73
|
|
|
69
74
|
def _load_raw_from_huggingface(dataset_name: str) -> list[dict[str, Any]]:
|
|
@@ -99,15 +104,12 @@ def _load_from_huggingface(dataset_name: str) -> list[Task]:
|
|
|
99
104
|
raw_items = _load_raw_from_huggingface(dataset_name)
|
|
100
105
|
from hud.eval.task import Task
|
|
101
106
|
|
|
102
|
-
|
|
107
|
+
# Default args to {} for runnable tasks (None = template)
|
|
108
|
+
return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
|
|
103
109
|
|
|
104
110
|
|
|
105
111
|
def _load_raw_from_api(dataset_name: str) -> list[dict[str, Any]]:
|
|
106
112
|
"""Load raw task dicts from HUD API."""
|
|
107
|
-
import httpx
|
|
108
|
-
|
|
109
|
-
from hud.settings import settings
|
|
110
|
-
|
|
111
113
|
headers = {}
|
|
112
114
|
if settings.api_key:
|
|
113
115
|
headers["Authorization"] = f"Bearer {settings.api_key}"
|
|
@@ -138,7 +140,8 @@ def _load_from_api(dataset_name: str) -> list[Task]:
|
|
|
138
140
|
from hud.eval.task import Task
|
|
139
141
|
|
|
140
142
|
raw_items = _load_raw_from_api(dataset_name)
|
|
141
|
-
|
|
143
|
+
# Default args to {} for runnable tasks (None = template)
|
|
144
|
+
return [Task(**{**item, "args": item.get("args") or {}}) for item in raw_items]
|
|
142
145
|
|
|
143
146
|
|
|
144
147
|
@overload
|
|
@@ -234,15 +237,15 @@ def save_tasks(
|
|
|
234
237
|
) -> str:
|
|
235
238
|
"""Save tasks to the HUD API.
|
|
236
239
|
|
|
237
|
-
Creates or updates
|
|
240
|
+
Creates or updates a taskset with the given tasks.
|
|
238
241
|
|
|
239
242
|
Args:
|
|
240
|
-
name:
|
|
243
|
+
name: Taskset name/slug (e.g., "my-evals/benchmark-v1").
|
|
241
244
|
If no org prefix, uses user's default org.
|
|
242
245
|
tasks: List of Task objects (v5 format) to save.
|
|
243
246
|
|
|
244
247
|
Returns:
|
|
245
|
-
The
|
|
248
|
+
The taskset ID of the created/updated taskset.
|
|
246
249
|
|
|
247
250
|
Example:
|
|
248
251
|
```python
|
|
@@ -258,7 +261,7 @@ def save_tasks(
|
|
|
258
261
|
]
|
|
259
262
|
|
|
260
263
|
# Save to HUD API
|
|
261
|
-
|
|
264
|
+
taskset_id = save_tasks("my-evals/benchmark-v1", tasks)
|
|
262
265
|
|
|
263
266
|
# Later, load them back
|
|
264
267
|
loaded = load_tasks("my-evals/benchmark-v1")
|
|
@@ -268,10 +271,6 @@ def save_tasks(
|
|
|
268
271
|
TypeError: If any task is not a v5 Task object (must have 'scenario')
|
|
269
272
|
ValueError: If API key is not set or save fails
|
|
270
273
|
"""
|
|
271
|
-
import httpx
|
|
272
|
-
|
|
273
|
-
from hud.settings import settings
|
|
274
|
-
|
|
275
274
|
if not settings.api_key:
|
|
276
275
|
raise ValueError("HUD_API_KEY is required to save tasks")
|
|
277
276
|
|
|
@@ -303,9 +302,9 @@ def save_tasks(
|
|
|
303
302
|
)
|
|
304
303
|
response.raise_for_status()
|
|
305
304
|
data = response.json()
|
|
306
|
-
|
|
307
|
-
logger.info("Saved %d tasks to
|
|
308
|
-
return
|
|
305
|
+
taskset_id = data.get("evalset_id") or data.get("id") or name
|
|
306
|
+
logger.info("Saved %d tasks to taskset: %s", len(tasks), taskset_id)
|
|
307
|
+
return taskset_id
|
|
309
308
|
except httpx.HTTPStatusError as e:
|
|
310
309
|
raise ValueError(f"Failed to save tasks: {e.response.text}") from e
|
|
311
310
|
except Exception as e:
|
hud/datasets/runner.py
CHANGED
|
@@ -29,6 +29,7 @@ async def run_dataset(
|
|
|
29
29
|
max_concurrent: int = 30,
|
|
30
30
|
group_size: int = 1,
|
|
31
31
|
quiet: bool = True,
|
|
32
|
+
taskset: str | None = None,
|
|
32
33
|
) -> list[EvalContext]:
|
|
33
34
|
"""Run an agent on a dataset of tasks.
|
|
34
35
|
|
|
@@ -40,7 +41,7 @@ async def run_dataset(
|
|
|
40
41
|
- A source string (file path, API slug) - loaded via load_tasks()
|
|
41
42
|
- A single TaskInput (Task, LegacyTask, or dict)
|
|
42
43
|
- A list of TaskInput objects
|
|
43
|
-
agent_type:
|
|
44
|
+
agent_type: Agent type (e.g., "claude", "openai", AgentType.CLAUDE).
|
|
44
45
|
agent_params: Parameters to pass to agent.create().
|
|
45
46
|
max_steps: Maximum steps per task.
|
|
46
47
|
max_concurrent: Maximum concurrent tasks (for parallel execution).
|
|
@@ -70,6 +71,10 @@ async def run_dataset(
|
|
|
70
71
|
from hud.datasets.loader import load_tasks
|
|
71
72
|
from hud.eval.task import Task
|
|
72
73
|
|
|
74
|
+
# Normalize agent_type to AgentType enum
|
|
75
|
+
if isinstance(agent_type, str):
|
|
76
|
+
agent_type = AgentType(agent_type)
|
|
77
|
+
|
|
73
78
|
# Normalize tasks to list[Task]
|
|
74
79
|
task_list: list[Task]
|
|
75
80
|
if isinstance(tasks, str):
|
|
@@ -86,19 +91,21 @@ async def run_dataset(
|
|
|
86
91
|
if not task_list:
|
|
87
92
|
raise ValueError("No tasks to run")
|
|
88
93
|
|
|
89
|
-
# Resolve agent class
|
|
90
|
-
agent_type_enum = agent_type if isinstance(agent_type, AgentType) else AgentType(agent_type)
|
|
91
|
-
agent_cls = agent_type_enum.cls
|
|
92
|
-
|
|
93
94
|
# Use hud.eval() for both single and parallel execution
|
|
94
95
|
async with hud.eval(
|
|
95
96
|
task_list,
|
|
96
97
|
group=group_size,
|
|
97
98
|
max_concurrent=max_concurrent,
|
|
98
99
|
quiet=quiet,
|
|
100
|
+
taskset=taskset,
|
|
99
101
|
) as ctx:
|
|
100
|
-
#
|
|
101
|
-
|
|
102
|
+
# Build agent params - use system_prompt from ctx (set from task.agent_config)
|
|
103
|
+
final_agent_params = dict(agent_params or {})
|
|
104
|
+
if ctx.system_prompt and "system_prompt" not in final_agent_params:
|
|
105
|
+
final_agent_params["system_prompt"] = ctx.system_prompt
|
|
106
|
+
|
|
107
|
+
# Create agent using AgentType.cls.create()
|
|
108
|
+
agent = agent_type.cls.create(**final_agent_params)
|
|
102
109
|
await agent.run(ctx, max_steps=max_steps)
|
|
103
110
|
# Reward is computed by EvalContext.__aexit__ from evaluate tools
|
|
104
111
|
|
|
@@ -198,9 +205,8 @@ async def run_single_task(
|
|
|
198
205
|
if ctx.system_prompt and "system_prompt" not in final_agent_params:
|
|
199
206
|
final_agent_params["system_prompt"] = ctx.system_prompt
|
|
200
207
|
|
|
201
|
-
# Create agent
|
|
202
|
-
|
|
203
|
-
agent = agent_cls.create(**final_agent_params)
|
|
208
|
+
# Create agent using AgentType.cls.create()
|
|
209
|
+
agent = agent_type.cls.create(**final_agent_params)
|
|
204
210
|
|
|
205
211
|
# Store metadata if provided
|
|
206
212
|
if metadata:
|
|
@@ -12,8 +12,8 @@ from hud.datasets.loader import load_tasks
|
|
|
12
12
|
class TestLoadTasks:
|
|
13
13
|
"""Tests for load_tasks() function."""
|
|
14
14
|
|
|
15
|
-
@patch("httpx.Client")
|
|
16
|
-
@patch("hud.
|
|
15
|
+
@patch("hud.datasets.loader.httpx.Client")
|
|
16
|
+
@patch("hud.datasets.loader.settings")
|
|
17
17
|
def test_load_tasks_success(
|
|
18
18
|
self, mock_settings: MagicMock, mock_client_class: MagicMock
|
|
19
19
|
) -> None:
|
|
@@ -22,7 +22,7 @@ class TestLoadTasks:
|
|
|
22
22
|
mock_settings.api_key = "test_key"
|
|
23
23
|
|
|
24
24
|
mock_response = MagicMock()
|
|
25
|
-
#
|
|
25
|
+
# EvalsetTasksResponse format: tasks keyed by task ID
|
|
26
26
|
mock_response.json.return_value = {
|
|
27
27
|
"evalset_id": "evalset-123",
|
|
28
28
|
"evalset_name": "test-dataset",
|
|
@@ -62,8 +62,8 @@ class TestLoadTasks:
|
|
|
62
62
|
params={"all": "true"},
|
|
63
63
|
)
|
|
64
64
|
|
|
65
|
-
@patch("httpx.Client")
|
|
66
|
-
@patch("hud.
|
|
65
|
+
@patch("hud.datasets.loader.httpx.Client")
|
|
66
|
+
@patch("hud.datasets.loader.settings")
|
|
67
67
|
def test_load_tasks_single_task(
|
|
68
68
|
self, mock_settings: MagicMock, mock_client_class: MagicMock
|
|
69
69
|
) -> None:
|
|
@@ -97,8 +97,8 @@ class TestLoadTasks:
|
|
|
97
97
|
assert tasks[0].scenario == "checkout"
|
|
98
98
|
assert tasks[0].id == "task-1"
|
|
99
99
|
|
|
100
|
-
@patch("httpx.Client")
|
|
101
|
-
@patch("hud.
|
|
100
|
+
@patch("hud.datasets.loader.httpx.Client")
|
|
101
|
+
@patch("hud.datasets.loader.settings")
|
|
102
102
|
def test_load_tasks_no_api_key(
|
|
103
103
|
self, mock_settings: MagicMock, mock_client_class: MagicMock
|
|
104
104
|
) -> None:
|
|
@@ -129,8 +129,8 @@ class TestLoadTasks:
|
|
|
129
129
|
params={"all": "true"},
|
|
130
130
|
)
|
|
131
131
|
|
|
132
|
-
@patch("httpx.Client")
|
|
133
|
-
@patch("hud.
|
|
132
|
+
@patch("hud.datasets.loader.httpx.Client")
|
|
133
|
+
@patch("hud.datasets.loader.settings")
|
|
134
134
|
def test_load_tasks_http_error(
|
|
135
135
|
self, mock_settings: MagicMock, mock_client_class: MagicMock
|
|
136
136
|
) -> None:
|
|
@@ -149,8 +149,8 @@ class TestLoadTasks:
|
|
|
149
149
|
with pytest.raises(ValueError, match="Failed to load tasks"):
|
|
150
150
|
load_tasks("test-org/test-dataset")
|
|
151
151
|
|
|
152
|
-
@patch("httpx.Client")
|
|
153
|
-
@patch("hud.
|
|
152
|
+
@patch("hud.datasets.loader.httpx.Client")
|
|
153
|
+
@patch("hud.datasets.loader.settings")
|
|
154
154
|
def test_load_tasks_json_error(
|
|
155
155
|
self, mock_settings: MagicMock, mock_client_class: MagicMock
|
|
156
156
|
) -> None:
|
|
@@ -171,8 +171,8 @@ class TestLoadTasks:
|
|
|
171
171
|
with pytest.raises(ValueError, match="Failed to load tasks"):
|
|
172
172
|
load_tasks("test-org/test-dataset")
|
|
173
173
|
|
|
174
|
-
@patch("httpx.Client")
|
|
175
|
-
@patch("hud.
|
|
174
|
+
@patch("hud.datasets.loader.httpx.Client")
|
|
175
|
+
@patch("hud.datasets.loader.settings")
|
|
176
176
|
def test_load_tasks_empty(self, mock_settings: MagicMock, mock_client_class: MagicMock) -> None:
|
|
177
177
|
"""load_tasks() handles empty dataset."""
|
|
178
178
|
mock_settings.hud_api_url = "https://api.hud.ai"
|
|
@@ -192,8 +192,8 @@ class TestLoadTasks:
|
|
|
192
192
|
|
|
193
193
|
assert len(tasks) == 0
|
|
194
194
|
|
|
195
|
-
@patch("httpx.Client")
|
|
196
|
-
@patch("hud.
|
|
195
|
+
@patch("hud.datasets.loader.httpx.Client")
|
|
196
|
+
@patch("hud.datasets.loader.settings")
|
|
197
197
|
def test_load_tasks_missing_fields(
|
|
198
198
|
self, mock_settings: MagicMock, mock_client_class: MagicMock
|
|
199
199
|
) -> None:
|
hud/environment/__init__.py
CHANGED
|
@@ -27,8 +27,8 @@ Usage:
|
|
|
27
27
|
from hud.environment.connection import ConnectionConfig, ConnectionType, Connector
|
|
28
28
|
from hud.environment.environment import Environment
|
|
29
29
|
from hud.environment.mock import MockMixin, generate_mock_value
|
|
30
|
-
from hud.environment.router import ConflictResolution, ToolRouter
|
|
31
|
-
from hud.environment.scenarios import ScenarioMixin
|
|
30
|
+
from hud.environment.router import ConflictResolution, MCPRouter, ToolRouter
|
|
31
|
+
from hud.environment.scenarios import ScenarioMixin, ScenarioSession
|
|
32
32
|
from hud.environment.types import EnvConfig
|
|
33
33
|
from hud.environment.utils import ToolFormat, format_result, parse_tool_call, parse_tool_calls
|
|
34
34
|
|
|
@@ -39,10 +39,12 @@ __all__ = [
|
|
|
39
39
|
"Connector",
|
|
40
40
|
"EnvConfig",
|
|
41
41
|
"Environment",
|
|
42
|
+
"MCPRouter",
|
|
42
43
|
"MockMixin",
|
|
43
44
|
"ScenarioMixin",
|
|
45
|
+
"ScenarioSession",
|
|
44
46
|
"ToolFormat",
|
|
45
|
-
"ToolRouter",
|
|
47
|
+
"ToolRouter", # Backwards compat alias for MCPRouter
|
|
46
48
|
"format_result",
|
|
47
49
|
"generate_mock_value",
|
|
48
50
|
"parse_tool_call",
|