hud-python 0.4.7__py3-none-any.whl → 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/base.py +50 -1
- hud/cli/__init__.py +120 -1
- hud/cli/analyze_metadata.py +29 -41
- hud/cli/build.py +7 -0
- hud/cli/debug.py +8 -1
- hud/cli/eval.py +226 -0
- hud/cli/list_func.py +212 -0
- hud/cli/pull.py +4 -13
- hud/cli/push.py +84 -41
- hud/cli/registry.py +155 -0
- hud/cli/remove.py +200 -0
- hud/cli/tests/test_analyze_metadata.py +277 -0
- hud/cli/tests/test_build.py +450 -0
- hud/cli/tests/test_list_func.py +288 -0
- hud/cli/tests/test_pull.py +400 -0
- hud/cli/tests/test_push.py +379 -0
- hud/cli/tests/test_registry.py +264 -0
- hud/clients/base.py +13 -1
- hud/clients/fastmcp.py +13 -9
- hud/clients/mcp_use.py +1 -1
- hud/tools/__init__.py +2 -0
- hud/tools/response.py +54 -0
- hud/utils/design.py +10 -0
- hud/utils/mcp.py +14 -2
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.7.dist-info → hud_python-0.4.9.dist-info}/METADATA +13 -1
- {hud_python-0.4.7.dist-info → hud_python-0.4.9.dist-info}/RECORD +31 -20
- {hud_python-0.4.7.dist-info → hud_python-0.4.9.dist-info}/WHEEL +0 -0
- {hud_python-0.4.7.dist-info → hud_python-0.4.9.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.7.dist-info → hud_python-0.4.9.dist-info}/licenses/LICENSE +0 -0
hud/agents/base.py
CHANGED
|
@@ -85,6 +85,7 @@ class MCPAgent(ABC):
|
|
|
85
85
|
self._tool_map: dict[str, types.Tool] = {} # Simplified: just name to tool
|
|
86
86
|
self.screenshot_history: list[str] = []
|
|
87
87
|
self._auto_trace = auto_trace
|
|
88
|
+
self._auto_trace_cm: Any | None = None # Store auto-created trace context manager
|
|
88
89
|
self.initialization_complete = False
|
|
89
90
|
|
|
90
91
|
# Response agent to automatically interact with the model
|
|
@@ -303,6 +304,9 @@ class MCPAgent(ABC):
|
|
|
303
304
|
except Exception as e:
|
|
304
305
|
logger.warning("ResponseAgent failed: %s", e)
|
|
305
306
|
if decision == "STOP":
|
|
307
|
+
# Try to submit response through lifecycle tool
|
|
308
|
+
await self._maybe_submit_response(response, messages)
|
|
309
|
+
|
|
306
310
|
logger.info("Stopping execution")
|
|
307
311
|
final_response = response
|
|
308
312
|
break
|
|
@@ -483,6 +487,40 @@ class MCPAgent(ABC):
|
|
|
483
487
|
self._available_tools.append(tool)
|
|
484
488
|
# Simplified mapping - just tool name to tool
|
|
485
489
|
self._tool_map[tool.name] = tool
|
|
490
|
+
|
|
491
|
+
# Auto-detect response tool as a lifecycle tool
|
|
492
|
+
if tool.name == "response" and "response" not in self.lifecycle_tools:
|
|
493
|
+
logger.debug("Auto-detected 'response' tool as a lifecycle tool")
|
|
494
|
+
self.lifecycle_tools.append("response")
|
|
495
|
+
|
|
496
|
+
async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
    """Forward the agent's final response to the ``response`` lifecycle tool.

    Nothing happens unless "response" is both registered in
    ``self.lifecycle_tools`` and present in ``self._tool_map``. When it is,
    the tool is called with the response content and the message history,
    and the formatted tool results are appended to ``messages``. Any
    exception raised along the way is logged and swallowed (best effort).

    Args:
        response: The agent's response; its ``content`` is sent to the tool.
        messages: The current message history (extended in-place on success).
    """
    tool_name = "response"
    if tool_name not in self.lifecycle_tools or tool_name not in self._tool_map:
        # No response lifecycle tool available: nothing to submit.
        return

    logger.debug("Calling response lifecycle tool")
    try:
        submit_call = MCPToolCall(
            name=tool_name,
            arguments={"response": response.content, "messages": messages},
        )
        tool_results = await self.call_tools(submit_call)

        # Record what the tool returned in the running conversation.
        formatted = await self.format_tool_results([submit_call], tool_results)
        messages.extend(formatted)

        logger.info("Response lifecycle tool executed, marking task as done")
    except Exception as e:
        # Submission must never abort the agent's shutdown path.
        logger.error("Response lifecycle tool failed: %s", e)
|
|
486
524
|
|
|
487
525
|
async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
|
|
488
526
|
"""Inject metadata into the metadata of the initialize request."""
|
|
@@ -491,7 +529,7 @@ class MCPAgent(ABC):
|
|
|
491
529
|
mcp_config,
|
|
492
530
|
MCPConfigPatch(meta=self.metadata),
|
|
493
531
|
)
|
|
494
|
-
setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
|
|
532
|
+
self._auto_trace_cm = setup_hud_telemetry(mcp_config, auto_trace=self._auto_trace)
|
|
495
533
|
|
|
496
534
|
def get_available_tools(self) -> list[types.Tool]:
|
|
497
535
|
"""Get list of available MCP tools for LLM use (excludes lifecycle tools)."""
|
|
@@ -532,6 +570,17 @@ class MCPAgent(ABC):
|
|
|
532
570
|
|
|
533
571
|
async def _cleanup(self) -> None:
|
|
534
572
|
"""Cleanup resources."""
|
|
573
|
+
# Clean up auto-created trace if any
|
|
574
|
+
if self._auto_trace_cm:
|
|
575
|
+
try:
|
|
576
|
+
self._auto_trace_cm.__exit__(None, None, None)
|
|
577
|
+
logger.info("Closed auto-created trace")
|
|
578
|
+
except Exception as e:
|
|
579
|
+
logger.warning("Failed to close auto-created trace: %s", e)
|
|
580
|
+
finally:
|
|
581
|
+
self._auto_trace_cm = None
|
|
582
|
+
|
|
583
|
+
# Clean up auto-created client
|
|
535
584
|
if self._auto_created_client and self.mcp_client:
|
|
536
585
|
try:
|
|
537
586
|
await self.mcp_client.shutdown()
|
hud/cli/__init__.py
CHANGED
|
@@ -23,10 +23,13 @@ from .clone import clone_repository, get_clone_message, print_error, print_tutor
|
|
|
23
23
|
from .cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
|
|
24
24
|
from .debug import debug_mcp_stdio
|
|
25
25
|
from .init import create_environment
|
|
26
|
+
from . import list_func as list_module
|
|
26
27
|
from .mcp_server import run_mcp_dev_server
|
|
27
28
|
from .pull import pull_command
|
|
28
29
|
from .push import push_command
|
|
30
|
+
from .remove import remove_command
|
|
29
31
|
from .utils import CaptureLogger
|
|
32
|
+
from .eval import eval_command
|
|
30
33
|
|
|
31
34
|
# Create the main Typer app
|
|
32
35
|
app = typer.Typer(
|
|
@@ -442,7 +445,8 @@ def run(
|
|
|
442
445
|
|
|
443
446
|
# Get URL from options or environment
|
|
444
447
|
if not url:
|
|
445
|
-
|
|
448
|
+
from hud.settings import settings
|
|
449
|
+
url = settings.hud_mcp_url
|
|
446
450
|
|
|
447
451
|
run_remote_server(image, docker_args, transport, port, url, api_key, run_id, verbose)
|
|
448
452
|
|
|
@@ -561,6 +565,63 @@ def pull(
|
|
|
561
565
|
pull_command(target, lock_file, yes, verify_only, verbose)
|
|
562
566
|
|
|
563
567
|
|
|
568
|
+
@app.command(name="list")
def list_environments(
    filter_name: str | None = typer.Option(
        None,
        "--filter",
        "-f",
        help="Filter environments by name (case-insensitive)",
    ),
    json_output: bool = typer.Option(False, "--json", help="Output as JSON"),
    show_all: bool = typer.Option(
        False,
        "--all",
        "-a",
        help="Show all columns including digest",
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
) -> None:
    """📋 List all HUD environments in local registry.

    Shows environments pulled with 'hud pull' stored in ~/.hud/envs/

    Examples:
        hud list # List all environments
        hud list --filter text # Filter by name
        hud list --json # Output as JSON
        hud list --all # Show digest column
        hud list --verbose # Show full descriptions
    """
    # Thin CLI shim: Typer parses the options, list_func does the actual work.
    # Registered as "list" explicitly so the function name does not shadow the builtin.
    list_module.list_command(filter_name, json_output, show_all, verbose)
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
@app.command()
def remove(
    target: str | None = typer.Argument(
        None,
        help="Environment to remove (digest, name, or 'all' for all environments)",
    ),
    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation prompt"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
) -> None:
    """🗑️ Remove HUD environments from local registry.

    Removes environment metadata from ~/.hud/envs/
    Note: This does not remove the Docker images.

    Examples:
        hud remove abc123 # Remove by digest
        hud remove text_2048 # Remove by name
        hud remove hudpython/test_init # Remove by full name
        hud remove all # Remove all environments
        hud remove all --yes # Remove all without confirmation
    """
    # Thin CLI shim: all removal logic lives in remove_command.
    remove_command(target, yes, verbose)
|
|
623
|
+
|
|
624
|
+
|
|
564
625
|
@app.command()
|
|
565
626
|
def init(
|
|
566
627
|
name: str = typer.Argument(None, help="Environment name (default: current directory name)"),
|
|
@@ -592,6 +653,64 @@ def quickstart() -> None:
|
|
|
592
653
|
clone("https://github.com/hud-evals/quickstart.git")
|
|
593
654
|
|
|
594
655
|
|
|
656
|
+
@app.command()
def eval(  # name is the CLI command; it intentionally mirrors `hud eval`
    source: str = typer.Argument(
        ...,
        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
    ),
    full: bool = typer.Option(
        False,
        "--full",
        help="Run the entire dataset (omit for single-task debug mode)",
    ),
    agent: str = typer.Option(
        "claude",
        "--agent",
        help="Agent backend to use (claude or openai)",
    ),
    model: str | None = typer.Option(
        None,
        "--model",
        help="Model name for the chosen agent",
    ),
    allowed_tools: str | None = typer.Option(
        None,
        "--allowed-tools",
        help="Comma-separated list of allowed tools",
    ),
    max_concurrent: int = typer.Option(
        30,
        "--max-concurrent",
        help="Concurrency level for full-dataset mode",
    ),
    max_steps: int | None = typer.Option(
        # None (not a fixed number) so that eval_command can apply the
        # mode-dependent default advertised in the help text below.
        None,
        "--max-steps",
        help="Maximum steps per task (default: 10 for single, 50 for full)",
    ),
) -> None:
    """🚀 Run evaluation on datasets or individual tasks with agents."""
    # Validate agent choice before doing any work.
    valid_agents = ["claude", "openai"]
    if agent not in valid_agents:
        from hud.utils.design import HUDDesign

        design = HUDDesign()
        design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
        raise typer.Exit(1)

    # Delegate to the implementation; max_steps may be None, in which case
    # eval_command picks 10 (single task) or 50 (--full).
    eval_command(
        source=source,
        full=full,
        agent=agent,  # type: ignore
        model=model,
        allowed_tools=allowed_tools,
        max_concurrent=max_concurrent,
        max_steps=max_steps,
    )
|
|
712
|
+
|
|
713
|
+
|
|
595
714
|
def main() -> None:
|
|
596
715
|
"""Main entry point for the CLI."""
|
|
597
716
|
# Show header for main help
|
hud/cli/analyze_metadata.py
CHANGED
|
@@ -12,6 +12,8 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
|
12
12
|
from hud.settings import settings
|
|
13
13
|
from hud.utils.design import HUDDesign
|
|
14
14
|
|
|
15
|
+
from .registry import get_registry_dir, list_registry_entries, extract_digest_from_image, load_from_registry
|
|
16
|
+
|
|
15
17
|
console = Console()
|
|
16
18
|
design = HUDDesign()
|
|
17
19
|
|
|
@@ -50,38 +52,31 @@ def fetch_lock_from_registry(reference: str) -> dict | None:
|
|
|
50
52
|
|
|
51
53
|
def _image_base_name(image_ref: str) -> str:
    """Return *image_ref* without its ``@digest`` and ``:tag`` suffixes.

    Only a colon that appears after the last "/" separates a tag; an earlier
    colon belongs to a registry ``host:port`` prefix and is kept.
    """
    name = image_ref.split("@", 1)[0]
    tag_sep = name.rfind(":")
    if tag_sep > name.rfind("/"):
        return name[:tag_sep]
    return name


def check_local_cache(reference: str) -> dict | None:
    """Look up a lock file for *reference* in the local registry.

    The lookup first tries the digest extracted from the reference. If that
    misses and the reference contains a "/", every registry entry is scanned
    and the first one whose image name (tag and digest ignored) contains, or
    is contained in, the reference's name is returned.

    Args:
        reference: Image reference, optionally carrying a tag and/or digest.

    Returns:
        The parsed lock data, or None when nothing in the cache matches.
    """
    # First try exact digest match
    digest = extract_digest_from_image(reference)
    lock_data = load_from_registry(digest)
    if lock_data:
        return lock_data

    # Name-based search only makes sense for namespaced references.
    if "/" not in reference:
        return None

    ref_base = _image_base_name(reference)

    for _entry_digest, lock_file in list_registry_entries():
        try:
            with open(lock_file) as f:
                candidate = yaml.safe_load(f)

            if not candidate or "image" not in candidate:
                continue

            image = candidate["image"]
            if not image:
                # An empty name is a substring of everything; never match on it.
                continue

            # Match by name (ignoring tag/digest)
            img_base = _image_base_name(image)
            if ref_base in img_base or img_base in ref_base:
                return candidate
        except Exception:
            # Best effort: an unreadable or malformed entry is skipped.
            continue

    return None
|
|
86
81
|
|
|
87
82
|
|
|
@@ -147,15 +142,8 @@ async def analyze_from_metadata(reference: str, output_format: str, verbose: boo
|
|
|
147
142
|
source = "registry"
|
|
148
143
|
|
|
149
144
|
# Save to local cache for next time
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
else:
|
|
153
|
-
digest = "latest"
|
|
154
|
-
|
|
155
|
-
cache_dir = Path.home() / ".hud" / "envs" / digest
|
|
156
|
-
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
157
|
-
with open(cache_dir / "hud.lock.yaml", "w") as f: # noqa: ASYNC230
|
|
158
|
-
yaml.dump(lock_data, f, default_flow_style=False, sort_keys=False)
|
|
145
|
+
from .registry import save_to_registry
|
|
146
|
+
save_to_registry(lock_data, lock_data.get("image", ""), verbose=False)
|
|
159
147
|
else:
|
|
160
148
|
progress.update(task, description="[red]✗ Not found[/red]")
|
|
161
149
|
|
hud/cli/build.py
CHANGED
|
@@ -17,6 +17,8 @@ from hud.clients import MCPClient
|
|
|
17
17
|
from hud.utils.design import HUDDesign
|
|
18
18
|
from hud.version import __version__ as hud_version
|
|
19
19
|
|
|
20
|
+
from .registry import save_to_registry
|
|
21
|
+
|
|
20
22
|
|
|
21
23
|
def parse_version(version_str: str) -> tuple[int, int, int]:
|
|
22
24
|
"""Parse version string like '1.0.0' or '1.0' into tuple of integers."""
|
|
@@ -459,6 +461,11 @@ def build_environment(
|
|
|
459
461
|
# Remove temp image after we're done
|
|
460
462
|
subprocess.run(["docker", "rmi", temp_tag], capture_output=True) # noqa: S603, S607
|
|
461
463
|
|
|
464
|
+
# Add to local registry
|
|
465
|
+
if image_id:
|
|
466
|
+
# Save to local registry using the helper
|
|
467
|
+
save_to_registry(lock_content, lock_content.get("image", tag), verbose)
|
|
468
|
+
|
|
462
469
|
# Print summary
|
|
463
470
|
design.section_title("Build Complete")
|
|
464
471
|
|
hud/cli/debug.py
CHANGED
|
@@ -167,7 +167,14 @@ async def debug_mcp_stdio(command: list[str], logger: CaptureLogger, max_phase:
|
|
|
167
167
|
break
|
|
168
168
|
except Exception as e:
|
|
169
169
|
logger.error(f"Failed to parse MCP response: {e}")
|
|
170
|
-
|
|
170
|
+
logger.error(f"Raw output that caused the error: {repr(line)}")
|
|
171
|
+
logger.hint("This usually means non-JSON output is being sent to STDOUT")
|
|
172
|
+
logger.hint("Common causes:")
|
|
173
|
+
logger.hint(" - Print statements in your server code")
|
|
174
|
+
logger.hint(" - Library warnings (use warnings.filterwarnings)")
|
|
175
|
+
logger.hint(" - Import-time output from dependencies")
|
|
176
|
+
phases_completed = 1 # Mark as failed
|
|
177
|
+
break # Stop trying to parse
|
|
171
178
|
|
|
172
179
|
if response and "result" in response:
|
|
173
180
|
logger.success("MCP server initialized successfully")
|
hud/cli/eval.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"""HUD evaluation command for running tasks and datasets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
|
|
11
|
+
import typer
|
|
12
|
+
from datasets import load_dataset
|
|
13
|
+
|
|
14
|
+
import hud
|
|
15
|
+
from hud.agents import ClaudeAgent, OperatorAgent
|
|
16
|
+
from hud.agents.misc.response_agent import ResponseAgent
|
|
17
|
+
from hud.datasets import Task, run_dataset
|
|
18
|
+
from hud.utils.design import HUDDesign
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
design = HUDDesign()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def build_agent(
    agent_type: Literal["claude", "openai"],
    *,
    model: str | None = None,
    allowed_tools: list[str] | None = None,
) -> ClaudeAgent | OperatorAgent:
    """Instantiate the agent backend selected by *agent_type*.

    Args:
        agent_type: "openai" selects ``OperatorAgent``; any other value
            yields a ``ClaudeAgent``.
        model: Model name for the Claude agent (defaults to
            "claude-sonnet-4-20250514"). Not forwarded to ``OperatorAgent``.
        allowed_tools: Tool names the agent may use; a falsy value is
            replaced by the backend's computer-use tool.

    Returns:
        A freshly constructed agent with its own ``ResponseAgent``.
    """
    if agent_type != "openai":
        # Fallback Claude agent (Anthropic)
        return ClaudeAgent(
            model=model or "claude-sonnet-4-20250514",
            allowed_tools=allowed_tools or ["anthropic_computer"],
            response_agent=ResponseAgent(),
        )

    return OperatorAgent(
        allowed_tools=allowed_tools or ["openai_computer"],
        response_agent=ResponseAgent(),
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
async def run_single_task(
    source: str,
    *,
    agent_type: Literal["claude", "openai"] = "claude",
    model: str | None = None,
    allowed_tools: list[str] | None = None,
    max_steps: int = 10,
) -> None:
    """Load one task and execute it with the chosen agent.

    Args:
        source: Path to an existing ``.json`` task file, or otherwise a
            dataset identifier passed to ``load_dataset`` (the first row of
            the "train" split is used).
        agent_type: Agent backend handed to ``build_agent``.
        model: Optional model name handed to ``build_agent``.
        allowed_tools: Optional tool allow-list handed to ``build_agent``.
        max_steps: Step budget passed to ``agent.run``.
    """
    design.info("📊 Loading dataset…")

    # Check if it's a single task JSON file
    path = Path(source)
    if path.exists() and path.suffix == ".json":
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding, which breaks non-ASCII prompts on some systems.
        with open(path, encoding="utf-8") as f:
            task_data = json.load(f)
        task = Task(**task_data)
    else:
        # Load from HuggingFace dataset
        dataset = load_dataset(source, split="train")

        # Get first task from dataset
        sample_task = dataset[0]  # type: ignore[index]
        task = Task(**sample_task)  # type: ignore[arg-type]

    # Trace names are kept short: long prompts are cut to 50 chars plus "...".
    task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt

    with hud.trace(name=task_prompt):
        agent = build_agent(
            agent_type,
            model=model,
            allowed_tools=allowed_tools,
        )
        design.info(task.prompt)
        result = await agent.run(task, max_steps=max_steps)
        design.success(f"Reward: {result.reward}")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
async def run_full_dataset(
    source: str,
    *,
    agent_type: Literal["claude", "openai"] = "claude",
    model: str | None = None,
    allowed_tools: list[str] | None = None,
    max_concurrent: int = 30,
    max_steps: int = 50,
) -> list[Any]:
    """Evaluate every task of a dataset through ``hud.datasets.run_dataset``.

    Args:
        source: Dataset identifier; also used (last "/"-separated part) to
            name the evaluation run.
        agent_type: "openai" selects ``OperatorAgent``, anything else
            ``ClaudeAgent``.
        model: Model name for the Claude agent; unused for "openai".
        allowed_tools: Tool allow-list; a falsy value is replaced by the
            backend's computer-use tool.
        max_concurrent: Concurrency limit forwarded to ``run_dataset``.
        max_steps: Per-task step budget forwarded to ``run_dataset``.

    Returns:
        Whatever ``run_dataset`` returns.
    """
    # Pick the agent class and its constructor kwargs for run_dataset.
    use_operator = agent_type == "openai"
    agent_class = OperatorAgent if use_operator else ClaudeAgent

    agent_config: dict[str, Any]
    if use_operator:
        agent_config = {"allowed_tools": allowed_tools or ["openai_computer"]}
    else:
        agent_config = {
            "model": model or "claude-sonnet-4-20250514",
            "allowed_tools": allowed_tools or ["anthropic_computer"],
        }

    run_name = f"Evaluation {source.split('/')[-1]}"

    design.info("🚀 Running evaluation…")
    return await run_dataset(
        name=run_name,
        dataset=source,
        agent_class=agent_class,
        agent_config=agent_config,
        max_concurrent=max_concurrent,
        metadata={"dataset": source},
        max_steps=max_steps,
        auto_respond=True,
    )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def eval_command(
    source: str = typer.Argument(
        ...,
        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file",
    ),
    full: bool = typer.Option(
        False,
        "--full",
        help="Run the entire dataset (omit for single-task debug mode)",
    ),
    agent: Literal["claude", "openai"] = typer.Option(
        "claude",
        "--agent",
        help="Agent backend to use",
    ),
    model: str | None = typer.Option(
        None,
        "--model",
        help="Model name for the chosen agent",
    ),
    allowed_tools: str | None = typer.Option(
        None,
        "--allowed-tools",
        help="Comma-separated list of allowed tools",
    ),
    max_concurrent: int = typer.Option(
        50,
        "--max-concurrent",
        help="Concurrency level for full-dataset mode",
    ),
    max_steps: int | None = typer.Option(
        None,
        "--max-steps",
        help="Maximum steps per task (default: 10 for single, 50 for full)",
    ),
) -> None:
    """🚀 Run evaluation on datasets or individual tasks with agents.

    Examples:
        # Evaluate a single task from SheetBench
        hud eval hud-evals/SheetBench-50

        # Evaluate the FULL SheetBench dataset with Claude
        hud eval hud-evals/SheetBench-50 --full --agent claude

        # Run a single task from a JSON file
        hud eval task.json

        # Run with OpenAI Operator agent
        hud eval hud-evals/OSWorld-Gold-Beta --agent openai
    """
    import os

    from hud.settings import settings

    # Check for required API keys. A key counts as present when EITHER the
    # settings object or the process environment provides it, matching the
    # "environment or .env file" guidance printed below.
    # NOTE(review): assumes the agents obtain their key via hud.settings when
    # it is not exported in os.environ - confirm against the agent classes.
    if agent == "claude":
        if not (settings.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY")):
            design.error("ANTHROPIC_API_KEY is required for Claude agent")
            design.info("Set it in your environment or .env file: ANTHROPIC_API_KEY=your-key-here")
            raise typer.Exit(1)
    elif agent == "openai":
        if not (settings.openai_api_key or os.environ.get("OPENAI_API_KEY")):
            design.error("OPENAI_API_KEY is required for OpenAI agent")
            design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
            raise typer.Exit(1)

    # HUD_API_KEY is optional: warn, but keep going.
    if not (settings.api_key or os.environ.get("HUD_API_KEY")):
        design.warning("HUD_API_KEY not set. Some features may be limited.")
        design.info("Get your API key at: https://app.hud.so")

    # Parse allowed tools: comma-separated, blanks dropped; None when not given.
    allowed_tools_list = (
        [t.strip() for t in allowed_tools.split(",") if t.strip()]
        if allowed_tools
        else None
    )

    # Set default max_steps if not provided
    if max_steps is None:
        max_steps = 50 if full else 10

    # Run evaluation
    if full:
        asyncio.run(
            run_full_dataset(
                source,
                agent_type=agent,
                model=model,
                allowed_tools=allowed_tools_list,
                max_concurrent=max_concurrent,
                max_steps=max_steps,
            )
        )
    else:
        asyncio.run(
            run_single_task(
                source,
                agent_type=agent,
                model=model,
                allowed_tools=allowed_tools_list,
                max_steps=max_steps,
            )
        )
|