hud-python 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/agents/base.py +118 -33
- hud/agents/claude.py +1 -1
- hud/agents/openai.py +5 -16
- hud/agents/tests/test_openai.py +24 -79
- hud/cli/__init__.py +137 -15
- hud/cli/analyze.py +2 -4
- hud/cli/build.py +6 -2
- hud/cli/dev.py +67 -0
- hud/cli/eval.py +90 -35
- hud/cli/hf.py +406 -0
- hud/cli/init.py +49 -30
- hud/cli/tests/test_mcp_server.py +1 -4
- hud/clients/base.py +2 -0
- hud/clients/fastmcp.py +7 -2
- hud/clients/mcp_use.py +3 -1
- hud/clients/utils/retry_transport.py +34 -8
- hud/datasets/__init__.py +32 -0
- hud/datasets/execution/__init__.py +13 -0
- hud/datasets/execution/parallel.py +592 -0
- hud/datasets/execution/runner.py +123 -0
- hud/datasets/task.py +107 -0
- hud/datasets/utils.py +118 -0
- hud/otel/instrumentation.py +6 -1
- hud/server/server.py +58 -21
- hud/settings.py +12 -0
- hud/types.py +31 -10
- hud/utils/design.py +168 -2
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/METADATA +4 -3
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/RECORD +34 -28
- hud/datasets.py +0 -327
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/WHEEL +0 -0
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.13.dist-info → hud_python-0.4.15.dist-info}/licenses/LICENSE +0 -0
hud/cli/__init__.py
CHANGED
@@ -22,10 +22,14 @@ from .build import build_command
 from .clone import clone_repository, get_clone_message, print_error, print_tutorial
 from .debug import debug_mcp_stdio
 from .dev import run_mcp_dev_server
+
+# Import new commands
+from .hf import hf_command
 from .init import create_environment
 from .pull import pull_command
 from .push import push_command
 from .remove import remove_command
+from .rl import rl_app
 from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
 from .utils.logging import CaptureLogger
 
@@ -760,19 +764,24 @@ def quickstart() -> None:
 
 @app.command()
 def eval(
-    source: str = typer.Argument(
-
-        help=
+    source: str | None = typer.Argument(
+        None,
+        help=(
+            "HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50') or task JSON file. "
+            "If not provided, looks for task.json in current directory."
+        ),
     ),
     full: bool = typer.Option(
         False,
         "--full",
         help="Run the entire dataset (omit for single-task debug mode)",
     ),
-    agent: str = typer.Option(
-
+    agent: str | None = typer.Option(
+        None,
         "--agent",
-        help=
+        help=(
+            "Agent backend to use (claude or openai). If not provided, will prompt interactively."
+        ),
     ),
     model: str | None = typer.Option(
         None,
@@ -785,23 +794,99 @@ def eval(
         help="Comma-separated list of allowed tools",
     ),
     max_concurrent: int = typer.Option(
-
+        50,
         "--max-concurrent",
-        help="
+        help="Max concurrent tasks (prevents rate limits in both asyncio and parallel modes)",
     ),
     max_steps: int = typer.Option(
         30,
         "--max-steps",
         help="Maximum steps per task (default: 10 for single, 50 for full)",
     ),
+    parallel: bool = typer.Option(
+        False,
+        "--parallel",
+        help="Use process-based parallel execution for large datasets (100+ tasks)",
+    ),
+    max_workers: int | None = typer.Option(
+        None,
+        "--max-workers",
+        help="Number of worker processes for parallel mode (auto-optimized if not set)",
+    ),
+    max_concurrent_per_worker: int = typer.Option(
+        20,
+        "--max-concurrent-per-worker",
+        help="Maximum concurrent tasks per worker in parallel mode",
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents."""
+    from hud.utils.design import HUDDesign
+
+    design = HUDDesign()
+
+    # If no source provided, look for task/eval JSON files in current directory
+    if source is None:
+        # Search for JSON files with "task" or "eval" in the name (case-insensitive)
+        json_files = []
+        patterns = [
+            "*task*.json",
+            "*eval*.json",
+            "*Task*.json",
+            "*Eval*.json",
+            "*TASK*.json",
+            "*EVAL*.json",
+        ]
+
+        # First check current directory
+        for pattern in patterns:
+            json_files.extend(Path(".").glob(pattern))
+
+        # If no files found, search recursively (but limit depth to avoid deep searches)
+        if not json_files:
+            for pattern in patterns:
+                # Search up to 2 levels deep
+                json_files.extend(Path(".").glob(f"*/{pattern}"))
+                json_files.extend(Path(".").glob(f"*/*/{pattern}"))
+
+        # Remove duplicates and sort
+        json_files = sorted(set(json_files))
+
+        if not json_files:
+            design.error(
+                "No source provided and no task/eval JSON files found in current directory"
+            )
+            design.info(
+                "Usage: hud eval <source> or create a task JSON file "
+                "(e.g., task.json, eval_config.json)"
+            )
+            raise typer.Exit(1)
+        elif len(json_files) == 1:
+            source = str(json_files[0])
+            design.info(f"Found task file: {source}")
+        else:
+            # Multiple files found, let user choose
+            design.info("Multiple task files found:")
+            file_choice = design.select(
+                "Select a task file to run:",
+                choices=[str(f) for f in json_files],
+            )
+            source = file_choice
+            design.success(f"Selected: {source}")
+
+    # If no agent specified, prompt for selection
+    if agent is None:
+        agent = design.select(
+            "Select an agent to use:",
+            choices=[
+                {"name": "Claude 4 Sonnet", "value": "claude"},
+                {"name": "OpenAI Computer Use", "value": "openai"},
+            ],
+            default="Claude 4 Sonnet",
+        )
+
     # Validate agent choice
     valid_agents = ["claude", "openai"]
     if agent not in valid_agents:
-        from hud.utils.design import HUDDesign
-
-        design = HUDDesign()
         design.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
         raise typer.Exit(1)
 
@@ -809,9 +894,6 @@ def eval(
     try:
         from .eval import eval_command
     except ImportError as e:
-        from hud.utils.design import HUDDesign
-
-        design = HUDDesign()
         design.error(
             "Evaluation dependencies are not installed. "
             "Please install with: pip install 'hud-python[agent]'"
@@ -827,9 +909,45 @@ def eval(
         allowed_tools=allowed_tools,
         max_concurrent=max_concurrent,
         max_steps=max_steps,
+        parallel=parallel,
+        max_workers=max_workers,
+        max_concurrent_per_worker=max_concurrent_per_worker,
     )
 
 
+# Add the RL subcommand group
+app.add_typer(rl_app, name="rl")
+
+
+@app.command()
+def hf(
+    tasks_file: Path | None = typer.Argument(  # noqa: B008
+        None, help="JSON file containing tasks (auto-detected if not provided)"
+    ),
+    name: str | None = typer.Option(
+        None, "--name", "-n", help="Dataset name (e.g., 'my-org/my-dataset')"
+    ),
+    push: bool = typer.Option(True, "--push/--no-push", help="Push to HuggingFace Hub"),
+    private: bool = typer.Option(False, "--private", help="Make dataset private on Hub"),
+    update_lock: bool = typer.Option(
+        True, "--update-lock/--no-update-lock", help="Update hud.lock.yaml"
+    ),
+    token: str | None = typer.Option(None, "--token", help="HuggingFace API token"),
+) -> None:
+    """📊 Convert tasks to HuggingFace dataset format.
+
+    Automatically detects task files if not specified.
+    Suggests dataset name based on environment if not provided.
+
+    Examples:
+        hud hf                         # Auto-detect tasks and suggest name
+        hud hf tasks.json              # Use specific file, suggest name
+        hud hf --name my-org/my-tasks  # Auto-detect tasks, use name
+        hud hf tasks.json --name hud-evals/web-tasks --private
+    """
+    hf_command(tasks_file, name, push, private, update_lock, token)
+
+
 def main() -> None:
     """Main entry point for the CLI."""
     # Show header for main help
@@ -846,7 +964,11 @@ def main() -> None:
     console.print(" 3. Build for production: [cyan]hud build[/cyan]")
     console.print(" 4. Share your environment: [cyan]hud push[/cyan]")
     console.print(" 5. Get shared environments: [cyan]hud pull <org/name:tag>[/cyan]")
-    console.print(" 6. Run and test: [cyan]hud run <image>[/cyan]
+    console.print(" 6. Run and test: [cyan]hud run <image>[/cyan]")
+    console.print("\n[yellow]RL Training:[/yellow]")
+    console.print(" 1. Generate config: [cyan]hud rl init my-env:latest[/cyan]")
+    console.print(" 2. Create dataset: [cyan]hud hf tasks.json --name my-org/my-tasks[/cyan]")
+    console.print(" 3. Start training: [cyan]hud rl --model Qwen/Qwen2.5-3B[/cyan]\n")
 
     app()
 
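The zero-argument flow added above approximates a case-insensitive match by globbing six case-variant patterns. A minimal standalone sketch of the same discovery logic, using only the standard library (the real command additionally prompts through HUDDesign when several files match):

from pathlib import Path

def find_task_files(root: str = ".") -> list[Path]:
    """Find JSON files whose names mention 'task' or 'eval', up to two levels deep."""
    matches: set[Path] = set()
    for depth_glob in ("*.json", "*/*.json", "*/*/*.json"):
        for path in Path(root).glob(depth_glob):
            name = path.name.lower()
            if "task" in name or "eval" in name:
                matches.add(path)
    return sorted(matches)

print(find_task_files())

Unlike the command, this sketch scans all depths in one pass rather than preferring top-level matches.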
hud/cli/analyze.py
CHANGED
@@ -3,7 +3,8 @@
 from __future__ import annotations
 
 import json
-from
+from pathlib import Path  # noqa: TC003
+from typing import Any
 
 from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn
@@ -14,9 +15,6 @@ from rich.tree import Tree
 from hud.clients import MCPClient
 from hud.utils.design import HUDDesign
 
-if TYPE_CHECKING:
-    from pathlib import Path
-
 console = Console()
 design = HUDDesign()
 
hud/cli/build.py
CHANGED
@@ -431,10 +431,14 @@ def build_environment(
     if optional_env:
         lock_content["environment"]["variables"]["optional"] = optional_env
 
-    # Add
+    # Add tools with full schemas for RL config generation
     if analysis["tools"]:
         lock_content["tools"] = [
-            {
+            {
+                "name": tool["name"],
+                "description": tool.get("description", ""),
+                "inputSchema": tool.get("inputSchema", {}),
+            }
            for tool in analysis["tools"]
        ]
 
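With this change, each lock-file tool entry carries its full input schema rather than a truncated stub, which the diff's comment ties to RL config generation. For illustration only, a serialized entry might look like the following; the tool name and schema here are invented, not taken from a real environment:

# Hypothetical shape of a tools entry in hud.lock.yaml after this change
lock_content = {
    "tools": [
        {
            "name": "click",
            "description": "Click at the given screen coordinates",
            "inputSchema": {
                "type": "object",
                "properties": {"x": {"type": "integer"}, "y": {"type": "integer"}},
                "required": ["x", "y"],
            },
        },
    ],
}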
hud/cli/dev.py
CHANGED
@@ -7,6 +7,7 @@ import base64
 import json
 import subprocess
 from pathlib import Path
+from typing import Any
 
 import click
 from fastmcp import FastMCP
@@ -155,6 +156,7 @@ async def start_mcp_proxy(
     import asyncio
     import logging
     import os
+    import signal
     import sys
 
     from .utils.logging import find_free_port
@@ -440,12 +442,30 @@ async def start_mcp_proxy(
             log_design.warning(f"Traceback: {traceback.format_exc()}")  # noqa: G004
             await asyncio.sleep(1)
 
+    # Import contextlib here so it's available in the finally block
+    import contextlib
+
     # CRITICAL: Create proxy AFTER all logging setup to prevent it from resetting logging config
     # This is important because FastMCP might initialize loggers during creation
     proxy = create_proxy_server(
         directory, image_name, no_reload, full_reload, verbose, docker_args or [], interactive
     )
 
+    # Set up signal handlers for graceful shutdown
+    shutdown_event = asyncio.Event()
+
+    def signal_handler(signum: int, frame: Any) -> None:
+        """Handle signals by setting shutdown event."""
+        design.info(f"\n📡 Received signal {signum}, shutting down gracefully...")
+        shutdown_event.set()
+
+    # Register signal handlers - SIGINT is available on all platforms
+    signal.signal(signal.SIGINT, signal_handler)
+
+    # SIGTERM is not available on Windows
+    if hasattr(signal, "SIGTERM"):
+        signal.signal(signal.SIGTERM, signal_handler)
+
     # One more attempt to suppress the FastMCP server log
     if not verbose:
         # Re-apply the filter in case new handlers were created
@@ -479,6 +499,47 @@ async def start_mcp_proxy(
         for handler in logger.handlers:
             handler.addFilter(block_filter)
 
+    # Track if container has been stopped to avoid duplicate stops
+    container_stopped = False
+
+    # Function to stop the container gracefully
+    async def stop_container() -> None:
+        """Stop the Docker container gracefully with SIGTERM, wait 30s, then SIGKILL if needed."""
+        nonlocal container_stopped
+        if container_stopped:
+            return  # Already stopped, don't do it again
+
+        try:
+            # Check if container exists
+            check_result = await asyncio.create_subprocess_exec(
+                "docker",
+                "ps",
+                "--format",
+                "{{.Names}}",
+                "--filter",
+                f"name={container_name}",
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.DEVNULL,
+            )
+            stdout, _ = await check_result.communicate()
+
+            if container_name in stdout.decode():
+                design.info("🛑 Stopping container gracefully...")
+                # Stop with 30 second timeout before SIGKILL
+                stop_result = await asyncio.create_subprocess_exec(
+                    "docker",
+                    "stop",
+                    "--time=30",
+                    container_name,
+                    stdout=asyncio.subprocess.DEVNULL,
+                    stderr=asyncio.subprocess.DEVNULL,
+                )
+                await stop_result.communicate()
+                design.success("✅ Container stopped successfully")
+                container_stopped = True
+        except Exception as e:
+            design.warning(f"Failed to stop container: {e}")
+
     try:
         # Start Docker logs streaming if enabled
         log_task = None
@@ -530,6 +591,9 @@ async def start_mcp_proxy(
     except KeyboardInterrupt:
         design.info("\n👋 Shutting down...")
 
+        # Stop the container before showing next steps
+        await stop_container()
+
         # Show next steps tutorial
         if not interactive:  # Only show if not in interactive mode
             design.section_title("Next Steps")
@@ -565,6 +629,9 @@ async def start_mcp_proxy(
     except asyncio.CancelledError:
         contextlib.suppress(asyncio.CancelledError)
 
+    # Always try to stop container on exit
+    await stop_container()
+
 
 def run_mcp_dev_server(
     directory: str = ".",
hud/cli/eval.py
CHANGED
@@ -76,8 +76,6 @@ async def run_single_task(
 ) -> None:
     """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
 
-    design.info("📊 Loading dataset…")
-
     # Import Task and run_dataset lazily
     try:
         from hud.datasets import Task, run_dataset
@@ -91,6 +89,7 @@ async def run_single_task(
     # Check if it's a JSON file
     path = Path(source)
     if path.exists() and path.suffix == ".json":
+        design.info("📊 Loading task file…")
         with open(path) as f:  # noqa: ASYNC230
             json_data = json.load(f)
 
@@ -111,8 +110,7 @@ async def run_single_task(
         )
         raise typer.Exit(1) from e
 
-    agent_config: dict[str, Any] = {
-    }
+    agent_config: dict[str, Any] = {}
     if allowed_tools:
         agent_config["allowed_tools"] = allowed_tools
 
@@ -161,6 +159,7 @@ async def run_single_task(
         raise typer.Exit(1)
     else:
         # Load from HuggingFace dataset
+        design.info(f"📊 Loading dataset from HuggingFace: {source}…")
         try:
             from datasets import load_dataset
         except ImportError as e:
@@ -195,14 +194,20 @@ async def run_full_dataset(
     agent_type: Literal["claude", "openai"] = "claude",
     model: str | None = None,
     allowed_tools: list[str] | None = None,
-    max_concurrent: int =
-    max_steps: int =
+    max_concurrent: int = 50,
+    max_steps: int = 10,
+    parallel: bool = False,
+    max_workers: int | None = None,
+    max_concurrent_per_worker: int = 25,
 ) -> list[Any]:
-    """Run evaluation across the entire dataset
+    """Run evaluation across the entire dataset.
+
+    Uses either asyncio-based run_dataset or process-based parallel execution
+    depending on the parallel flag."""
 
     # Import run_dataset lazily
     try:
-        from hud.datasets import run_dataset
+        from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
     except ImportError as e:
         design.error(
             "Dataset dependencies are not installed. "
@@ -240,8 +245,7 @@ async def run_full_dataset(
         )
         raise typer.Exit(1) from e
 
-    agent_config: dict[str, Any] = {
-    }
+    agent_config: dict[str, Any] = {}
     if allowed_tools:
         agent_config["allowed_tools"] = allowed_tools
 
@@ -263,16 +267,47 @@ async def run_full_dataset(
     if allowed_tools:
         agent_config["allowed_tools"] = allowed_tools
 
-
-
-
-
-
-
-
-
-
-
+    if parallel:
+        design.info(
+            f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…"  # noqa: E501
+        )
+        if max_workers is None:
+            # Use auto-optimization (now the default run_dataset_parallel)
+            return await run_dataset_parallel(
+                name=f"Evaluation {dataset_name}",
+                dataset=dataset_or_tasks,
+                agent_class=agent_class,
+                agent_config=agent_config,
+                max_concurrent=max_concurrent,
+                metadata={"dataset": source, "parallel": True},
+                max_steps=max_steps,
+                auto_respond=True,
+            )
+        else:
+            # Use manual configuration
+            return await run_dataset_parallel_manual(
+                name=f"Evaluation {dataset_name}",
+                dataset=dataset_or_tasks,
+                agent_class=agent_class,
+                agent_config=agent_config,
+                max_workers=max_workers,
+                max_concurrent_per_worker=max_concurrent_per_worker,
+                max_concurrent=max_concurrent,
+                metadata={"dataset": source, "parallel": True},
+                max_steps=max_steps,
+                auto_respond=True,
+            )
+    else:
+        design.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
+        return await run_dataset(
+            name=f"Evaluation {dataset_name}",
+            dataset=dataset_or_tasks,
+            agent_class=agent_class,
+            agent_config=agent_config,
+            max_concurrent=max_concurrent,
+            metadata={"dataset": source},
+            max_steps=max_steps,
+        )
 
 
 def eval_command(
@@ -303,13 +338,28 @@ def eval_command(
     max_concurrent: int = typer.Option(
         50,
         "--max-concurrent",
-        help="Concurrency level for
+        help="Concurrency level for asyncio mode (ignored in parallel mode)",
     ),
-    max_steps: int = typer.Option(
+    max_steps: int | None = typer.Option(
        None,
        "--max-steps",
        help="Maximum steps per task (default: 10 for single, 50 for full)",
    ),
+    parallel: bool = typer.Option(
+        False,
+        "--parallel",
+        help="Use process-based parallel execution for large datasets (100+ tasks)",
+    ),
+    max_workers: int | None = typer.Option(
+        None,
+        "--max-workers",
+        help="Number of worker processes for parallel mode (auto-optimized if not set)",
+    ),
+    max_concurrent_per_worker: int = typer.Option(
+        20,
+        "--max-concurrent-per-worker",
+        help="Maximum concurrent tasks per worker in parallel mode",
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
 
@@ -317,40 +367,42 @@ def eval_command(
     # Evaluate a single task from SheetBench
     hud eval hud-evals/SheetBench-50
 
-    # Evaluate the FULL SheetBench dataset with Claude
+    # Evaluate the FULL SheetBench dataset with Claude (asyncio mode)
     hud eval hud-evals/SheetBench-50 --full --agent claude
 
+    # Run large dataset with PARALLEL execution (auto-optimized)
+    hud eval hud-evals/OSWorld-Verified-XLang --full --parallel
+
+    # Parallel mode with manual configuration (16 workers, 25 tasks each)
+    hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16
+
+    # Limit total concurrent tasks to prevent rate limits
+    hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20
+
     # Run a single task from a JSON file
     hud eval task.json
 
-    # Run multiple tasks from a JSON file
-    hud eval tasks.json
-
-    # Run JSON list with full dataset mode and concurrency
-    hud eval tasks.json --full --max-concurrent 10
+    # Run multiple tasks from a JSON file with parallel execution
+    hud eval tasks.json --full --parallel
 
     # Run with OpenAI Operator agent
     hud eval hud-evals/OSWorld-Gold-Beta --agent openai
     """
-    import os
-
     from hud.settings import settings
 
     # Check for required API keys
     if agent == "claude":
-        if not settings.anthropic_api_key
+        if not settings.anthropic_api_key:
            design.error("ANTHROPIC_API_KEY is required for Claude agent")
            design.info("Set it in your environment or .env file: ANTHROPIC_API_KEY=your-key-here")
            raise typer.Exit(1)
-    elif agent == "openai" and
-        not settings.openai_api_key or not os.environ.get("OPENAI_API_KEY")
-    ):
+    elif agent == "openai" and not settings.openai_api_key:
        design.error("OPENAI_API_KEY is required for OpenAI agent")
        design.info("Set it in your environment or .env file: OPENAI_API_KEY=your-key-here")
        raise typer.Exit(1)
 
     # Check for HUD_API_KEY if using HUD services
-    if not settings.api_key
+    if not settings.api_key:
        design.warning("HUD_API_KEY not set. Some features may be limited.")
        design.info("Get your API key at: https://app.hud.so")
 
@@ -373,6 +425,9 @@ def eval_command(
             allowed_tools=allowed_tools_list,
             max_concurrent=max_concurrent,
             max_steps=max_steps,
+            parallel=parallel,
+            max_workers=max_workers,
+            max_concurrent_per_worker=max_concurrent_per_worker,
         )
     )
 else: