hud-python 0.4.27__py3-none-any.whl → 0.4.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +2 -1
- hud/agents/base.py +73 -45
- hud/agents/claude.py +8 -4
- hud/agents/openai_chat_generic.py +65 -40
- hud/agents/tests/test_base.py +0 -4
- hud/agents/tests/test_openai.py +1 -1
- hud/cli/__init__.py +182 -52
- hud/cli/dev.py +8 -9
- hud/cli/eval.py +317 -119
- hud/cli/flows/__init__.py +0 -0
- hud/cli/flows/tasks.py +0 -0
- hud/cli/get.py +160 -0
- hud/cli/rl/__init__.py +563 -71
- hud/cli/rl/config.py +94 -0
- hud/cli/rl/display.py +133 -0
- hud/cli/rl/gpu.py +63 -0
- hud/cli/rl/gpu_utils.py +318 -0
- hud/cli/rl/presets.py +96 -0
- hud/cli/rl/remote_runner.py +348 -0
- hud/cli/rl/rl_api.py +150 -0
- hud/cli/rl/vllm.py +177 -0
- hud/cli/tests/test_analyze_metadata.py +0 -1
- hud/cli/utils/tasks.py +26 -0
- hud/clients/base.py +21 -23
- hud/clients/mcp_use.py +36 -44
- hud/clients/tests/test_mcp_use_retry.py +10 -10
- hud/datasets/__init__.py +4 -3
- hud/datasets/{execution/parallel.py → parallel.py} +1 -1
- hud/datasets/{execution/runner.py → runner.py} +1 -1
- hud/datasets/utils.py +1 -1
- hud/native/tests/test_native_init.py +1 -1
- hud/otel/config.py +1 -1
- hud/otel/instrumentation.py +35 -0
- hud/rl/README.md +31 -0
- hud/rl/__init__.py +1 -0
- hud/rl/actor.py +174 -0
- hud/rl/buffer.py +371 -0
- hud/rl/chat_template.jinja +101 -0
- hud/rl/config.py +184 -0
- hud/rl/distributed.py +95 -0
- hud/rl/learner.py +586 -0
- hud/rl/tests/__init__.py +1 -0
- hud/rl/tests/test_learner.py +171 -0
- hud/rl/train.py +354 -0
- hud/rl/types.py +101 -0
- hud/rl/utils/start_vllm_server.sh +30 -0
- hud/rl/utils.py +524 -0
- hud/rl/vllm_adapter.py +125 -0
- hud/settings.py +6 -0
- hud/telemetry/__init__.py +2 -1
- hud/telemetry/job.py +46 -3
- hud/telemetry/tests/test_trace.py +3 -3
- hud/telemetry/trace.py +85 -13
- hud/tools/computer/hud.py +4 -4
- hud/tools/tests/test_computer.py +3 -3
- hud/tools/tests/test_computer_actions.py +1 -1
- hud/types.py +123 -2
- hud/utils/group_eval.py +223 -0
- hud/utils/hud_console.py +113 -13
- hud/utils/tasks.py +119 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/METADATA +20 -2
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/RECORD +67 -47
- hud/cli/hf.py +0 -406
- hud/cli/rl/README.md +0 -243
- hud/cli/rl/init.py +0 -370
- hud/cli/rl/pod.py +0 -501
- hud/cli/rl/ssh.py +0 -322
- hud/cli/rl/train.py +0 -562
- hud/cli/rl/utils.py +0 -165
- hud/datasets/execution/__init__.py +0 -13
- hud/datasets/task.py +0 -116
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/WHEEL +0 -0
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/licenses/LICENSE +0 -0
hud/cli/__init__.py
CHANGED
@@ -24,12 +24,10 @@ from .debug import debug_mcp_stdio
 from .dev import run_mcp_dev_server
 
 # Import new commands
-from .hf import hf_command
 from .init import create_environment
 from .pull import pull_command
 from .push import push_command
 from .remove import remove_command
-from .rl import rl_app
 from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
 from .utils.logging import CaptureLogger
 
@@ -777,18 +775,17 @@ def eval(
             "If not provided, looks for task.json in current directory."
         ),
     ),
+    agent: str | None = typer.Argument(
+        None,
+        help=(
+            "Agent backend to use (claude, openai, or vllm). If not provided, will prompt interactively."  # noqa: E501
+        ),
+    ),
     full: bool = typer.Option(
         False,
         "--full",
         help="Run the entire dataset (omit for single-task debug mode)",
     ),
-    agent: str | None = typer.Option(
-        None,
-        "--agent",
-        help=(
-            "Agent backend to use (claude or openai). If not provided, will prompt interactively."
-        ),
-    ),
     model: str | None = typer.Option(
         None,
         "--model",
@@ -829,8 +826,19 @@ def eval(
         "--verbose",
         help="Enable verbose output from the agent",
     ),
+    vllm_base_url: str | None = typer.Option(
+        None,
+        "--vllm-base-url",
+        help="Base URL for vLLM server (when using --agent vllm)",
+    ),
+    group_size: int = typer.Option(
+        1,
+        "--group-size",
+        help="Number of times to run each task (similar to RL training)",
+    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents."""
+    from hud.settings import settings
     from hud.utils.hud_console import HUDConsole
 
     hud_console = HUDConsole()
@@ -884,33 +892,70 @@ def eval(
             source = file_choice
             hud_console.success(f"Selected: {source}")
 
-    #
+    # Import eval_command lazily to avoid importing agent dependencies
+    try:
+        from .eval import eval_command, get_available_models
+    except ImportError as e:
+        hud_console.error(
+            "Evaluation dependencies are not installed. "
+            "Please install with: pip install 'hud-python[agent]'"
+        )
+        raise typer.Exit(1) from e
+
+    # If no agent specified, fetch available models and prompt for selection
+    base_model = None
     if agent is None:
-
-
-
+        # Get available HUD models first
+        hud_models = get_available_models()
+
+        # Build choices starting with HUD models
+        choices = []
+
+        # Add HUD models as agent choices
+        for hud_model in hud_models:
+            model_name = hud_model["name"]
+            base_model = hud_model["base_model"]
+            vllm_status = " ⚡" if hud_model.get("vllm_url") else ""
+            choices.append({"name": f"{model_name}{vllm_status}", "value": f"{model_name}"})
+
+        # Add standard agent choices
+        choices.extend(
+            [
                 {"name": "Claude 4 Sonnet", "value": "claude"},
                 {"name": "OpenAI Computer Use", "value": "openai"},
-
-
+                {"name": "vLLM (Local Server)", "value": "vllm"},
+            ]
         )
 
+        agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
+
+    # Handle HUD model selection
+    if agent and agent not in ["claude", "openai", "vllm"]:
+        # Find remote model name
+        model = agent
+        if not vllm_base_url:
+            vllm_base_url = f"{settings.hud_rl_url}/models/{model}/vllm"
+
+        # Set model to base model for the vllm endpoint
+        if not base_model:
+            hud_models = get_available_models()
+            for hud_model in hud_models:
+                if hud_model["name"] == model:
+                    base_model = hud_model["base_model"]
+                    break
+            if not base_model:
+                hud_console.error(f"Model {model} not found")
+                raise typer.Exit(1)
+        model = base_model
+        agent = "vllm"  # Use vLLM backend for HUD models
+        hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
+
     # Validate agent choice
-    valid_agents = ["claude", "openai"]
+    valid_agents = ["claude", "openai", "vllm"]
     if agent not in valid_agents:
        hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
        raise typer.Exit(1)
 
-    # Import eval_command lazily to avoid importing agent dependencies
-    try:
-        from .eval import eval_command
-    except ImportError as e:
-        hud_console.error(
-            "Evaluation dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
-        )
-        raise typer.Exit(1) from e
-
     # Run the command
     eval_command(
         source=source,
@@ -924,40 +969,117 @@ def eval(
         max_workers=max_workers,
         max_concurrent_per_worker=max_concurrent_per_worker,
         verbose=verbose,
+        vllm_base_url=vllm_base_url,
+        group_size=group_size,
     )
 
 
-
-
+@app.command()
+def get(
+    dataset_name: str = typer.Argument(
+        ..., help="HuggingFace dataset name (e.g., 'hud-evals/browser-2048-tasks')"
+    ),
+    split: str = typer.Option(
+        "train", "--split", "-s", help="Dataset split to download (train/test/validation)"
+    ),
+    output: Path | None = typer.Option(  # noqa: B008
+        None, "--output", "-o", help="Output filename (defaults to dataset_name.jsonl)"
+    ),
+    limit: int | None = typer.Option(
+        None, "--limit", "-l", help="Limit number of examples to download"
+    ),
+    format: str = typer.Option(
+        "json",
+        "--format",
+        "-f",
+        help="Output format: json (list) or jsonl (one task per line)",
+    ),
+) -> None:
+    """📥 Download a HuggingFace dataset and save it as JSONL."""
+    from .get import get_command
+
+    get_command(
+        dataset_name=dataset_name,
+        split=split,
+        output=output,
+        limit=limit,
+        format=format,
+    )
 
 
 @app.command()
-def 
-    tasks_file:
-    None,
+def rl(
+    tasks_file: str | None = typer.Argument(
+        None,
+        help=(
+            "Path to tasks file (JSON/JSONL) or HuggingFace dataset name. "
+            "If not provided, looks for tasks.json or tasks.jsonl in current directory."
+        ),
+    ),
+    model: str | None = typer.Argument(
+        None,
+        help="Model to train (default: interactive selection)",
+    ),
+    config_file: Path | None = typer.Option(  # noqa: B008
+        None,
+        "--config",
+        "-c",
+        help="Path to existing configuration file",
+    ),
+    output_dir: str = typer.Option(
+        "checkpoints",
+        "--output-dir",
+        "-o",
+        help="Output directory for checkpoints",
     ),
-
-
+    restart: bool = typer.Option(
+        False,
+        "--restart",
+        help="Restart the vLLM server before training",
     ),
-
-
-
-
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Enable verbose output",
+    ),
+    local: bool = typer.Option(
+        False,
+        "--local",
+        help="Run training locally instead of using remote API server",
+    ),
+    no_ddp: bool = typer.Option(
+        False,
+        "--no-ddp",
+        help="Disable DDP even with multiple GPUs",
+    ),
+    ddp_gpus: str | None = typer.Option(
+        None,
+        "--ddp-gpus",
+        help="Specific GPUs for DDP (e.g., '0,1,2,3')",
+    ),
+    vllm_gpu: int | None = typer.Option(
+        None,
+        "--vllm-gpu",
+        help="Specific GPU for vLLM server",
    ),
-    token: str | None = typer.Option(None, "--token", help="HuggingFace API token"),
 ) -> None:
-    """
-
-
-    Suggests dataset name based on environment if not provided.
+    """🎯 Run GRPO reinforcement learning training on tasks."""
+    # Import from the rl module
+    from .rl import rl_command
 
-
-
-
-
-
-
-
+    rl_command(
+        tasks_file=tasks_file,
+        model=model,
+        config_file=config_file,
+        output_dir=output_dir,
+        restart=restart,
+        verbose=verbose,
+        local=local,
+        no_ddp=no_ddp,
+        ddp_gpus=ddp_gpus,
+        vllm_gpu=vllm_gpu,
+    )
 
 
 def main() -> None:
@@ -990,12 +1112,20 @@ def main() -> None:
        console.print(" 4. Share your environment: [cyan]hud push[/cyan]")
        console.print(" 5. Get shared environments: [cyan]hud pull <org/name:tag>[/cyan]")
        console.print(" 6. Run and test: [cyan]hud run <image>[/cyan]")
-        console.print("\n[yellow]RL Training:[/yellow]")
-        console.print(" 1. 
+        console.print("\n[yellow]Datasets & RL Training:[/yellow]")
+        console.print(" 1. Get dataset: [cyan]hud get hud-evals/browser-2048-tasks[/cyan]")
        console.print(
            " 2. Create dataset: [cyan]hud hf tasks.json --name my-org/my-tasks[/cyan]"
        )
-        console.print(
+        console.print(
+            " 3. Start training: [cyan]hud rl browser-2048-tasks.jsonl --local[/cyan]"
+        )
+        console.print(
+            " 4. Custom model: [cyan]hud rl tasks.jsonl --model meta-llama/Llama-3.2-3B --local[/cyan]"  # noqa: E501
+        )
+        console.print(
+            " 5. Restart server: [cyan]hud rl tasks.jsonl --restart --local[/cyan]\n"
+        )
 
        app()
    except typer.Exit as e:
hud/cli/dev.py
CHANGED
|
@@ -775,15 +775,14 @@ def run_mcp_dev_server(
|
|
|
775
775
|
hud_console.progress_message("🧪 Run with --interactive for interactive testing mode")
|
|
776
776
|
|
|
777
777
|
# Disable logs and hot-reload if interactive mode is enabled
|
|
778
|
-
if interactive:
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
no_reload = True
|
|
778
|
+
if interactive and not no_logs:
|
|
779
|
+
hud_console.warning("Docker logs disabled in interactive mode for better UI experience")
|
|
780
|
+
no_logs = True
|
|
781
|
+
# if not no_reload:
|
|
782
|
+
# hud_console.warning(
|
|
783
|
+
# "Hot-reload disabled in interactive mode to prevent output interference"
|
|
784
|
+
# )
|
|
785
|
+
# no_reload = True
|
|
787
786
|
|
|
788
787
|
# Show configuration as JSON (just the server config, not wrapped)
|
|
789
788
|
full_config = {}
|