hud-python 0.4.27__py3-none-any.whl → 0.4.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic.

Files changed (76)
  1. hud/__init__.py +2 -1
  2. hud/agents/base.py +73 -45
  3. hud/agents/claude.py +8 -4
  4. hud/agents/openai_chat_generic.py +65 -40
  5. hud/agents/tests/test_base.py +0 -4
  6. hud/agents/tests/test_openai.py +1 -1
  7. hud/cli/__init__.py +182 -52
  8. hud/cli/dev.py +8 -9
  9. hud/cli/eval.py +317 -119
  10. hud/cli/flows/__init__.py +0 -0
  11. hud/cli/flows/tasks.py +0 -0
  12. hud/cli/get.py +160 -0
  13. hud/cli/rl/__init__.py +563 -71
  14. hud/cli/rl/config.py +94 -0
  15. hud/cli/rl/display.py +133 -0
  16. hud/cli/rl/gpu.py +63 -0
  17. hud/cli/rl/gpu_utils.py +318 -0
  18. hud/cli/rl/presets.py +96 -0
  19. hud/cli/rl/remote_runner.py +348 -0
  20. hud/cli/rl/rl_api.py +150 -0
  21. hud/cli/rl/vllm.py +177 -0
  22. hud/cli/tests/test_analyze_metadata.py +0 -1
  23. hud/cli/utils/tasks.py +26 -0
  24. hud/clients/base.py +21 -23
  25. hud/clients/mcp_use.py +36 -44
  26. hud/clients/tests/test_mcp_use_retry.py +10 -10
  27. hud/datasets/__init__.py +4 -3
  28. hud/datasets/{execution/parallel.py → parallel.py} +1 -1
  29. hud/datasets/{execution/runner.py → runner.py} +1 -1
  30. hud/datasets/utils.py +1 -1
  31. hud/native/tests/test_native_init.py +1 -1
  32. hud/otel/config.py +1 -1
  33. hud/otel/instrumentation.py +35 -0
  34. hud/rl/README.md +31 -0
  35. hud/rl/__init__.py +1 -0
  36. hud/rl/actor.py +174 -0
  37. hud/rl/buffer.py +371 -0
  38. hud/rl/chat_template.jinja +101 -0
  39. hud/rl/config.py +184 -0
  40. hud/rl/distributed.py +95 -0
  41. hud/rl/learner.py +586 -0
  42. hud/rl/tests/__init__.py +1 -0
  43. hud/rl/tests/test_learner.py +171 -0
  44. hud/rl/train.py +354 -0
  45. hud/rl/types.py +101 -0
  46. hud/rl/utils/start_vllm_server.sh +30 -0
  47. hud/rl/utils.py +524 -0
  48. hud/rl/vllm_adapter.py +125 -0
  49. hud/settings.py +6 -0
  50. hud/telemetry/__init__.py +2 -1
  51. hud/telemetry/job.py +46 -3
  52. hud/telemetry/tests/test_trace.py +3 -3
  53. hud/telemetry/trace.py +85 -13
  54. hud/tools/computer/hud.py +4 -4
  55. hud/tools/tests/test_computer.py +3 -3
  56. hud/tools/tests/test_computer_actions.py +1 -1
  57. hud/types.py +123 -2
  58. hud/utils/group_eval.py +223 -0
  59. hud/utils/hud_console.py +113 -13
  60. hud/utils/tasks.py +119 -0
  61. hud/utils/tests/test_version.py +1 -1
  62. hud/version.py +1 -1
  63. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/METADATA +20 -2
  64. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/RECORD +67 -47
  65. hud/cli/hf.py +0 -406
  66. hud/cli/rl/README.md +0 -243
  67. hud/cli/rl/init.py +0 -370
  68. hud/cli/rl/pod.py +0 -501
  69. hud/cli/rl/ssh.py +0 -322
  70. hud/cli/rl/train.py +0 -562
  71. hud/cli/rl/utils.py +0 -165
  72. hud/datasets/execution/__init__.py +0 -13
  73. hud/datasets/task.py +0 -116
  74. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/WHEEL +0 -0
  75. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/entry_points.txt +0 -0
  76. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/licenses/LICENSE +0 -0
hud/cli/__init__.py CHANGED
@@ -24,12 +24,10 @@ from .debug import debug_mcp_stdio
24
24
  from .dev import run_mcp_dev_server
25
25
 
26
26
  # Import new commands
27
- from .hf import hf_command
28
27
  from .init import create_environment
29
28
  from .pull import pull_command
30
29
  from .push import push_command
31
30
  from .remove import remove_command
32
- from .rl import rl_app
33
31
  from .utils.cursor import get_cursor_config_path, list_cursor_servers, parse_cursor_config
34
32
  from .utils.logging import CaptureLogger
35
33
 
@@ -777,18 +775,17 @@ def eval(
777
775
  "If not provided, looks for task.json in current directory."
778
776
  ),
779
777
  ),
778
+ agent: str | None = typer.Argument(
779
+ None,
780
+ help=(
781
+ "Agent backend to use (claude, openai, or vllm). If not provided, will prompt interactively." # noqa: E501
782
+ ),
783
+ ),
780
784
  full: bool = typer.Option(
781
785
  False,
782
786
  "--full",
783
787
  help="Run the entire dataset (omit for single-task debug mode)",
784
788
  ),
785
- agent: str | None = typer.Option(
786
- None,
787
- "--agent",
788
- help=(
789
- "Agent backend to use (claude or openai). If not provided, will prompt interactively."
790
- ),
791
- ),
792
789
  model: str | None = typer.Option(
793
790
  None,
794
791
  "--model",
@@ -829,8 +826,19 @@ def eval(
829
826
  "--verbose",
830
827
  help="Enable verbose output from the agent",
831
828
  ),
829
+ vllm_base_url: str | None = typer.Option(
830
+ None,
831
+ "--vllm-base-url",
832
+ help="Base URL for vLLM server (when using --agent vllm)",
833
+ ),
834
+ group_size: int = typer.Option(
835
+ 1,
836
+ "--group-size",
837
+ help="Number of times to run each task (similar to RL training)",
838
+ ),
832
839
  ) -> None:
833
840
  """🚀 Run evaluation on datasets or individual tasks with agents."""
841
+ from hud.settings import settings
834
842
  from hud.utils.hud_console import HUDConsole
835
843
 
836
844
  hud_console = HUDConsole()
@@ -884,33 +892,70 @@ def eval(
884
892
  source = file_choice
885
893
  hud_console.success(f"Selected: {source}")
886
894
 
887
- # If no agent specified, prompt for selection
895
+ # Import eval_command lazily to avoid importing agent dependencies
896
+ try:
897
+ from .eval import eval_command, get_available_models
898
+ except ImportError as e:
899
+ hud_console.error(
900
+ "Evaluation dependencies are not installed. "
901
+ "Please install with: pip install 'hud-python[agent]'"
902
+ )
903
+ raise typer.Exit(1) from e
904
+
905
+ # If no agent specified, fetch available models and prompt for selection
906
+ base_model = None
888
907
  if agent is None:
889
- agent = hud_console.select(
890
- "Select an agent to use:",
891
- choices=[
908
+ # Get available HUD models first
909
+ hud_models = get_available_models()
910
+
911
+ # Build choices starting with HUD models
912
+ choices = []
913
+
914
+ # Add HUD models as agent choices
915
+ for hud_model in hud_models:
916
+ model_name = hud_model["name"]
917
+ base_model = hud_model["base_model"]
918
+ vllm_status = " ⚡" if hud_model.get("vllm_url") else ""
919
+ choices.append({"name": f"{model_name}{vllm_status}", "value": f"{model_name}"})
920
+
921
+ # Add standard agent choices
922
+ choices.extend(
923
+ [
892
924
  {"name": "Claude 4 Sonnet", "value": "claude"},
893
925
  {"name": "OpenAI Computer Use", "value": "openai"},
894
- ],
895
- default="Claude 4 Sonnet",
926
+ {"name": "vLLM (Local Server)", "value": "vllm"},
927
+ ]
896
928
  )
897
929
 
930
+ agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
931
+
932
+ # Handle HUD model selection
933
+ if agent and agent not in ["claude", "openai", "vllm"]:
934
+ # Find remote model name
935
+ model = agent
936
+ if not vllm_base_url:
937
+ vllm_base_url = f"{settings.hud_rl_url}/models/{model}/vllm"
938
+
939
+ # Set model to base model for the vllm endpoint
940
+ if not base_model:
941
+ hud_models = get_available_models()
942
+ for hud_model in hud_models:
943
+ if hud_model["name"] == model:
944
+ base_model = hud_model["base_model"]
945
+ break
946
+ if not base_model:
947
+ hud_console.error(f"Model {model} not found")
948
+ raise typer.Exit(1)
949
+ model = base_model
950
+ agent = "vllm" # Use vLLM backend for HUD models
951
+ hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
952
+
898
953
  # Validate agent choice
899
- valid_agents = ["claude", "openai"]
954
+ valid_agents = ["claude", "openai", "vllm"]
900
955
  if agent not in valid_agents:
901
956
  hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
902
957
  raise typer.Exit(1)
903
958
 
904
- # Import eval_command lazily to avoid importing agent dependencies
905
- try:
906
- from .eval import eval_command
907
- except ImportError as e:
908
- hud_console.error(
909
- "Evaluation dependencies are not installed. "
910
- "Please install with: pip install 'hud-python[agent]'"
911
- )
912
- raise typer.Exit(1) from e
913
-
914
959
  # Run the command
915
960
  eval_command(
916
961
  source=source,
@@ -924,40 +969,117 @@ def eval(
924
969
  max_workers=max_workers,
925
970
  max_concurrent_per_worker=max_concurrent_per_worker,
926
971
  verbose=verbose,
972
+ vllm_base_url=vllm_base_url,
973
+ group_size=group_size,
927
974
  )
928
975
 
929
976
 
930
- # Add the RL subcommand group
931
- app.add_typer(rl_app, name="rl")
977
+ @app.command()
978
+ def get(
979
+ dataset_name: str = typer.Argument(
980
+ ..., help="HuggingFace dataset name (e.g., 'hud-evals/browser-2048-tasks')"
981
+ ),
982
+ split: str = typer.Option(
983
+ "train", "--split", "-s", help="Dataset split to download (train/test/validation)"
984
+ ),
985
+ output: Path | None = typer.Option( # noqa: B008
986
+ None, "--output", "-o", help="Output filename (defaults to dataset_name.jsonl)"
987
+ ),
988
+ limit: int | None = typer.Option(
989
+ None, "--limit", "-l", help="Limit number of examples to download"
990
+ ),
991
+ format: str = typer.Option(
992
+ "json",
993
+ "--format",
994
+ "-f",
995
+ help="Output format: json (list) or jsonl (one task per line)",
996
+ ),
997
+ ) -> None:
998
+ """📥 Download a HuggingFace dataset and save it as JSONL."""
999
+ from .get import get_command
1000
+
1001
+ get_command(
1002
+ dataset_name=dataset_name,
1003
+ split=split,
1004
+ output=output,
1005
+ limit=limit,
1006
+ format=format,
1007
+ )
932
1008
 
933
1009
 
934
1010
  @app.command()
935
- def hf(
936
- tasks_file: Path | None = typer.Argument( # noqa: B008
937
- None, help="JSON file containing tasks (auto-detected if not provided)"
1011
+ def rl(
1012
+ tasks_file: str | None = typer.Argument(
1013
+ None,
1014
+ help=(
1015
+ "Path to tasks file (JSON/JSONL) or HuggingFace dataset name. "
1016
+ "If not provided, looks for tasks.json or tasks.jsonl in current directory."
1017
+ ),
1018
+ ),
1019
+ model: str | None = typer.Argument(
1020
+ None,
1021
+ help="Model to train (default: interactive selection)",
1022
+ ),
1023
+ config_file: Path | None = typer.Option( # noqa: B008
1024
+ None,
1025
+ "--config",
1026
+ "-c",
1027
+ help="Path to existing configuration file",
1028
+ ),
1029
+ output_dir: str = typer.Option(
1030
+ "checkpoints",
1031
+ "--output-dir",
1032
+ "-o",
1033
+ help="Output directory for checkpoints",
938
1034
  ),
939
- name: str | None = typer.Option(
940
- None, "--name", "-n", help="Dataset name (e.g., 'my-org/my-dataset')"
1035
+ restart: bool = typer.Option(
1036
+ False,
1037
+ "--restart",
1038
+ help="Restart the vLLM server before training",
941
1039
  ),
942
- push: bool = typer.Option(True, "--push/--no-push", help="Push to HuggingFace Hub"),
943
- private: bool = typer.Option(False, "--private", help="Make dataset private on Hub"),
944
- update_lock: bool = typer.Option(
945
- True, "--update-lock/--no-update-lock", help="Update hud.lock.yaml"
1040
+ verbose: bool = typer.Option(
1041
+ False,
1042
+ "--verbose",
1043
+ "-v",
1044
+ help="Enable verbose output",
1045
+ ),
1046
+ local: bool = typer.Option(
1047
+ False,
1048
+ "--local",
1049
+ help="Run training locally instead of using remote API server",
1050
+ ),
1051
+ no_ddp: bool = typer.Option(
1052
+ False,
1053
+ "--no-ddp",
1054
+ help="Disable DDP even with multiple GPUs",
1055
+ ),
1056
+ ddp_gpus: str | None = typer.Option(
1057
+ None,
1058
+ "--ddp-gpus",
1059
+ help="Specific GPUs for DDP (e.g., '0,1,2,3')",
1060
+ ),
1061
+ vllm_gpu: int | None = typer.Option(
1062
+ None,
1063
+ "--vllm-gpu",
1064
+ help="Specific GPU for vLLM server",
946
1065
  ),
947
- token: str | None = typer.Option(None, "--token", help="HuggingFace API token"),
948
1066
  ) -> None:
949
- """📊 Convert tasks to HuggingFace dataset format.
950
-
951
- Automatically detects task files if not specified.
952
- Suggests dataset name based on environment if not provided.
1067
+ """🎯 Run GRPO reinforcement learning training on tasks."""
1068
+ # Import from the rl module
1069
+ from .rl import rl_command
953
1070
 
954
- Examples:
955
- hud hf # Auto-detect tasks and suggest name
956
- hud hf tasks.json # Use specific file, suggest name
957
- hud hf --name my-org/my-tasks # Auto-detect tasks, use name
958
- hud hf tasks.json --name hud-evals/web-tasks --private
959
- """
960
- hf_command(tasks_file, name, push, private, update_lock, token)
1071
+ rl_command(
1072
+ tasks_file=tasks_file,
1073
+ model=model,
1074
+ config_file=config_file,
1075
+ output_dir=output_dir,
1076
+ restart=restart,
1077
+ verbose=verbose,
1078
+ local=local,
1079
+ no_ddp=no_ddp,
1080
+ ddp_gpus=ddp_gpus,
1081
+ vllm_gpu=vllm_gpu,
1082
+ )
961
1083
 
962
1084
 
963
1085
  def main() -> None:
@@ -990,12 +1112,20 @@ def main() -> None:
990
1112
  console.print(" 4. Share your environment: [cyan]hud push[/cyan]")
991
1113
  console.print(" 5. Get shared environments: [cyan]hud pull <org/name:tag>[/cyan]")
992
1114
  console.print(" 6. Run and test: [cyan]hud run <image>[/cyan]")
993
- console.print("\n[yellow]RL Training:[/yellow]")
994
- console.print(" 1. Generate config: [cyan]hud rl init my-env:latest[/cyan]")
1115
+ console.print("\n[yellow]Datasets & RL Training:[/yellow]")
1116
+ console.print(" 1. Get dataset: [cyan]hud get hud-evals/browser-2048-tasks[/cyan]")
995
1117
  console.print(
996
1118
  " 2. Create dataset: [cyan]hud hf tasks.json --name my-org/my-tasks[/cyan]"
997
1119
  )
998
- console.print(" 3. Start training: [cyan]hud rl --model Qwen/Qwen2.5-3B[/cyan]\n")
1120
+ console.print(
1121
+ " 3. Start training: [cyan]hud rl browser-2048-tasks.jsonl --local[/cyan]"
1122
+ )
1123
+ console.print(
1124
+ " 4. Custom model: [cyan]hud rl tasks.jsonl --model meta-llama/Llama-3.2-3B --local[/cyan]" # noqa: E501
1125
+ )
1126
+ console.print(
1127
+ " 5. Restart server: [cyan]hud rl tasks.jsonl --restart --local[/cyan]\n"
1128
+ )
999
1129
 
1000
1130
  app()
1001
1131
  except typer.Exit as e:
hud/cli/dev.py CHANGED
@@ -775,15 +775,14 @@ def run_mcp_dev_server(
775
775
  hud_console.progress_message("🧪 Run with --interactive for interactive testing mode")
776
776
 
777
777
  # Disable logs and hot-reload if interactive mode is enabled
778
- if interactive:
779
- if not no_logs:
780
- hud_console.warning("Docker logs disabled in interactive mode for better UI experience")
781
- no_logs = True
782
- if not no_reload:
783
- hud_console.warning(
784
- "Hot-reload disabled in interactive mode to prevent output interference"
785
- )
786
- no_reload = True
778
+ if interactive and not no_logs:
779
+ hud_console.warning("Docker logs disabled in interactive mode for better UI experience")
780
+ no_logs = True
781
+ # if not no_reload:
782
+ # hud_console.warning(
783
+ # "Hot-reload disabled in interactive mode to prevent output interference"
784
+ # )
785
+ # no_reload = True
787
786
 
788
787
  # Show configuration as JSON (just the server config, not wrapped)
789
788
  full_config = {}