hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/cli/rl/remote_runner.py
DELETED
|
@@ -1,463 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Remote runner for HUD RL training via API server.
|
|
3
|
-
|
|
4
|
-
This module implements the new interactive flow for RL training.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
import time
|
|
10
|
-
import uuid
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
|
|
13
|
-
from rich.console import Console
|
|
14
|
-
|
|
15
|
-
from hud.cli.rl.celebrate import show_confetti_async
|
|
16
|
-
from hud.cli.rl.gpu_utils import adjust_config_for_ddp
|
|
17
|
-
from hud.cli.rl.viewer import show_json_interactive
|
|
18
|
-
from hud.cli.rl.wait_utils import wait_for_enter_cancel_or_change
|
|
19
|
-
from hud.utils.hud_console import hud_console
|
|
20
|
-
from hud.utils.tasks import load_tasks
|
|
21
|
-
|
|
22
|
-
from . import rl_api
|
|
23
|
-
from .config import generate_config_interactive, load_config, save_config
|
|
24
|
-
from .presets import get_training_presets
|
|
25
|
-
|
|
26
|
-
console = Console()
|
|
27
|
-
|
|
28
|
-
# GPU pricing information
|
|
29
|
-
GPU_PRICING = {
|
|
30
|
-
"A100": {"price": "1", "memory": "80GB"},
|
|
31
|
-
"H100": {"price": "2", "memory": "80GB"},
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def ensure_vllm_deployed(
|
|
36
|
-
model_name: str, gpu_type: str = "A100", gpu_count: int = 1, timeout: int = 600
|
|
37
|
-
) -> None:
|
|
38
|
-
"""Deploy vLLM for a model if needed and wait until it's ready.
|
|
39
|
-
|
|
40
|
-
Args:
|
|
41
|
-
model_name: The name of the model to deploy vLLM for
|
|
42
|
-
gpu_type: GPU type to use for deployment (e.g., A100, H100)
|
|
43
|
-
timeout: Max seconds to wait for vLLM to be ready
|
|
44
|
-
"""
|
|
45
|
-
# Check current model status
|
|
46
|
-
info = rl_api.get_model(model_name)
|
|
47
|
-
if info.vllm_url:
|
|
48
|
-
hud_console.success("vLLM server already running")
|
|
49
|
-
return
|
|
50
|
-
|
|
51
|
-
hud_console.info(f"Deploying vLLM server for {model_name}...")
|
|
52
|
-
rl_api.deploy_vllm(model_name, gpu_type=gpu_type, gpu_count=gpu_count)
|
|
53
|
-
hud_console.success("vLLM deployment started")
|
|
54
|
-
|
|
55
|
-
hud_console.info("Waiting for vLLM server to be ready...")
|
|
56
|
-
start_time = time.time()
|
|
57
|
-
with hud_console.progress() as progress:
|
|
58
|
-
progress.update("Checking deployment status (see live status on https://hud.so/models)")
|
|
59
|
-
while True:
|
|
60
|
-
if time.time() - start_time > timeout:
|
|
61
|
-
hud_console.error("Timeout waiting for vLLM deployment")
|
|
62
|
-
raise ValueError("vLLM deployment timeout")
|
|
63
|
-
info = rl_api.get_model(model_name)
|
|
64
|
-
if info.status == "ready":
|
|
65
|
-
hud_console.success(
|
|
66
|
-
f"vLLM server ready at http://rl.hud.so/v1/models/{model_name}/vllm"
|
|
67
|
-
)
|
|
68
|
-
break
|
|
69
|
-
time.sleep(5)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def run_remote_training(
|
|
73
|
-
tasks_file: str | None,
|
|
74
|
-
model: str | None,
|
|
75
|
-
config_file: Path | None,
|
|
76
|
-
output_dir: str,
|
|
77
|
-
vllm_gpu_count: int = 1,
|
|
78
|
-
yes: bool = False,
|
|
79
|
-
) -> None:
|
|
80
|
-
"""Run RL training remotely via the API server following the new interactive flow."""
|
|
81
|
-
from hud.settings import settings
|
|
82
|
-
|
|
83
|
-
if not settings.api_key:
|
|
84
|
-
hud_console.error("API key not found")
|
|
85
|
-
console.print(
|
|
86
|
-
"[yellow]Set it in your environment or run: hud set HUD_API_KEY=your-key-here[/yellow]"
|
|
87
|
-
)
|
|
88
|
-
raise ValueError("API key not found")
|
|
89
|
-
|
|
90
|
-
# Step 1: CONFIRMATION - Load tasks
|
|
91
|
-
if tasks_file:
|
|
92
|
-
tasks: list[Task] = load_tasks(tasks_file) # type: ignore[assignment]
|
|
93
|
-
# Resolve tasks immediately after loading (validate + fill defaults)
|
|
94
|
-
from hud.types import Task
|
|
95
|
-
|
|
96
|
-
resolved_tasks: list[dict] = []
|
|
97
|
-
for t in tasks:
|
|
98
|
-
try:
|
|
99
|
-
resolved = Task(**t.model_dump()).model_dump()
|
|
100
|
-
except Exception:
|
|
101
|
-
resolved = t.model_dump()
|
|
102
|
-
resolved_tasks.append(resolved)
|
|
103
|
-
|
|
104
|
-
# Preview resolved task
|
|
105
|
-
if resolved_tasks and not yes:
|
|
106
|
-
try:
|
|
107
|
-
show_json_interactive(resolved_tasks[0], title="Task Preview")
|
|
108
|
-
except Exception as e:
|
|
109
|
-
hud_console.warning(f"Interactive viewer failed: {e}")
|
|
110
|
-
else:
|
|
111
|
-
raise ValueError("Tasks file not found")
|
|
112
|
-
|
|
113
|
-
# Show example task for confirmation
|
|
114
|
-
# hud_console.section_title("Example Task from Dataset")
|
|
115
|
-
|
|
116
|
-
# if tasks:
|
|
117
|
-
# # Display task with truncated values
|
|
118
|
-
# try:
|
|
119
|
-
# task_data = resolved_tasks[0]
|
|
120
|
-
# except Exception:
|
|
121
|
-
# task_data = tasks[0].model_dump()
|
|
122
|
-
# truncated_data = {}
|
|
123
|
-
# max_value_length = 120 # Maximum characters to show per line
|
|
124
|
-
|
|
125
|
-
# for key, value in task_data.items():
|
|
126
|
-
# value_str = str(value)
|
|
127
|
-
# if len(value_str) > max_value_length:
|
|
128
|
-
# truncated_data[key] = value_str[:max_value_length] + "..."
|
|
129
|
-
# else:
|
|
130
|
-
# truncated_data[key] = value_str
|
|
131
|
-
|
|
132
|
-
# hud_console.key_value_table(truncated_data)
|
|
133
|
-
|
|
134
|
-
# if not hud_console.confirm("Proceed with training on this dataset?", default=True):
|
|
135
|
-
# hud_console.error("Training cancelled")
|
|
136
|
-
# return
|
|
137
|
-
|
|
138
|
-
# Step 2: MODEL SELECTION
|
|
139
|
-
hud_console.section_title("Model Selection")
|
|
140
|
-
|
|
141
|
-
# Fetch existing models
|
|
142
|
-
hud_console.info("Fetching your models from https://hud.so/models")
|
|
143
|
-
|
|
144
|
-
try:
|
|
145
|
-
models = rl_api.list_models()
|
|
146
|
-
# Filter for active/training models and sort by recency
|
|
147
|
-
active_models = [m for m in models if m.status in ["ready", "training"]]
|
|
148
|
-
active_models.sort(key=lambda m: m.created_at or "", reverse=True)
|
|
149
|
-
|
|
150
|
-
if active_models or model is None:
|
|
151
|
-
# Build choices
|
|
152
|
-
choices = []
|
|
153
|
-
for m in active_models:
|
|
154
|
-
status_emoji = {
|
|
155
|
-
"ready": "✅",
|
|
156
|
-
"training": "🔄",
|
|
157
|
-
"deploying": "🚀",
|
|
158
|
-
"pending": "⏳",
|
|
159
|
-
}.get(m.status, "❓")
|
|
160
|
-
|
|
161
|
-
choices.append({"name": f"{status_emoji} {m.name} ({m.status})", "value": m.name})
|
|
162
|
-
|
|
163
|
-
choices.append({"name": "Create new model", "value": "__new__"})
|
|
164
|
-
|
|
165
|
-
if not model:
|
|
166
|
-
if yes:
|
|
167
|
-
# In yes mode, always create a new model to avoid conflicts
|
|
168
|
-
selected = "__new__"
|
|
169
|
-
hud_console.info("Auto-creating new model (--yes mode)")
|
|
170
|
-
elif choices:
|
|
171
|
-
selected = hud_console.select("Select a model:", choices=choices)
|
|
172
|
-
else:
|
|
173
|
-
selected = "__new__"
|
|
174
|
-
hud_console.hint("No existing models found. Creating new model...")
|
|
175
|
-
else:
|
|
176
|
-
# Model was provided via CLI
|
|
177
|
-
selected = model
|
|
178
|
-
|
|
179
|
-
else:
|
|
180
|
-
selected = "__new__"
|
|
181
|
-
|
|
182
|
-
# Handle model selection
|
|
183
|
-
if selected == "__new__":
|
|
184
|
-
# Create new model flow
|
|
185
|
-
hud_console.info("Creating new model...")
|
|
186
|
-
|
|
187
|
-
# Ask for model type
|
|
188
|
-
if yes:
|
|
189
|
-
if config_file:
|
|
190
|
-
config = load_config(config_file)
|
|
191
|
-
model_type = config.model.base_model
|
|
192
|
-
else:
|
|
193
|
-
model_type = "Qwen/Qwen2.5-VL-3B-Instruct"
|
|
194
|
-
hud_console.info(f"Auto-selecting base model: {model_type} (--yes mode)")
|
|
195
|
-
else:
|
|
196
|
-
model_type = hud_console.select(
|
|
197
|
-
"Select base model type:",
|
|
198
|
-
choices=[
|
|
199
|
-
{"name": "Qwen2.5-VL-3B-Instruct", "value": "Qwen/Qwen2.5-VL-3B-Instruct"},
|
|
200
|
-
{"name": "Qwen2.5-3B-Instruct", "value": "Qwen/Qwen2.5-3B-Instruct"},
|
|
201
|
-
],
|
|
202
|
-
default=0,
|
|
203
|
-
)
|
|
204
|
-
from rich.prompt import Prompt
|
|
205
|
-
|
|
206
|
-
# Ask for model name
|
|
207
|
-
base_default = model_type.split("/")[-1].lower()
|
|
208
|
-
default_name = base_default
|
|
209
|
-
existing_names = {m.name for m in active_models}
|
|
210
|
-
suffix = 1
|
|
211
|
-
while default_name in existing_names:
|
|
212
|
-
default_name = f"{base_default}-{suffix}"
|
|
213
|
-
suffix += 1
|
|
214
|
-
|
|
215
|
-
if yes:
|
|
216
|
-
model_name = default_name
|
|
217
|
-
hud_console.info(f"Auto-using model name: {model_name} (--yes mode)")
|
|
218
|
-
else:
|
|
219
|
-
hud_console.info(f"Enter model name (default: {default_name}):")
|
|
220
|
-
model_name = Prompt.ask("Model name", default=default_name)
|
|
221
|
-
model_name = model_name.replace("/", "-").lower()
|
|
222
|
-
|
|
223
|
-
# Create the model with retry on name conflict
|
|
224
|
-
hud_console.info(f"Creating model: {model_name}")
|
|
225
|
-
try:
|
|
226
|
-
rl_api.create_model(model_name, model_type)
|
|
227
|
-
hud_console.success(f"Created model: {model_name}")
|
|
228
|
-
ensure_vllm_deployed(model_name, gpu_type="A100", gpu_count=vllm_gpu_count)
|
|
229
|
-
|
|
230
|
-
except Exception as e:
|
|
231
|
-
# If the name already exists, suggest a new name and prompt once
|
|
232
|
-
message = str(e)
|
|
233
|
-
if "already exists" in message or "409" in message:
|
|
234
|
-
alt_name = f"{model_name}-1"
|
|
235
|
-
i = 1
|
|
236
|
-
while True:
|
|
237
|
-
candidate = f"{model_name}-{str(uuid.uuid4())[:4]}"
|
|
238
|
-
if candidate not in existing_names:
|
|
239
|
-
alt_name = candidate
|
|
240
|
-
break
|
|
241
|
-
i += 1
|
|
242
|
-
hud_console.warning(
|
|
243
|
-
f"Model '{model_name}' exists. Suggesting '{alt_name}' instead."
|
|
244
|
-
)
|
|
245
|
-
try:
|
|
246
|
-
from rich.prompt import Prompt as _Prompt
|
|
247
|
-
|
|
248
|
-
if yes:
|
|
249
|
-
chosen = alt_name
|
|
250
|
-
hud_console.info(f"Auto-using suggested name: {chosen} (--yes mode)")
|
|
251
|
-
else:
|
|
252
|
-
chosen = _Prompt.ask("Use different name", default=alt_name)
|
|
253
|
-
chosen = chosen.replace("/", "-").lower()
|
|
254
|
-
rl_api.create_model(chosen, model_type)
|
|
255
|
-
hud_console.success(f"Created model: {chosen}")
|
|
256
|
-
model_name = chosen
|
|
257
|
-
ensure_vllm_deployed(model_name, gpu_type="A100", gpu_count=vllm_gpu_count)
|
|
258
|
-
except Exception as e2:
|
|
259
|
-
hud_console.error(f"Failed to create model: {e2}")
|
|
260
|
-
raise
|
|
261
|
-
else:
|
|
262
|
-
hud_console.error(f"Failed to create model: {e}")
|
|
263
|
-
raise
|
|
264
|
-
|
|
265
|
-
else:
|
|
266
|
-
# Existing model selected
|
|
267
|
-
model_name = selected
|
|
268
|
-
model_info = rl_api.get_model(model_name)
|
|
269
|
-
|
|
270
|
-
# Check if model is in training
|
|
271
|
-
if model_info.status == "training":
|
|
272
|
-
if yes:
|
|
273
|
-
# In yes mode, skip training if model is already training
|
|
274
|
-
hud_console.warning(f"{model_name} is already training, skipping (--yes mode)")
|
|
275
|
-
return
|
|
276
|
-
elif hud_console.confirm(
|
|
277
|
-
f"{model_name} is currently training. Stop current training?", default=False
|
|
278
|
-
):
|
|
279
|
-
hud_console.info(f"Stopping training for {model_name}...")
|
|
280
|
-
try:
|
|
281
|
-
rl_api.stop_training(model_name)
|
|
282
|
-
hud_console.success("Training stopped")
|
|
283
|
-
except Exception as e:
|
|
284
|
-
hud_console.error(f"Failed to stop training: {e}")
|
|
285
|
-
raise
|
|
286
|
-
else:
|
|
287
|
-
hud_console.error("Cannot start new training while model is already training")
|
|
288
|
-
return
|
|
289
|
-
|
|
290
|
-
# Ensure vLLM is deployed
|
|
291
|
-
ensure_vllm_deployed(model_name, gpu_type="A100", gpu_count=vllm_gpu_count)
|
|
292
|
-
except KeyboardInterrupt:
|
|
293
|
-
hud_console.dim_info("Training cancelled", "")
|
|
294
|
-
return
|
|
295
|
-
except Exception as e:
|
|
296
|
-
hud_console.error(f"Error during model selection: {e}")
|
|
297
|
-
raise
|
|
298
|
-
|
|
299
|
-
# Get final model info
|
|
300
|
-
model_info = rl_api.get_model(model_name)
|
|
301
|
-
|
|
302
|
-
# Step 3: TRAINING CONFIG
|
|
303
|
-
hud_console.section_title("Training Configuration")
|
|
304
|
-
|
|
305
|
-
if not config_file:
|
|
306
|
-
# Ask about number of GPUs with pricing
|
|
307
|
-
# hud_console.info("GPU Selection (Pricing per GPU):")
|
|
308
|
-
|
|
309
|
-
# gpu_table = Table(show_header=True, header_style="bold magenta")
|
|
310
|
-
# gpu_table.add_column("GPU Type", style="cyan")
|
|
311
|
-
# gpu_table.add_column("Memory", style="green")
|
|
312
|
-
# gpu_table.add_column("Price/hr", style="yellow")
|
|
313
|
-
|
|
314
|
-
# for gpu, info in GPU_PRICING.items():
|
|
315
|
-
# gpu_table.add_row(gpu, info["memory"], "see pricing on hud.so")
|
|
316
|
-
|
|
317
|
-
# console.print(gpu_table)
|
|
318
|
-
|
|
319
|
-
if yes:
|
|
320
|
-
gpu_choice = "A100"
|
|
321
|
-
hud_console.info(f"Auto-selecting GPU: {gpu_choice} 80GB (--yes mode)")
|
|
322
|
-
else:
|
|
323
|
-
gpu_choice = hud_console.select(
|
|
324
|
-
"Select GPU type:",
|
|
325
|
-
choices=[
|
|
326
|
-
{"name": "A100 80GB", "value": "A100"},
|
|
327
|
-
{"name": "H100 80GB", "value": "H100"},
|
|
328
|
-
],
|
|
329
|
-
default=0,
|
|
330
|
-
)
|
|
331
|
-
|
|
332
|
-
if yes:
|
|
333
|
-
num_gpus = 2 # Default to 2 GPUs in yes mode
|
|
334
|
-
hud_console.info(f"Auto-selecting {num_gpus} GPU(s) (--yes mode)")
|
|
335
|
-
else:
|
|
336
|
-
num_gpus = hud_console.select(
|
|
337
|
-
"Number of GPUs:",
|
|
338
|
-
choices=[
|
|
339
|
-
{"name": "1 GPU", "value": 1},
|
|
340
|
-
{"name": "2 GPUs", "value": 2},
|
|
341
|
-
{"name": "4 GPUs", "value": 4},
|
|
342
|
-
{"name": "8 GPUs", "value": 8},
|
|
343
|
-
],
|
|
344
|
-
default=1,
|
|
345
|
-
)
|
|
346
|
-
|
|
347
|
-
# Generate config with presets
|
|
348
|
-
hud_console.info("Generating training configuration...")
|
|
349
|
-
gpu_memory_gb = 80.0 if gpu_choice in ["A100", "H100"] else 48.0
|
|
350
|
-
presets = get_training_presets(gpu_memory_gb)
|
|
351
|
-
|
|
352
|
-
config, _ = generate_config_interactive(
|
|
353
|
-
model_name=model_info.base_model,
|
|
354
|
-
presets=presets,
|
|
355
|
-
yes=yes,
|
|
356
|
-
)
|
|
357
|
-
|
|
358
|
-
config = adjust_config_for_ddp(config, int(num_gpus))
|
|
359
|
-
|
|
360
|
-
config.training.gpu_type = gpu_choice
|
|
361
|
-
|
|
362
|
-
# Use a short label for tasks (avoid full absolute paths)
|
|
363
|
-
try:
|
|
364
|
-
if tasks_file and Path(tasks_file).exists():
|
|
365
|
-
tasks_label = Path(tasks_file).name
|
|
366
|
-
else:
|
|
367
|
-
# Fallback: last segment of a non-existent path or dataset name
|
|
368
|
-
tasks_label = str(tasks_file).replace("\\", "/").split("/")[-1]
|
|
369
|
-
except Exception:
|
|
370
|
-
tasks_label = str(tasks_file)
|
|
371
|
-
|
|
372
|
-
config.job_name = f"RL {tasks_label} | {model_name}"
|
|
373
|
-
|
|
374
|
-
# Save config so user can review/edit externally
|
|
375
|
-
temp_config_path = Path(f".rl_config_temp_{model_name}.json")
|
|
376
|
-
save_config(config, temp_config_path)
|
|
377
|
-
|
|
378
|
-
# Interactive review loop: show preview, allow external edits, press Enter to start
|
|
379
|
-
hud_console.info(
|
|
380
|
-
f"Using training configuration from [underline cyan]{temp_config_path.absolute()}[/underline cyan]" # noqa: E501
|
|
381
|
-
)
|
|
382
|
-
|
|
383
|
-
if yes:
|
|
384
|
-
# In yes mode, skip the interactive review loop
|
|
385
|
-
hud_console.info("Auto-accepting config (--yes mode)")
|
|
386
|
-
# Still show the config briefly
|
|
387
|
-
try:
|
|
388
|
-
show_json_interactive(
|
|
389
|
-
config.to_dict() if hasattr(config, "to_dict") else {},
|
|
390
|
-
title="RL Config Preview",
|
|
391
|
-
prompt=False,
|
|
392
|
-
)
|
|
393
|
-
except Exception as e:
|
|
394
|
-
hud_console.warning(f"Interactive viewer failed: {e}")
|
|
395
|
-
else:
|
|
396
|
-
while True:
|
|
397
|
-
# Reload latest config from file each cycle
|
|
398
|
-
try:
|
|
399
|
-
config = load_config(temp_config_path)
|
|
400
|
-
except Exception as e:
|
|
401
|
-
hud_console.warning(f"Failed to load config from disk, using in-memory: {e}")
|
|
402
|
-
|
|
403
|
-
# Preview current config (no extra prompt here; main loop handles start/cancel)
|
|
404
|
-
try:
|
|
405
|
-
show_json_interactive(
|
|
406
|
-
config.to_dict() if hasattr(config, "to_dict") else {},
|
|
407
|
-
title="RL Config Preview",
|
|
408
|
-
prompt=False,
|
|
409
|
-
)
|
|
410
|
-
except Exception as e:
|
|
411
|
-
hud_console.warning(f"Interactive viewer failed: {e}")
|
|
412
|
-
|
|
413
|
-
console.print(
|
|
414
|
-
"\n[dim]Edit the config file above if needed, then save.[/dim]\n"
|
|
415
|
-
"[bold]Press Enter to start training[/bold], or press 'q' to cancel."
|
|
416
|
-
)
|
|
417
|
-
|
|
418
|
-
start_training, cancelled, changed = wait_for_enter_cancel_or_change(
|
|
419
|
-
temp_config_path
|
|
420
|
-
)
|
|
421
|
-
|
|
422
|
-
if cancelled:
|
|
423
|
-
hud_console.error("Training cancelled")
|
|
424
|
-
return
|
|
425
|
-
if start_training:
|
|
426
|
-
break # proceed
|
|
427
|
-
if changed:
|
|
428
|
-
hud_console.info("Detected configuration changes. Reloading preview...")
|
|
429
|
-
|
|
430
|
-
config_dict = config.to_dict()
|
|
431
|
-
else:
|
|
432
|
-
# Load provided config
|
|
433
|
-
hud_console.info(f"Loading configuration from: {config_file}")
|
|
434
|
-
config = load_config(config_file)
|
|
435
|
-
gpu_choice = config.training.gpu_type
|
|
436
|
-
num_gpus = config.training.num_gpus
|
|
437
|
-
|
|
438
|
-
config = adjust_config_for_ddp(config, int(num_gpus))
|
|
439
|
-
config_dict = config.to_dict()
|
|
440
|
-
|
|
441
|
-
# Launch training
|
|
442
|
-
try:
|
|
443
|
-
# Little celebration before launching
|
|
444
|
-
try:
|
|
445
|
-
show_confetti_async(console)
|
|
446
|
-
except Exception:
|
|
447
|
-
hud_console.info("Launching training...")
|
|
448
|
-
|
|
449
|
-
rl_api.launch_training(
|
|
450
|
-
model_name=model_name,
|
|
451
|
-
config=config_dict,
|
|
452
|
-
tasks=resolved_tasks,
|
|
453
|
-
gpu_type=gpu_choice,
|
|
454
|
-
gpu_count=int(num_gpus),
|
|
455
|
-
)
|
|
456
|
-
|
|
457
|
-
hud_console.info(f"Your model {model_name} has started training")
|
|
458
|
-
hud_console.hint("Launch another training run via: hud rl <tasks_file>")
|
|
459
|
-
hud_console.hint("Or evaluate the model via: hud eval <tasks_file>")
|
|
460
|
-
|
|
461
|
-
except Exception as e:
|
|
462
|
-
hud_console.error(f"Failed to launch training: {e}")
|
|
463
|
-
raise
|
hud/cli/rl/rl_api.py
DELETED
|
@@ -1,150 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Direct API functions for HUD RL remote endpoints using shared requests module.
|
|
3
|
-
|
|
4
|
-
This module provides functions for interacting with the HUD RL API server.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
from typing import TYPE_CHECKING, Any
|
|
10
|
-
|
|
11
|
-
from pydantic import BaseModel
|
|
12
|
-
|
|
13
|
-
from hud.settings import settings
|
|
14
|
-
from hud.shared.requests import make_request_sync
|
|
15
|
-
|
|
16
|
-
if TYPE_CHECKING:
|
|
17
|
-
from collections.abc import Iterator
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class RLModelInfo(BaseModel):
|
|
21
|
-
"""Model information from the API."""
|
|
22
|
-
|
|
23
|
-
name: str
|
|
24
|
-
base_model: str
|
|
25
|
-
vllm_url: str | None = None
|
|
26
|
-
trainer_name: str | None = None
|
|
27
|
-
checkpoint_volume: str | None = None
|
|
28
|
-
status: str = "pending" # pending, deploying, ready, training, terminated
|
|
29
|
-
created_at: str | None = None
|
|
30
|
-
updated_at: str | None = None
|
|
31
|
-
terminated_at: str | None = None
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def create_model(name: str, base_model: str) -> dict[str, Any]:
|
|
35
|
-
"""Create a new model."""
|
|
36
|
-
return make_request_sync(
|
|
37
|
-
method="POST",
|
|
38
|
-
url=f"{settings.hud_rl_url}/models",
|
|
39
|
-
json={"name": name, "base_model": base_model},
|
|
40
|
-
api_key=settings.api_key,
|
|
41
|
-
)
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def get_model(name: str) -> RLModelInfo:
|
|
45
|
-
"""Get model information."""
|
|
46
|
-
response = make_request_sync(
|
|
47
|
-
method="GET", url=f"{settings.hud_rl_url}/models/{name}", api_key=settings.api_key
|
|
48
|
-
)
|
|
49
|
-
return RLModelInfo(**response)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def list_models() -> list[RLModelInfo]:
|
|
53
|
-
"""List all models."""
|
|
54
|
-
response = make_request_sync(
|
|
55
|
-
method="GET", url=f"{settings.hud_rl_url}/models", api_key=settings.api_key
|
|
56
|
-
)
|
|
57
|
-
if not isinstance(response, list):
|
|
58
|
-
response = [response]
|
|
59
|
-
return [
|
|
60
|
-
RLModelInfo(**(model if isinstance(model, dict) else model.__dict__)) for model in response
|
|
61
|
-
]
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
def deploy_vllm(model_name: str, gpu_type: str = "A100", gpu_count: int = 1) -> dict[str, Any]:
|
|
65
|
-
"""Deploy a vLLM server for a model."""
|
|
66
|
-
return make_request_sync(
|
|
67
|
-
method="POST",
|
|
68
|
-
url=f"{settings.hud_rl_url}/models/{model_name}/deploy",
|
|
69
|
-
json={"gpu_type": gpu_type, "gpu_count": gpu_count},
|
|
70
|
-
api_key=settings.api_key,
|
|
71
|
-
)
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def stop_vllm(model_name: str) -> dict[str, Any]:
|
|
75
|
-
"""Stop the vLLM server for a model."""
|
|
76
|
-
return make_request_sync(
|
|
77
|
-
method="DELETE",
|
|
78
|
-
url=f"{settings.hud_rl_url}/models/{model_name}/deploy",
|
|
79
|
-
api_key=settings.api_key,
|
|
80
|
-
)
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def stop_training(model_name: str) -> dict[str, Any]:
|
|
84
|
-
"""Stop the training for a model."""
|
|
85
|
-
return make_request_sync(
|
|
86
|
-
method="DELETE",
|
|
87
|
-
url=f"{settings.hud_rl_url}/models/{model_name}/training",
|
|
88
|
-
api_key=settings.api_key,
|
|
89
|
-
)
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def launch_training(
|
|
93
|
-
model_name: str,
|
|
94
|
-
config: dict[str, Any],
|
|
95
|
-
tasks: list[dict[str, Any]],
|
|
96
|
-
gpu_type: str = "A100",
|
|
97
|
-
gpu_count: int = 1,
|
|
98
|
-
) -> dict[str, Any]:
|
|
99
|
-
"""Launch a training run for a model."""
|
|
100
|
-
return make_request_sync(
|
|
101
|
-
method="POST",
|
|
102
|
-
url=f"{settings.hud_rl_url}/models/{model_name}/training/launch",
|
|
103
|
-
json={"config": config, "tasks": tasks, "gpu_type": gpu_type, "gpu_count": gpu_count},
|
|
104
|
-
api_key=settings.api_key,
|
|
105
|
-
)
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def get_training_status(model_name: str) -> dict[str, Any]:
|
|
109
|
-
"""Get the status of a training run."""
|
|
110
|
-
return make_request_sync(
|
|
111
|
-
method="GET",
|
|
112
|
-
url=f"{settings.hud_rl_url}/models/{model_name}/training/status",
|
|
113
|
-
api_key=settings.api_key,
|
|
114
|
-
)
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def get_training_logs(model_name: str, lines: int = 100, follow: bool = False) -> Iterator[str]:
|
|
118
|
-
"""Get training logs for a model.
|
|
119
|
-
|
|
120
|
-
Args:
|
|
121
|
-
model_name: Name of the model
|
|
122
|
-
lines: Number of lines to return
|
|
123
|
-
follow: If True, stream logs as they arrive
|
|
124
|
-
|
|
125
|
-
Yields:
|
|
126
|
-
Log lines as strings
|
|
127
|
-
"""
|
|
128
|
-
# For streaming logs, we need to use httpx directly
|
|
129
|
-
# as the shared requests module expects JSON responses
|
|
130
|
-
import httpx
|
|
131
|
-
|
|
132
|
-
params = {"lines": lines}
|
|
133
|
-
if follow:
|
|
134
|
-
params["follow"] = True
|
|
135
|
-
|
|
136
|
-
headers = {"Authorization": f"Bearer {settings.api_key}"}
|
|
137
|
-
|
|
138
|
-
with (
|
|
139
|
-
httpx.Client(timeout=300.0) as client,
|
|
140
|
-
client.stream(
|
|
141
|
-
"GET",
|
|
142
|
-
f"{settings.hud_rl_url}/models/{model_name}/training/logs",
|
|
143
|
-
params=params,
|
|
144
|
-
headers=headers,
|
|
145
|
-
) as response,
|
|
146
|
-
):
|
|
147
|
-
response.raise_for_status()
|
|
148
|
-
for line in response.iter_lines():
|
|
149
|
-
if line:
|
|
150
|
-
yield line
|