hud-python 0.4.27__py3-none-any.whl → 0.4.29__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (76)
  1. hud/__init__.py +2 -1
  2. hud/agents/base.py +73 -45
  3. hud/agents/claude.py +8 -4
  4. hud/agents/openai_chat_generic.py +65 -40
  5. hud/agents/tests/test_base.py +0 -4
  6. hud/agents/tests/test_openai.py +1 -1
  7. hud/cli/__init__.py +182 -52
  8. hud/cli/dev.py +8 -9
  9. hud/cli/eval.py +317 -119
  10. hud/cli/flows/__init__.py +0 -0
  11. hud/cli/flows/tasks.py +0 -0
  12. hud/cli/get.py +160 -0
  13. hud/cli/rl/__init__.py +563 -71
  14. hud/cli/rl/config.py +94 -0
  15. hud/cli/rl/display.py +133 -0
  16. hud/cli/rl/gpu.py +63 -0
  17. hud/cli/rl/gpu_utils.py +318 -0
  18. hud/cli/rl/presets.py +96 -0
  19. hud/cli/rl/remote_runner.py +348 -0
  20. hud/cli/rl/rl_api.py +150 -0
  21. hud/cli/rl/vllm.py +177 -0
  22. hud/cli/tests/test_analyze_metadata.py +0 -1
  23. hud/cli/utils/tasks.py +26 -0
  24. hud/clients/base.py +21 -23
  25. hud/clients/mcp_use.py +36 -44
  26. hud/clients/tests/test_mcp_use_retry.py +10 -10
  27. hud/datasets/__init__.py +4 -3
  28. hud/datasets/{execution/parallel.py → parallel.py} +1 -1
  29. hud/datasets/{execution/runner.py → runner.py} +1 -1
  30. hud/datasets/utils.py +1 -1
  31. hud/native/tests/test_native_init.py +1 -1
  32. hud/otel/config.py +1 -1
  33. hud/otel/instrumentation.py +35 -0
  34. hud/rl/README.md +31 -0
  35. hud/rl/__init__.py +1 -0
  36. hud/rl/actor.py +174 -0
  37. hud/rl/buffer.py +371 -0
  38. hud/rl/chat_template.jinja +101 -0
  39. hud/rl/config.py +184 -0
  40. hud/rl/distributed.py +95 -0
  41. hud/rl/learner.py +586 -0
  42. hud/rl/tests/__init__.py +1 -0
  43. hud/rl/tests/test_learner.py +171 -0
  44. hud/rl/train.py +354 -0
  45. hud/rl/types.py +101 -0
  46. hud/rl/utils/start_vllm_server.sh +30 -0
  47. hud/rl/utils.py +524 -0
  48. hud/rl/vllm_adapter.py +125 -0
  49. hud/settings.py +6 -0
  50. hud/telemetry/__init__.py +2 -1
  51. hud/telemetry/job.py +46 -3
  52. hud/telemetry/tests/test_trace.py +3 -3
  53. hud/telemetry/trace.py +85 -13
  54. hud/tools/computer/hud.py +4 -4
  55. hud/tools/tests/test_computer.py +3 -3
  56. hud/tools/tests/test_computer_actions.py +1 -1
  57. hud/types.py +123 -2
  58. hud/utils/group_eval.py +223 -0
  59. hud/utils/hud_console.py +113 -13
  60. hud/utils/tasks.py +119 -0
  61. hud/utils/tests/test_version.py +1 -1
  62. hud/version.py +1 -1
  63. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/METADATA +20 -2
  64. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/RECORD +67 -47
  65. hud/cli/hf.py +0 -406
  66. hud/cli/rl/README.md +0 -243
  67. hud/cli/rl/init.py +0 -370
  68. hud/cli/rl/pod.py +0 -501
  69. hud/cli/rl/ssh.py +0 -322
  70. hud/cli/rl/train.py +0 -562
  71. hud/cli/rl/utils.py +0 -165
  72. hud/datasets/execution/__init__.py +0 -13
  73. hud/datasets/task.py +0 -116
  74. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/WHEEL +0 -0
  75. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/entry_points.txt +0 -0
  76. {hud_python-0.4.27.dist-info → hud_python-0.4.29.dist-info}/licenses/LICENSE +0 -0
hud/cli/rl/presets.py ADDED
@@ -0,0 +1,96 @@
+ """Training configuration presets for different GPU configurations."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+
+ def get_training_presets(gpu_memory_gb: float) -> list[dict[str, Any]]:
+     """Get training configuration presets based on GPU memory."""
+     # Time estimates based on provided benchmarks
+     if gpu_memory_gb >= 40:  # A100 40GB or better
+         presets = [
+             {
+                 "name": "More Steps",
+                 "max_steps_per_episode": 12,
+                 "mini_batch_size": 1,
+                 "group_size": 4,
+                 "batch_size": 8,
+                 "max_new_tokens": 256,
+                 "tasks_per_hour": 847,
+                 "steps_per_hour": 424,
+                 "lr": 3e-5,
+                 "epochs": 2,
+             },
+             {
+                 "name": "Balanced (Recommended)",
+                 "max_steps_per_episode": 5,
+                 "mini_batch_size": 1,
+                 "group_size": 6,
+                 "batch_size": 12,
+                 "max_new_tokens": 1024,
+                 "tasks_per_hour": 738,
+                 "steps_per_hour": 415,
+                 "lr": 3e-5,
+                 "epochs": 2,
+             },
+             {
+                 "name": "Low Variance",
+                 "max_steps_per_episode": 3,
+                 "mini_batch_size": 2,
+                 "group_size": 8,
+                 "batch_size": 16,
+                 "max_new_tokens": 512,
+                 "tasks_per_hour": 900,
+                 "steps_per_hour": 450,
+                 "lr": 3e-5,
+                 "epochs": 2,
+             },
+         ]
+     elif gpu_memory_gb >= 24:  # RTX 4090, A10, etc
+         presets = [
+             {
+                 "name": "Balanced (Recommended)",
+                 "max_steps_per_episode": 4,
+                 "mini_batch_size": 1,
+                 "group_size": 4,
+                 "batch_size": 16,
+                 "lr": 1e-4,
+                 "epochs": 2,
+             },
+             {
+                 "name": "Low Variance",
+                 "max_steps_per_episode": 3,
+                 "mini_batch_size": 2,
+                 "group_size": 4,
+                 "batch_size": 16,
+                 "lr": 5e-5,
+                 "epochs": 2,
+             },
+         ]
+     else:  # Smaller GPUs
+         presets = [
+             {
+                 "name": "Test",
+                 "max_steps_per_episode": 5,
+                 "mini_batch_size": 1,
+                 "group_size": 4,
+                 "batch_size": 8,
+                 "lr": 1e-4,
+                 "epochs": 1,
+             },
+         ]
+
+     return presets
+
+
+ def estimate_memory_usage(
+     mini_batch_size: int, max_steps: int, max_new_tokens: int, max_pixels: int
+ ) -> float:
+     """Calculate estimated GPU memory usage using the formula from train.py."""
+     INITIAL_MEMORY = 8.0
+     SCALING_FACTOR = 4 / (28 * 28 * 256 * 1024)
+     token_estimate = mini_batch_size * max_steps * max_new_tokens
+     image_estimate = max_pixels
+     total_memory = INITIAL_MEMORY + SCALING_FACTOR * token_estimate * image_estimate
+     return total_memory
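
As a quick sanity check of the two helpers above (a minimal sketch; the max_pixels value is an illustrative assumption, not a value taken from train.py), the "Balanced (Recommended)" A100 preset works out to exactly 28 GB:

from hud.cli.rl.presets import estimate_memory_usage, get_training_presets

presets = get_training_presets(gpu_memory_gb=80.0)  # A100/H100 tier
balanced = presets[1]  # "Balanced (Recommended)"

# 8.0 GB baseline + 4/(28*28*256*1024) * (1*5*1024 tokens) * (28*28*256 pixels) = 28.0 GB
mem_gb = estimate_memory_usage(
    mini_batch_size=balanced["mini_batch_size"],  # 1
    max_steps=balanced["max_steps_per_episode"],  # 5
    max_new_tokens=balanced["max_new_tokens"],    # 1024
    max_pixels=28 * 28 * 256,                     # assumed image budget, for illustration
)
print(f"estimated usage: {mem_gb:.1f} GB")  # -> 28.0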
hud/cli/rl/remote_runner.py ADDED
@@ -0,0 +1,348 @@
+ """
+ Remote runner for HUD RL training via API server.
+
+ This module implements the new interactive flow for RL training.
+ """
+
+ from __future__ import annotations
+
+ import os
+ import subprocess
+ import time
+ from pathlib import Path
+
+ from rich.console import Console
+
+ from hud.utils.hud_console import hud_console
+ from hud.utils.tasks import load_tasks
+
+ from . import rl_api
+ from .config import generate_config_interactive, load_config, save_config
+ from .presets import get_training_presets
+
+ console = Console()
+
+ # GPU pricing information
+ GPU_PRICING = {
+     "A100": {"price": "1", "memory": "80GB"},
+     "H100": {"price": "2", "memory": "80GB"},
+ }
+
+
+ def run_remote_training(
+     tasks_file: str | None,
+     model: str | None,
+     config_file: Path | None,
+     output_dir: str,
+ ) -> None:
+     """Run RL training remotely via the API server following the new interactive flow."""
+     from hud.settings import settings
+
+     if not settings.api_key:
+         hud_console.error("API key not found")
+         console.print("[yellow]Please set HUD_API_KEY environment variable[/yellow]")
+         raise ValueError("API key not found")
+
+     # Step 1: CONFIRMATION - Load tasks and show example
+     if tasks_file:
+         tasks = load_tasks(tasks_file)
+     else:
+         raise ValueError("Tasks file not found")
+
+     # Show example task for confirmation
+     hud_console.section_title("Example Task from Dataset")
+
+     if tasks:
+         # Display task with truncated values
+         task_data = tasks[0].model_dump()
+         truncated_data = {}
+         max_value_length = 120  # Maximum characters to show per line
+
+         for key, value in task_data.items():
+             value_str = str(value)
+             if len(value_str) > max_value_length:
+                 truncated_data[key] = value_str[:max_value_length] + "..."
+             else:
+                 truncated_data[key] = value_str
+
+         hud_console.key_value_table(truncated_data)
+
+     if not hud_console.confirm("Proceed with training on this dataset?", default=True):
+         hud_console.error("Training cancelled")
+         return
+
+     # Step 2: MODEL SELECTION
+     hud_console.section_title("Model Selection")
+
+     # Fetch existing models
+     hud_console.info("Fetching your models from https://app.hud.so/models")
+
+     try:
+         models = rl_api.list_models()
+         # Filter for active/training models and sort by recency
+         active_models = [m for m in models if m.status in ["ready", "training"]]
+         active_models.sort(key=lambda m: m.created_at or "", reverse=True)
+
+         if active_models or model is None:
+             # Build choices
+             choices = []
+             for m in active_models:
+                 status_emoji = {
+                     "ready": "✅",
+                     "training": "🔄",
+                     "deploying": "🚀",
+                     "pending": "⏳",
+                 }.get(m.status, "❓")
+
+                 choices.append({"name": f"{status_emoji} {m.name} ({m.status})", "value": m.name})
+
+             choices.append({"name": "Create new model", "value": "__new__"})
+
+             if not model:
+                 if choices:
+                     selected = hud_console.select("Select a model:", choices=choices)
+                 else:
+                     selected = "__new__"
+                     hud_console.hint("No existing models found. Creating new model...")
+             else:
+                 # Model was provided via CLI
+                 selected = model
+
+         else:
+             selected = "__new__"
+
+         # Handle model selection
+         if selected == "__new__":
+             # Create new model flow
+             hud_console.info("Creating new model...")
+
+             # Ask for model type
+             model_type = hud_console.select(
+                 "Select base model type:",
+                 choices=[
+                     {"name": "Qwen2.5-VL-3B-Instruct", "value": "Qwen/Qwen2.5-VL-3B-Instruct"},
+                     # {"name": "Qwen2.5-VL-7B-Instruct", "value": "Qwen/Qwen2.5-VL-7B-Instruct"},
+                 ],
+                 default=0,
+             )
+             from rich.prompt import Prompt
+
+             # Ask for model name
+             default_name = model_type.split("/")[-1].lower()
+             hud_console.info(f"Enter model name (default: {default_name}):")
+             model_name = Prompt.ask("Model name", default=default_name)
+             model_name = model_name.replace("/", "-").lower()
+
+             # Create the model
+             hud_console.info(f"Creating model: {model_name}")
+             try:
+                 rl_api.create_model(model_name, model_type)
+                 hud_console.success(f"Created model: {model_name}")
+
+                 # Deploy vLLM automatically
+                 hud_console.info(f"Deploying vLLM server for {model_name}...")
+                 rl_api.deploy_vllm(model_name, gpu_type="A100")
+                 hud_console.success("vLLM deployment started")
+
+                 # Wait for deployment
+                 hud_console.info("Waiting for vLLM server to be ready...")
+                 max_wait = 600  # 10 minutes
+                 start_time = time.time()
+
+                 with hud_console.progress() as progress:
+                     progress.update(
+                         "Checking deployment status (see live status on https://app.hud.so/models)"
+                     )
+
+                     while True:
+                         if time.time() - start_time > max_wait:
+                             hud_console.error("Timeout waiting for vLLM deployment")
+                             raise ValueError("vLLM deployment timeout")
+
+                         model_info = rl_api.get_model(model_name)
+                         if model_info.status == "ready":
+                             hud_console.success(
+                                 f"vLLM server ready at http://rl.hud.so/v1/models/{model_name}/vllm"
+                             )
+                             break
+
+                         time.sleep(5)
+
+             except Exception as e:
+                 hud_console.error(f"Failed to create model: {e}")
+                 raise
+
+         else:
+             # Existing model selected
+             model_name = selected
+             model_info = rl_api.get_model(model_name)
+
+             # Check if model is in training
+             if model_info.status == "training":
+                 if hud_console.confirm(
+                     f"{model_name} is currently training. Stop current training?", default=False
+                 ):
+                     hud_console.info(f"Stopping training for {model_name}...")
+                     try:
+                         rl_api.stop_training(model_name)
+                         hud_console.success("Training stopped")
+                     except Exception as e:
+                         hud_console.error(f"Failed to stop training: {e}")
+                         raise
+                 else:
+                     hud_console.error("Cannot start new training while model is already training")
+                     return
+
+             # Ensure vLLM is deployed
+             if not model_info.vllm_url:
+                 hud_console.info(f"Deploying vLLM server for {model_name}...")
+                 rl_api.deploy_vllm(model_name, gpu_type="A100")
+                 hud_console.success("vLLM deployment started")
+
+                 # Wait for deployment
+                 hud_console.info("Waiting for vLLM server to be ready...")
+                 max_wait = 600  # 10 minutes
+                 start_time = time.time()
+
+                 with hud_console.progress() as progress:
+                     progress.update(
+                         "Checking deployment status (see live status on https://app.hud.so/models)"
+                     )
+
+                     while True:
+                         if time.time() - start_time > max_wait:
+                             hud_console.error("Timeout waiting for vLLM deployment")
+                             raise ValueError("vLLM deployment timeout")
+
+                         model_info = rl_api.get_model(model_name)
+                         if model_info.vllm_url:
+                             hud_console.success(
+                                 f"vLLM server ready at http://rl.hud.so/v1/models/{model_name}/vllm"
+                             )
+                             break
+
+                         time.sleep(5)
+             else:
+                 hud_console.success("vLLM server already running")
+     except KeyboardInterrupt:
+         hud_console.dim_info("Training cancelled", "")
+         return
+     except Exception as e:
+         hud_console.error(f"Error during model selection: {e}")
+         raise
+
+     # Get final model info
+     model_info = rl_api.get_model(model_name)
+
+     # Step 3: TRAINING CONFIG
+     hud_console.section_title("Training Configuration")
+
+     if not config_file:
+         # Ask about number of GPUs with pricing
+         # hud_console.info("GPU Selection (Pricing per GPU):")
+
+         # gpu_table = Table(show_header=True, header_style="bold magenta")
+         # gpu_table.add_column("GPU Type", style="cyan")
+         # gpu_table.add_column("Memory", style="green")
+         # gpu_table.add_column("Price/hr", style="yellow")
+
+         # for gpu, info in GPU_PRICING.items():
+         # gpu_table.add_row(gpu, info["memory"], "see pricing on hud.so")
+
+         # console.print(gpu_table)
+
+         gpu_choice = hud_console.select(
+             "Select GPU type:",
+             choices=[
+                 {"name": "A100 80GB", "value": "A100"},
+                 {"name": "H100 80GB", "value": "H100"},
+             ],
+             default=0,
+         )
+
+         num_gpus = hud_console.select(
+             "Number of GPUs:",
+             choices=[
+                 {"name": "1 GPU", "value": 1},
+                 {"name": "2 GPUs", "value": 2},
+                 {"name": "4 GPUs", "value": 4},
+                 {"name": "8 GPUs", "value": 8},
+             ],
+             default=1,
+         )
+
+         # Generate config with presets
+         hud_console.info("Generating training configuration...")
+         gpu_memory_gb = 80.0 if gpu_choice in ["A100", "H100"] else 48.0
+         presets = get_training_presets(gpu_memory_gb)
+
+         config, _ = generate_config_interactive(
+             model_name=model_info.base_model,
+             presets=presets,
+         )
+
+         config.job_name = f"RL {model_name} on {tasks_file}"
+
+         # Save config for editing
+         temp_config_path = Path(f".rl_config_temp_{model_name}.json")
+         save_config(config, temp_config_path)
+
+         # Ask to edit config
+         hud_console.info(
+             f"Using training configuration from [underline cyan]{temp_config_path.absolute()}[/underline cyan]"  # noqa: E501
+         )
+         edit_choice = hud_console.select(
+             "Would you like to start training?",
+             choices=[
+                 {"name": "🚀 Start training!", "value": "start"},
+                 {"name": "✏️ Review configuration", "value": "edit"},
+                 {"name": "❌ Cancel", "value": "cancel"},
+             ],
+             default=0,
+         )
+
+         if edit_choice == "cancel":
+             hud_console.error("Training cancelled")
+             return
+         elif edit_choice == "edit":
+             # Open editor
+             editor = os.environ.get("EDITOR", "nano")
+             hud_console.info(f"Opening {editor} to edit configuration...")
+
+             try:
+                 subprocess.run([editor, str(temp_config_path)], check=True)  # noqa: S603
+                 # Reload config
+                 config = load_config(temp_config_path)
+                 hud_console.success("Configuration updated")
+             except Exception as e:
+                 hud_console.error(f"Failed to edit config: {e}")
+                 return
+
+         config_dict = config.to_dict()
+     else:
+         # Load provided config
+         hud_console.info(f"Loading configuration from: {config_file}")
+         config = load_config(config_file)
+         config_dict = config.to_dict()
+         gpu_choice = "A100"  # Default
+         num_gpus = 1  # Default for non-interactive mode
+
+     # Launch training
+     try:
+         rl_api.launch_training(
+             model_name=model_name,
+             config=config_dict,
+             tasks=[task.model_dump() for task in tasks],
+             gpu_type=gpu_choice,
+             gpu_count=int(num_gpus),
+         )
+
+         hud_console.success("Training Started Successfully!")
+
+         hud_console.info(f"See your model {model_name} training on https://app.hud.so/models")
+         hud_console.hint("Launch another training run via: hud rl <tasks_file>")
+         hud_console.hint("Or evaluate the model via: hud eval <tasks_file>")
+
+     except Exception as e:
+         hud_console.error(f"Failed to launch training: {e}")
+         raise
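
For context, this is the entry point the `hud rl` command drives. A minimal invocation sketch (file names and model name below are placeholders; per the non-interactive branch above, passing config_file skips the GPU prompts and defaults to a single A100):

from pathlib import Path

from hud.cli.rl.remote_runner import run_remote_training

# Placeholder paths/names for illustration; requires HUD_API_KEY to be set.
run_remote_training(
    tasks_file="tasks.jsonl",
    model="qwen2.5-vl-3b-instruct",
    config_file=Path("rl_config.json"),
    output_dir="checkpoints",
)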
hud/cli/rl/rl_api.py ADDED
@@ -0,0 +1,150 @@
+ """
+ Direct API functions for HUD RL remote endpoints using shared requests module.
+
+ This module provides functions for interacting with the HUD RL API server.
+ """
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any
+
+ from pydantic import BaseModel
+
+ from hud.settings import settings
+ from hud.shared.requests import make_request_sync
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterator
+
+
+ class RLModelInfo(BaseModel):
+     """Model information from the API."""
+
+     name: str
+     base_model: str
+     vllm_url: str | None = None
+     trainer_name: str | None = None
+     checkpoint_volume: str | None = None
+     status: str = "pending"  # pending, deploying, ready, training, terminated
+     created_at: str | None = None
+     updated_at: str | None = None
+     terminated_at: str | None = None
+
+
+ def create_model(name: str, base_model: str) -> dict[str, Any]:
+     """Create a new model."""
+     return make_request_sync(
+         method="POST",
+         url=f"{settings.hud_rl_url}/models",
+         json={"name": name, "base_model": base_model},
+         api_key=settings.api_key,
+     )
+
+
+ def get_model(name: str) -> RLModelInfo:
+     """Get model information."""
+     response = make_request_sync(
+         method="GET", url=f"{settings.hud_rl_url}/models/{name}", api_key=settings.api_key
+     )
+     return RLModelInfo(**response)
+
+
+ def list_models() -> list[RLModelInfo]:
+     """List all models."""
+     response = make_request_sync(
+         method="GET", url=f"{settings.hud_rl_url}/models", api_key=settings.api_key
+     )
+     if not isinstance(response, list):
+         response = [response]
+     return [
+         RLModelInfo(**(model if isinstance(model, dict) else model.__dict__)) for model in response
+     ]
+
+
+ def deploy_vllm(model_name: str, gpu_type: str = "A100") -> dict[str, Any]:
+     """Deploy a vLLM server for a model."""
+     return make_request_sync(
+         method="POST",
+         url=f"{settings.hud_rl_url}/models/{model_name}/deploy",
+         json={"gpu_type": gpu_type},
+         api_key=settings.api_key,
+     )
+
+
+ def stop_vllm(model_name: str) -> dict[str, Any]:
+     """Stop the vLLM server for a model."""
+     return make_request_sync(
+         method="DELETE",
+         url=f"{settings.hud_rl_url}/models/{model_name}/deploy",
+         api_key=settings.api_key,
+     )
+
+
+ def stop_training(model_name: str) -> dict[str, Any]:
+     """Stop the training for a model."""
+     return make_request_sync(
+         method="DELETE",
+         url=f"{settings.hud_rl_url}/models/{model_name}/training",
+         api_key=settings.api_key,
+     )
+
+
+ def launch_training(
+     model_name: str,
+     config: dict[str, Any],
+     tasks: list[dict[str, Any]],
+     gpu_type: str = "A100",
+     gpu_count: int = 1,
+ ) -> dict[str, Any]:
+     """Launch a training run for a model."""
+     return make_request_sync(
+         method="POST",
+         url=f"{settings.hud_rl_url}/models/{model_name}/training/launch",
+         json={"config": config, "tasks": tasks, "gpu_type": gpu_type, "gpu_count": gpu_count},
+         api_key=settings.api_key,
+     )
+
+
+ def get_training_status(model_name: str) -> dict[str, Any]:
+     """Get the status of a training run."""
+     return make_request_sync(
+         method="GET",
+         url=f"{settings.hud_rl_url}/models/{model_name}/training/status",
+         api_key=settings.api_key,
+     )
+
+
+ def get_training_logs(model_name: str, lines: int = 100, follow: bool = False) -> Iterator[str]:
+     """Get training logs for a model.
+
+     Args:
+         model_name: Name of the model
+         lines: Number of lines to return
+         follow: If True, stream logs as they arrive
+
+     Yields:
+         Log lines as strings
+     """
+     # For streaming logs, we need to use httpx directly
+     # as the shared requests module expects JSON responses
+     import httpx
+
+     params = {"lines": lines}
+     if follow:
+         params["follow"] = True
+
+     headers = {"Authorization": f"Bearer {settings.api_key}"}
+
+     with (
+         httpx.Client(timeout=300.0) as client,
+         client.stream(
+             "GET",
+             f"{settings.hud_rl_url}/models/{model_name}/training/logs",
+             params=params,
+             headers=headers,
+         ) as response,
+     ):
+         response.raise_for_status()
+         for line in response.iter_lines():
+             if line:
+                 yield line
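
Taken together, these functions mirror the sequence remote_runner.py drives. A condensed sketch (model name, config values, and the task dict are illustrative placeholders, not values from this release):

from hud.cli.rl import rl_api

# Create a model and deploy its vLLM server (placeholder names)
rl_api.create_model("my-model", "Qwen/Qwen2.5-VL-3B-Instruct")
rl_api.deploy_vllm("my-model", gpu_type="A100")

# Once the server reports ready, launch a training run
if rl_api.get_model("my-model").status == "ready":
    rl_api.launch_training(
        model_name="my-model",
        config={"lr": 3e-5, "epochs": 2},  # illustrative subset of a full config dict
        tasks=[{"prompt": "..."}],         # task dicts, e.g. from load_tasks(...) via model_dump()
        gpu_type="A100",
        gpu_count=1,
    )

# Stream logs as training runs
for line in rl_api.get_training_logs("my-model", follow=True):
    print(line)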