lyceum-cli 1.0.27__py3-none-any.whl → 1.0.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1023 @@
+ """GPU selection execution commands"""
+
+ import json
+ import time
+
+ import httpx
+ import typer
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.table import Table
+
+ from ....shared.config import config
+ from ....shared.streaming import StatusLine
+ from .python import (
+     inject_script_args,
+     load_workspace_config,
+     read_code_from_source,
+     resolve_import_files,
+     resolve_requirements,
+ )
+
+ console = Console()
+
+ gpu_selection_app = typer.Typer(name="gpu-selection", help="GPU selection and profiling commands")
+
+ POLL_INTERVAL = 2.0
+ MAX_POLL_TIME = 3600  # 1 hour - A100/H100 initialization can take up to 30 min
+
+ # Cache for GPU pricing
+ _pricing_cache: dict[str, float] | None = None
+
+ # Mapping from API profile names to display names
+ GPU_DISPLAY_NAMES = {
+     "gpu": "T4",
+     "gpu.t4": "T4",
+     "gpu.t4.64gb": "T4",
+     "gpu.a100": "A100",
+     "gpu.a100.40gb": "A100 (40GB)",
+     "gpu.a100.80gb": "A100 (80GB)",
+     "gpu.h100": "H100",
+     "gpu.h200": "H200",
+     "gpu.l40s": "L40S",
+     "gpu.b200": "B200",
+     "gpu.rtx6000pro": "RTX 6000 Pro",
+ }
+
+
+ def format_gpu_name(profile: str) -> str:
+     """Format GPU profile name for display."""
+     if profile in GPU_DISPLAY_NAMES:
+         return GPU_DISPLAY_NAMES[profile]
+     # Fallback: strip "gpu." prefix and uppercase
+     return profile.replace("gpu.", "").upper()
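+ # Illustrative lookups (the second profile name is hypothetical, chosen to
+ # exercise the fallback path):
+ #   format_gpu_name("gpu.a100.40gb") -> "A100 (40GB)"   (table hit)
+ #   format_gpu_name("gpu.xyz123")    -> "XYZ123"        (strip prefix, uppercase)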
+
+
+ def fetch_gpu_pricing() -> dict[str, float]:
+     """Fetch GPU pricing from API. Returns dict of hardware_profile -> price_per_hour."""
+     global _pricing_cache
+     if _pricing_cache is not None:
+         return _pricing_cache
+
+     try:
+         response = httpx.get(
+             f"{config.base_url}/api/v2/external/compute/machine-types",
+             headers={"Authorization": f"Bearer {config.api_key}"},
+             timeout=10.0,
+         )
+         if response.status_code == 200:
+             data = response.json()
+             _pricing_cache = {
+                 m["hardware_profile"]: m.get("price_per_hour", 0) or 0
+                 for m in data.get("machine_types", [])
+             }
+             return _pricing_cache
+     except Exception:
+         pass
+     return {}
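+ # Response shape assumed by the comprehension above (values illustrative):
+ #   {"machine_types": [{"hardware_profile": "gpu.t4", "price_per_hour": 0.40}, ...]}
+ # On any failure (non-200, timeout, network error) the function degrades to an
+ # empty dict, so cost columns simply render "-" instead of a price.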
+
+
+ def calculate_cost(execution_time_s: float, hardware_profile: str, pricing: dict[str, float]) -> float | None:
+     """Calculate cost based on execution time and GPU pricing."""
+     price_per_hour = pricing.get(hardware_profile)
+     if price_per_hour is None or price_per_hour == 0:
+         return None
+     return execution_time_s * (price_per_hour / 3600)
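+ # Worked example with made-up numbers: a 120 s profiling run on a profile
+ # priced at $2.50/hour costs 120 * (2.50 / 3600) ≈ $0.0833.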
+
+
+ def submit_gpu_selection(payload: dict, status: StatusLine | None = None) -> str:
+     """Submit GPU selection request to API and return the execution_id."""
+     if status:
+         status.update("Submitting GPU selection job...")
+
+     response = httpx.post(
+         f"{config.base_url}/api/v2/external/execution/gpu_selection/start",
+         headers={"Authorization": f"Bearer {config.api_key}"},
+         json=payload,
+         timeout=30.0,
+     )
+
+     if response.status_code != 200:
+         if status:
+             status.stop()
+         console.print(f"[red]Error: HTTP {response.status_code}[/red]")
+         if response.status_code == 401:
+             console.print("[red]Authentication failed. Your session may have expired.[/red]")
+             console.print("[yellow]Run 'lyceum auth login' to re-authenticate.[/yellow]")
+         elif response.status_code == 402:
+             console.print("[red]Insufficient credits. Please purchase more credits to continue.[/red]")
+         elif response.status_code == 403:
+             console.print("[red]You do not have access to GPU instances.[/red]")
+         else:
+             console.print(f"[red]{response.content.decode()}[/red]")
+         raise typer.Exit(1)
+
+     data = response.json()
+     return data["execution_id"]
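+ # A minimal request payload, mirroring what the `run` command below builds
+ # (keys are real, values are placeholders):
+ #   {"code": "<script source>", "nbcode": 0, "timeout": 60}
+ # Optional keys: "file_name", "requirements_content", "prior_imports",
+ # "import_files".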
+
+
+ def poll_gpu_selection(execution_id: str, status: StatusLine | None = None) -> dict:
+     """Poll GPU selection status until terminal state."""
+     elapsed = 0.0
+
+     while elapsed < MAX_POLL_TIME:
+         try:
+             response = httpx.get(
+                 f"{config.base_url}/api/v2/external/execution/gpu_selection/{execution_id}/status",
+                 headers={"Authorization": f"Bearer {config.api_key}"},
+                 timeout=10.0,
+             )
+
+             if response.status_code != 200:
+                 if status:
+                     status.update(f"Waiting for results (status check returned {response.status_code})...")
+                 time.sleep(POLL_INTERVAL)
+                 elapsed += POLL_INTERVAL
+                 continue
+
+             data = response.json()
+             current_status = data.get("status", "unknown")
+
+             if current_status in ("completed", "failed", "aborted", "system_failure"):
+                 return data
+
+             if status:
+                 status.update(f"Status: {current_status}...")
+
+         except httpx.RequestError:
+             if status:
+                 status.update("Reconnecting...")
+
+         time.sleep(POLL_INTERVAL)
+         elapsed += POLL_INTERVAL
+
+     if status:
+         status.stop()
+     console.print("[yellow]Timed out waiting for GPU selection results.[/yellow]")
+     console.print(f"[dim]Check later: lyceum predict status {execution_id}[/dim]")
+     raise typer.Exit(1)
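+ # Polling budget: with POLL_INTERVAL = 2.0 and MAX_POLL_TIME = 3600, the loop
+ # above makes at most 3600 / 2 = 1800 status checks (roughly one hour, not
+ # counting request latency) before giving up.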
+
+
+ ERROR_SUGGESTIONS = {
+     "No PyTorch or Hugging Face ecosystem detected": [
+         "Add [cyan]import torch[/cyan] and use PyTorch modules",
+         "Or use HuggingFace [cyan]transformers[/cyan] library",
+     ],
+     "GPU requirement cannot be determined or is CPU-only": [
+         "Move model to GPU: [cyan]model.to('cuda')[/cyan]",
+         "Move tensors to GPU: [cyan]tensor.to('cuda')[/cyan]",
+         "Or use [cyan]device = torch.device('cuda')[/cyan]",
+     ],
+     "No model found": [
+         "Define a class that inherits from [cyan]nn.Module[/cyan]",
+         "Or use a pretrained model from [cyan]transformers[/cyan]",
+     ],
+     "No training loop detected": [
+         "Add a training loop with [cyan]loss.backward()[/cyan]",
+         "And [cyan]optimizer.step()[/cyan]",
+     ],
+ }
+
+
+ def get_suggestions(error: str) -> list[str]:
+     """Get suggestions for a given error message."""
+     for key, suggestions in ERROR_SUGGESTIONS.items():
+         if key.lower() in error.lower():
+             return suggestions
+     return []
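+ # Matching is a case-insensitive substring test, so for a hypothetical
+ # message, get_suggestions("ERROR: No model found in script") returns the
+ # "No model found" suggestions above.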
+
+
+ def display_results(data: dict, file_path: str | None = None) -> None:
+     """Display GPU selection results."""
+     if data is None:
+         console.print()
+         console.print(Panel(
+             "[red]✗[/red] No response data received",
+             title="[red]GPU Selection Failed[/red]",
+             border_style="red",
+             padding=(1, 2),
+         ))
+         return
+
+     status = data.get("status", "unknown")
+
+     # Parse metadata if it's a string
+     metadata = data.get("metadata")
+     if isinstance(metadata, str):
+         try:
+             metadata = json.loads(metadata)
+         except (json.JSONDecodeError, TypeError):
+             metadata = {}
+     metadata = metadata or {}
+
+     if status != "completed":
+         errors = data.get("system_errors") or []
+
+         # Build error content
+         error_lines = []
+         all_suggestions = []
+
+         for err in errors:
+             error_lines.append(f"[red]✗[/red] {err}")
+             all_suggestions.extend(get_suggestions(err))
+
+         if not error_lines:
+             error_lines.append(f"[red]✗[/red] Status: {status}")
+
+         error_content = "\n".join(error_lines)
+
+         # Add suggestions if available
+         if all_suggestions:
+             error_content += "\n\n[dim]Suggestions:[/dim]"
+             for suggestion in all_suggestions:
+                 error_content += f"\n → {suggestion}"
+
+         console.print()
+         console.print(Panel(
+             error_content,
+             title="[red]GPU Selection Failed[/red]",
+             border_style="red",
+             padding=(1, 2),
+         ))
+         return
+
+     profiling = metadata.get("profiling_results", [])
+     extraction = metadata.get("extraction_result", {})
+
+     # Get memory info for summary
+     mem_config = extraction.get("memory_config", {})
+     minimal_configs = mem_config.get("minimal_configs", [])
+
+     # Find the smallest/cheapest GPU option (lowest VRAM that works)
+     best_gpu = None
+     if minimal_configs:
+         # Sort by VRAM to find smallest viable option
+         sorted_configs = sorted(minimal_configs, key=lambda x: x.get("per_gpu_vram_gb", 999))
+         best_gpu = sorted_configs[0] if sorted_configs else None
+
+     # Summary panel
+     console.print()
+     summary_lines = ["[green]✓[/green] Analysis complete"]
+
+     if best_gpu:
+         gpu_name = format_gpu_name(best_gpu.get("gpu_type", "unknown"))
+         vram = best_gpu.get("per_gpu_vram_gb", "?")
+         count = best_gpu.get("min_gpu_count", 1)
+         gpu_str = f"{count}x " if count > 1 else ""
+         summary_lines.append("")
+         summary_lines.append(f"[bold]Recommended:[/bold] [cyan]{gpu_str}{gpu_name}[/cyan] ({vram} GB VRAM)")
+
+     # Add runtime info if available
+     if profiling:
+         completed = [p for p in profiling if p.get("status") in ("completed", "success")]
+         if completed:
+             fastest = min(completed, key=lambda x: x.get("execution_time") or 999)
+             time_s = fastest.get("execution_time")
+             if time_s is not None:
+                 summary_lines.append(f"[bold]Est. runtime:[/bold] {time_s:.2f}s")
+
+             report = fastest.get("runtime_report", {})
+             iters = report.get("train_iteration", {}).get("train_iterations_per_second")
+             if iters:
+                 summary_lines.append(f"[bold]Throughput:[/bold] {iters:.0f} iters/sec")
+
+     console.print(Panel(
+         "\n".join(summary_lines),
+         title="[green]GPU Selection Results[/green]",
+         border_style="green",
+         padding=(1, 2),
+     ))
+
+     # Profiling results table with cost
+     if profiling:
+         pricing = fetch_gpu_pricing()
+         console.print()
+         prof_table = Table(title="Profiling Results", show_header=True, header_style="bold")
+         prof_table.add_column("GPU", style="cyan")
+         prof_table.add_column("Status")
+         prof_table.add_column("Time", justify="right")
+         prof_table.add_column("Cost", justify="right")
+         prof_table.add_column("Iters/sec", justify="right")
+         prof_table.add_column("Peak VRAM", justify="right")
+
+         # Sort by execution time
+         sorted_profiling = sorted(profiling, key=lambda x: x.get("execution_time") or 999)
+
+         for result in sorted_profiling:
+             profile = result.get("profile", "?")
+             rst = result.get("status", "unknown")
+             style = "green" if rst in ("completed", "success") else "red"
+
+             report = result.get("runtime_report") or {}
+             train_iter = report.get("train_iteration") or {}
+
+             time_s = result.get("execution_time")
+             # Calculate cost from pricing
+             cost = result.get("cost")
+             if cost is None and time_s is not None:
+                 cost = calculate_cost(time_s, profile, pricing)
+
+             iters = train_iter.get("train_iterations_per_second")
+             vram = report.get("Peak VRAM Allocated (MB)")
+
+             prof_table.add_row(
+                 format_gpu_name(profile),
+                 f"[{style}]{rst}[/{style}]",
+                 f"{time_s:.2f}s" if time_s is not None else "-",
+                 f"${cost:.6f}" if cost is not None else "-",
+                 f"{iters:.0f}" if iters else "-",
+                 f"{vram:.1f} MB" if vram else "-",
+             )
+
+         console.print(prof_table)
+
+     # Compatible GPU configurations table
+     if minimal_configs:
+         console.print()
+         gpu_table = Table(title="Compatible GPUs", show_header=True, header_style="bold")
+         gpu_table.add_column("GPU", style="cyan")
+         gpu_table.add_column("VRAM", justify="right")
+         gpu_table.add_column("GPUs Needed", justify="right")
+         gpu_table.add_column("Utilization", justify="right")
+
+         # Sort by VRAM size for better display
+         sorted_configs = sorted(minimal_configs, key=lambda x: x.get("per_gpu_vram_gb", 0))
+
+         for i, cfg in enumerate(sorted_configs):
+             gpu_type = format_gpu_name(cfg.get("gpu_type", "?"))
+             vram = cfg.get("per_gpu_vram_gb", 0)
+             count = cfg.get("min_gpu_count", 1)
+             util = cfg.get("vram_utilization_percent", 0)
+
+             # Highlight the recommended (smallest) option
+             if i == 0:
+                 gpu_type = f"[green]{gpu_type}[/green] ✓"
+
+             gpu_table.add_row(
+                 gpu_type,
+                 f"{vram} GB",
+                 str(count),
+                 f"{util}%",
+             )
+
+         console.print(gpu_table)
+
+     # Show run command hint if we have a best GPU and file path
+     if best_gpu and file_path:
+         machine_flag = best_gpu.get("gpu_type", "gpu")
+         console.print()
+         console.print(f"[dim]To run on optimal machine: lyceum python run {file_path} -m {machine_flag}[/dim]")
+
+
+ @gpu_selection_app.command("run", context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
+ def run_gpu_selection(
+     ctx: typer.Context,
+     code_or_file: str = typer.Argument(..., help="Python code to execute or path to Python file"),
+     file_name: str | None = typer.Option(None, "--file-name", "-f", help="Name for the execution"),
+     timeout: int = typer.Option(60, "--timeout", "-t", help="Timeout per sub-job in seconds (1-600)"),
+     requirements: str | None = typer.Option(
+         None, "--requirements", "-r", help="Requirements file path or pip requirements string"
+     ),
+     imports: list[str] | None = typer.Option(
+         None, "--import", help="Pre-import modules (can be used multiple times)"
+     ),
+     use_config: bool = typer.Option(
+         True, "--use-config/--no-config",
+         help="Use workspace config from .lyceum/config.json if available"
+     ),
+     debug: bool = typer.Option(
+         False, "--debug", "-d",
+         help="Show detailed debug information about config, requirements, and payload"
+     ),
+ ):
+     """Run code on multiple GPUs and select the optimal hardware.
+
+     Submits the code to run on all GPU profiles available to your account,
+     then returns which GPU performed best.
+
+     Script arguments can be passed after the file path:
+
+         lyceum predict run train.py -- --epochs 10 --lr 0.001
+     """
+     status = StatusLine()
+
+     try:
+         config.get_client()
+         status.start()
+
+         script_args = [arg for arg in (ctx.args or []) if arg != "--"]
+
+         code, file_path, detected_file_name = read_code_from_source(code_or_file, status)
+         if not file_name:
+             file_name = detected_file_name
+
+         code = inject_script_args(code, script_args, file_name)
+
+         workspace_config = None
+         if use_config:
+             status.update("Loading workspace config...")
+             workspace_config = load_workspace_config(file_path)
+             if workspace_config and debug:
+                 status.stop()
+                 console.print(f"[cyan]DEBUG: Config keys: {list(workspace_config.keys())}[/cyan]")
+                 status.start()
+
+         requirements_content = resolve_requirements(requirements, workspace_config, debug, status)
+         import_files = resolve_import_files(file_path, workspace_config, debug, status)
+
+         # Build payload matching GPUSelectionRequest schema
+         payload = {
+             "code": code,
+             "nbcode": 0,
+             "timeout": timeout,
+         }
+         if file_name:
+             payload["file_name"] = file_name
+         if requirements_content:
+             payload["requirements_content"] = requirements_content
+         if imports:
+             payload["prior_imports"] = imports
+         if import_files:
+             payload["import_files"] = import_files
+
+         if debug:
+             status.stop()
+             console.print("[cyan]DEBUG: Payload summary:[/cyan]")
+             console.print(f"[cyan] - timeout: {timeout}[/cyan]")
+             console.print(f"[cyan] - code length: {len(code)} chars[/cyan]")
+             console.print(f"[cyan] - requirements_content: {len(requirements_content or '')} chars[/cyan]")
+             console.print(f"[cyan] - import_files: {len(import_files or '')} chars[/cyan]")
+             status.start()
+
+         execution_id = submit_gpu_selection(payload, status)
+         console.print(f"[dim]Execution ID: {execution_id}[/dim]")
+
+         status.update("Waiting for GPU selection results...")
+         data = poll_gpu_selection(execution_id, status)
+         status.stop()
+
+         display_results(data, file_path=code_or_file)
+
+         if data.get("status") != "completed":
+             raise typer.Exit(1)
+
+     except typer.Exit:
+         status.stop()
+         raise
+     except Exception as e:
+         status.stop()
+         console.print(f"[red]Error: {e}[/red]")
+         raise typer.Exit(1)
+
+
+ @gpu_selection_app.command("status")
+ def predict_status(
+     execution_id: str = typer.Argument(..., help="Execution ID to check"),
+ ):
+     """Check the status of a GPU selection execution."""
+     try:
+         config.get_client()
+
+         response = httpx.get(
+             f"{config.base_url}/api/v2/external/execution/gpu_selection/{execution_id}/status",
+             headers={"Authorization": f"Bearer {config.api_key}"},
+             timeout=10.0,
+         )
+
+         if response.status_code == 404:
+             console.print("[red]Execution not found.[/red]")
+             raise typer.Exit(1)
+
+         if response.status_code != 200:
+             console.print(f"[red]Error: HTTP {response.status_code}[/red]")
+             console.print(f"[red]{response.content.decode()}[/red]")
+             raise typer.Exit(1)
+
+         data = response.json()
+
+         # Parse metadata if it's a string
+         if isinstance(data.get("metadata"), str):
+             try:
+                 data["metadata"] = json.loads(data["metadata"])
+             except (json.JSONDecodeError, TypeError):
+                 pass
+
+         current_status = data.get("status", "unknown")
+         console.print(f"Status: [bold]{current_status}[/bold]")
+
+         if current_status in ("completed", "failed", "aborted", "system_failure"):
+             display_results(data)
+         else:
+             console.print("[dim]Job is still running. Check again later.[/dim]")
+
+     except typer.Exit:
+         raise
+     except Exception as e:
+         console.print(f"[red]Error: {e}[/red]")
+         raise typer.Exit(1)
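+ # Typical use, after a `run` that timed out client-side (the execution ID is
+ # printed at submit time):
+ #   lyceum predict status <execution_id>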
+
+
+ def display_memory_results(data: dict, file_path: str | None = None) -> None:
+     """Display memory analysis results."""
+     if data is None:
+         console.print("[red]No data received[/red]")
+         return
+
+     metadata = data.get("metadata")
+     if isinstance(metadata, str):
+         try:
+             metadata = json.loads(metadata)
+         except (json.JSONDecodeError, TypeError):
+             metadata = {}
+     metadata = metadata or {}
+
+     extraction = metadata.get("extraction_result", {})
+     mem_config = extraction.get("memory_config", {})
+     mem_reqs = mem_config.get("memory_requirements", {})
+     minimal_configs = mem_config.get("minimal_configs", [])
+
+     if not mem_reqs and not minimal_configs:
+         console.print("[yellow]No memory analysis data available.[/yellow]")
+         return
+
+     # Memory requirements breakdown
+     if mem_reqs:
+         console.print()
+         mem_table = Table(title="Memory Requirements", show_header=True, header_style="bold")
+         mem_table.add_column("Component", style="cyan")
+         mem_table.add_column("Size", justify="right")
+
+         def format_gb(val: float) -> str:
+             if val < 0.001:
+                 return f"{val * 1024:.2f} MB"
+             return f"{val:.3f} GB"
+
+         components = [
+             ("Model Weights", mem_reqs.get("model_weights", 0)),
+             ("Gradients", mem_reqs.get("gradients", 0)),
+             ("Optimizer States", mem_reqs.get("optimizer_states", 0)),
+             ("Activations", mem_reqs.get("activations", 0)),
+             ("Largest Layer", mem_reqs.get("largest_layer", 0)),
+         ]
+
+         total = sum(v for _, v in components if v)
+         for name, val in components:
+             if val:
+                 mem_table.add_row(name, format_gb(val))
+
+         mem_table.add_row("─" * 20, "─" * 10)
+         mem_table.add_row("[bold]Total[/bold]", f"[bold]{format_gb(total)}[/bold]")
+
+         param_count = mem_reqs.get("parameter_count", 0)
+         if param_count:
+             mem_table.add_row("", "")
+             mem_table.add_row("Parameter Count", f"{param_count:.2e}")
+
+         console.print(mem_table)
+
+     # Compatible GPUs
+     if minimal_configs:
+         console.print()
+         gpu_table = Table(title="Compatible GPUs", show_header=True, header_style="bold")
+         gpu_table.add_column("GPU", style="cyan")
+         gpu_table.add_column("VRAM", justify="right")
+         gpu_table.add_column("GPUs Needed", justify="right")
+         gpu_table.add_column("Utilization", justify="right")
+
+         sorted_configs = sorted(minimal_configs, key=lambda x: x.get("per_gpu_vram_gb", 0))
+
+         for i, cfg in enumerate(sorted_configs):
+             gpu_type = format_gpu_name(cfg.get("gpu_type", "?"))
+             vram = cfg.get("per_gpu_vram_gb", 0)
+             count = cfg.get("min_gpu_count", 1)
+             util = cfg.get("vram_utilization_percent", 0)
+
+             if i == 0:
+                 gpu_type = f"[green]{gpu_type}[/green] ✓"
+
+             gpu_table.add_row(gpu_type, f"{vram} GB", str(count), f"{util}%")
+
+         console.print(gpu_table)
+
+     # Show run command hint (guard on minimal_configs: sorted_configs is only
+     # defined when at least one compatible config exists)
+     if file_path and minimal_configs:
+         best = sorted_configs[0]
+         machine_flag = best.get("gpu_type", "gpu")
+         console.print()
+         console.print(f"[dim]To run on optimal machine: lyceum python run {file_path} -m {machine_flag}[/dim]")
+
+
+ @gpu_selection_app.command("memory", context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
+ def predict_memory(
+     ctx: typer.Context,
+     code_or_file: str = typer.Argument(..., help="Python code or path to Python file"),
+     file_name: str | None = typer.Option(None, "--file-name", "-f", help="Name for the execution"),
+     requirements: str | None = typer.Option(
+         None, "--requirements", "-r", help="Requirements file path or pip requirements string"
+     ),
+     imports: list[str] | None = typer.Option(
+         None, "--import", help="Pre-import modules (can be used multiple times)"
+     ),
+     mixed_precision: str | None = typer.Option(
+         None, "--mixed-precision", "-mp",
+         help="Mixed precision dtype (fp16, bf16)"
+     ),
+     strategy: str | None = typer.Option(
+         None, "--strategy", "-s",
+         help="Parallelization strategy (ddp, fsdp, zero1, zero2, zero3)"
+     ),
+     use_config: bool = typer.Option(
+         True, "--use-config/--no-config",
+         help="Use workspace config from .lyceum/config.json if available"
+     ),
+ ):
+     """Estimate memory requirements for your training script.
+
+     Analyzes model architecture to predict VRAM usage without running
+     full GPU profiling. Faster than 'predict run'.
+
+     Examples:
+         lyceum predict memory train.py
+         lyceum predict memory train.py --mixed-precision fp16
+         lyceum predict memory train.py --strategy fsdp
+     """
+     status = StatusLine()
+
+     try:
+         config.get_client()
+         status.start()
+
+         script_args = [arg for arg in (ctx.args or []) if arg != "--"]
+
+         code, file_path, detected_file_name = read_code_from_source(code_or_file, status)
+         if not file_name:
+             file_name = detected_file_name
+
+         code = inject_script_args(code, script_args, file_name)
+
+         workspace_config = None
+         if use_config:
+             status.update("Loading workspace config...")
+             workspace_config = load_workspace_config(file_path)
+
+         requirements_content = resolve_requirements(requirements, workspace_config, False, status)
+         import_files = resolve_import_files(file_path, workspace_config, False, status)
+
+         # Build payload - same as gpu_selection but we'll only show memory results
+         payload = {
+             "code": code,
+             "nbcode": 0,
+             "timeout": 60,  # Memory analysis is quick
+         }
+         if file_name:
+             payload["file_name"] = file_name
+         if requirements_content:
+             payload["requirements_content"] = requirements_content
+         if imports:
+             payload["prior_imports"] = imports
+         if import_files:
+             payload["import_files"] = import_files
+
+         # TODO: When backend supports it, add mixed_precision and strategy to payload
+         if mixed_precision:
+             console.print(f"[dim]Note: --mixed-precision {mixed_precision} (backend support coming soon)[/dim]")
+         if strategy:
+             console.print(f"[dim]Note: --strategy {strategy} (backend support coming soon)[/dim]")
+
+         execution_id = submit_gpu_selection(payload, status)
+         console.print(f"[dim]Execution ID: {execution_id}[/dim]")
+
+         status.update("Analyzing memory requirements...")
+         data = poll_gpu_selection(execution_id, status)
+         status.stop()
+
+         if data.get("status") != "completed":
+             display_results(data)  # Show error with suggestions
+             raise typer.Exit(1)
+
+         display_memory_results(data, file_path=code_or_file)
+
+     except typer.Exit:
+         status.stop()
+         raise
+     except Exception as e:
+         status.stop()
+         console.print(f"[red]Error: {e}[/red]")
+         raise typer.Exit(1)
+
+
+ @gpu_selection_app.command("recommend-gpus", context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
+ def recommend_gpus(
+     ctx: typer.Context,
+     code_or_file: str = typer.Argument(..., help="Python code or path to Python file"),
+     file_name: str | None = typer.Option(None, "--file-name", "-f", help="Name for the execution"),
+     requirements: str | None = typer.Option(
+         None, "--requirements", "-r", help="Requirements file path or pip requirements string"
+     ),
+     use_config: bool = typer.Option(
+         True, "--use-config/--no-config",
+         help="Use workspace config from .lyceum/config.json if available"
+     ),
+     top: int = typer.Option(3, "--top", "-n", help="Number of recommendations to show"),
+ ):
+     """Quick GPU recommendations based on memory analysis.
+
+     Analyzes your model and recommends the best GPU configurations,
+     ranked by VRAM footprint (smaller GPUs are generally cheaper).
+
+     Examples:
+         lyceum predict recommend-gpus train.py
+         lyceum predict recommend-gpus train.py --top 5
+     """
+     status = StatusLine()
+
+     try:
+         config.get_client()
+         status.start()
+
+         script_args = [arg for arg in (ctx.args or []) if arg != "--"]
+
+         code, file_path, detected_file_name = read_code_from_source(code_or_file, status)
+         if not file_name:
+             file_name = detected_file_name
+
+         code = inject_script_args(code, script_args, file_name)
+
+         workspace_config = None
+         if use_config:
+             status.update("Loading workspace config...")
+             workspace_config = load_workspace_config(file_path)
+
+         requirements_content = resolve_requirements(requirements, workspace_config, False, status)
+         import_files = resolve_import_files(file_path, workspace_config, False, status)
+
+         payload = {
+             "code": code,
+             "nbcode": 0,
+             "timeout": 60,
+         }
+         if file_name:
+             payload["file_name"] = file_name
+         if requirements_content:
+             payload["requirements_content"] = requirements_content
+         if import_files:
+             payload["import_files"] = import_files
+
+         execution_id = submit_gpu_selection(payload, status)
+         console.print(f"[dim]Execution ID: {execution_id}[/dim]")
+
+         status.update("Analyzing model and generating recommendations...")
+         data = poll_gpu_selection(execution_id, status)
+         status.stop()
+
+         if data.get("status") != "completed":
+             display_results(data)
+             raise typer.Exit(1)
+
+         # Get pricing for cost estimates
+         pricing = fetch_gpu_pricing()
+
+         metadata = data.get("metadata")
+         if isinstance(metadata, str):
+             try:
+                 metadata = json.loads(metadata)
+             except (json.JSONDecodeError, TypeError):
+                 metadata = {}
+         metadata = metadata or {}
+
+         extraction = metadata.get("extraction_result", {})
+         mem_config = extraction.get("memory_config", {})
+         minimal_configs = mem_config.get("minimal_configs", [])
+
+         if not minimal_configs:
+             console.print("[yellow]No GPU recommendations available.[/yellow]")
+             raise typer.Exit(1)
+
+         # Sort by VRAM (smaller = likely cheaper)
+         sorted_configs = sorted(minimal_configs, key=lambda x: x.get("per_gpu_vram_gb", 0))[:top]
+
+         console.print()
+         console.print(Panel(
+             f"[green]✓[/green] Found {len(minimal_configs)} compatible GPU configurations",
+             title="[green]GPU Recommendations[/green]",
+             border_style="green",
+         ))
+
+         console.print()
+         rec_table = Table(title=f"Top {min(top, len(sorted_configs))} Recommendations", show_header=True, header_style="bold")
+         rec_table.add_column("#", style="dim", width=3)
+         rec_table.add_column("GPU", style="cyan")
+         rec_table.add_column("VRAM", justify="right")
+         rec_table.add_column("GPUs", justify="right")
+         rec_table.add_column("$/hour", justify="right")
+         rec_table.add_column("Utilization", justify="right")
+
+         for i, cfg in enumerate(sorted_configs):
+             gpu_type = cfg.get("gpu_type", "?")
+             gpu_display = format_gpu_name(gpu_type)
+             vram = cfg.get("per_gpu_vram_gb", 0)
+             count = cfg.get("min_gpu_count", 1)
+             util = cfg.get("vram_utilization_percent", 0)
+
+             price = pricing.get(gpu_type, 0)
+             price_str = f"${price:.2f}" if price else "-"
+
+             rank = f"[green]{i + 1}[/green]" if i == 0 else str(i + 1)
+
+             rec_table.add_row(
+                 rank,
+                 gpu_display,
+                 f"{vram} GB",
+                 str(count),
+                 price_str,
+                 f"{util}%",
+             )
+
+         console.print(rec_table)
+
+         # Show command hint
+         best = sorted_configs[0]
+         machine_flag = best.get("gpu_type", "gpu")
+         console.print()
+         console.print(f"[dim]Run with: lyceum python run {code_or_file} -m {machine_flag}[/dim]")
+
+     except typer.Exit:
+         status.stop()
+         raise
+     except Exception as e:
+         status.stop()
+         console.print(f"[red]Error: {e}[/red]")
+         raise typer.Exit(1)
+
+
+ def display_runtime_results(data: dict, file_path: str | None = None) -> None:
+     """Display runtime profiling results."""
+     if data is None:
+         console.print("[red]No data received[/red]")
+         return
+
+     metadata = data.get("metadata")
+     if isinstance(metadata, str):
+         try:
+             metadata = json.loads(metadata)
+         except (json.JSONDecodeError, TypeError):
+             metadata = {}
+     metadata = metadata or {}
+
+     profiling = metadata.get("profiling_results", [])
+
+     if not profiling:
+         console.print("[yellow]No runtime profiling data available.[/yellow]")
+         return
+
+     pricing = fetch_gpu_pricing()
+
+     # Find best performers
+     completed = [p for p in profiling if p.get("status") in ("completed", "success")]
+
+     if completed:
+         fastest = min(completed, key=lambda x: x.get("execution_time") or 999)
+         fastest_profile = format_gpu_name(fastest.get("profile", "?"))
+         fastest_time = fastest.get("execution_time") or 0
+
+         console.print()
+         console.print(Panel(
+             f"[green]✓[/green] Profiling complete\n\n"
+             f"[bold]Fastest:[/bold] [cyan]{fastest_profile}[/cyan] ({fastest_time:.2f}s)",
+             title="[green]Runtime Analysis[/green]",
+             border_style="green",
+         ))
+
+     # Detailed results table
+     console.print()
+     prof_table = Table(title="Runtime Results by GPU", show_header=True, header_style="bold")
+     prof_table.add_column("GPU", style="cyan")
+     prof_table.add_column("Status")
+     prof_table.add_column("Time", justify="right")
+     prof_table.add_column("Cost", justify="right")
+     prof_table.add_column("Iters/sec", justify="right")
+     prof_table.add_column("Avg Batch (ms)", justify="right")
+     prof_table.add_column("Peak VRAM", justify="right")
+
+     sorted_profiling = sorted(profiling, key=lambda x: x.get("execution_time") or 999)
+
+     for result in sorted_profiling:
+         profile = result.get("profile", "?")
+         rst = result.get("status", "unknown")
+         style = "green" if rst in ("completed", "success") else "red"
+
+         report = result.get("runtime_report") or {}
+         train = report.get("training") or {}
+         train_iter = report.get("train_iteration") or {}
+
+         time_s = result.get("execution_time")
+         cost = result.get("cost")
+         if cost is None and time_s is not None:
+             cost = calculate_cost(time_s, profile, pricing)
+
+         iters = train_iter.get("train_iterations_per_second")
+         avg_batch = train.get("avg_train_time_ms")
+         vram = report.get("Peak VRAM Allocated (MB)")
+
+         prof_table.add_row(
+             format_gpu_name(profile),
+             f"[{style}]{rst}[/{style}]",
+             f"{time_s:.2f}s" if time_s is not None else "-",
+             f"${cost:.6f}" if cost is not None else "-",
+             f"{iters:.0f}" if iters else "-",
+             f"{avg_batch:.2f}" if avg_batch else "-",
+             f"{vram:.1f} MB" if vram else "-",
+         )
+
+     console.print(prof_table)
+
+     # Show run command hint based on fastest GPU
+     if file_path and completed:
+         fastest = min(completed, key=lambda x: x.get("execution_time") or 999)
+         machine_flag = fastest.get("profile", "gpu")
+         console.print()
+         console.print(f"[dim]To run on fastest machine: lyceum python run {file_path} -m {machine_flag}[/dim]")
+
+
+ @gpu_selection_app.command("runtime", context_settings={"allow_extra_args": True, "allow_interspersed_args": True})
+ def predict_runtime(
+     ctx: typer.Context,
+     code_or_file: str = typer.Argument(..., help="Python code or path to Python file"),
+     file_name: str | None = typer.Option(None, "--file-name", "-f", help="Name for the execution"),
+     timeout: int = typer.Option(120, "--timeout", "-t", help="Timeout per GPU in seconds (1-600)"),
+     requirements: str | None = typer.Option(
+         None, "--requirements", "-r", help="Requirements file path or pip requirements string"
+     ),
+     imports: list[str] | None = typer.Option(
+         None, "--import", help="Pre-import modules (can be used multiple times)"
+     ),
+     use_config: bool = typer.Option(
+         True, "--use-config/--no-config",
+         help="Use workspace config from .lyceum/config.json if available"
+     ),
+ ):
+     """Profile runtime performance across different GPUs.
+
+     Runs your training script on available GPUs and measures actual
+     execution time, throughput, and VRAM usage.
+
+     Examples:
+         lyceum predict runtime train.py
+         lyceum predict runtime train.py --timeout 180
+     """
+     status = StatusLine()
+
+     try:
+         config.get_client()
+         status.start()
+
+         script_args = [arg for arg in (ctx.args or []) if arg != "--"]
+
+         code, file_path, detected_file_name = read_code_from_source(code_or_file, status)
+         if not file_name:
+             file_name = detected_file_name
+
+         code = inject_script_args(code, script_args, file_name)
+
+         workspace_config = None
+         if use_config:
+             status.update("Loading workspace config...")
+             workspace_config = load_workspace_config(file_path)
+
+         requirements_content = resolve_requirements(requirements, workspace_config, False, status)
+         import_files = resolve_import_files(file_path, workspace_config, False, status)
+
+         payload = {
+             "code": code,
+             "nbcode": 0,
+             "timeout": timeout,
+         }
+         if file_name:
+             payload["file_name"] = file_name
+         if requirements_content:
+             payload["requirements_content"] = requirements_content
+         if imports:
+             payload["prior_imports"] = imports
+         if import_files:
+             payload["import_files"] = import_files
+
+         execution_id = submit_gpu_selection(payload, status)
+         console.print(f"[dim]Execution ID: {execution_id}[/dim]")
+
+         status.update("Profiling runtime across GPUs...")
+         data = poll_gpu_selection(execution_id, status)
+         status.stop()
+
+         if data.get("status") != "completed":
+             display_results(data)
+             raise typer.Exit(1)
+
+         display_runtime_results(data, file_path=code_or_file)
+
+     except typer.Exit:
+         status.stop()
+         raise
+     except Exception as e:
+         status.stop()
+         console.print(f"[red]Error: {e}[/red]")
+         raise typer.Exit(1)