gpu-dev 0.5.16__tar.gz → 0.5.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/PKG-INFO +1 -1
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +205 -4
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/pyproject.toml +1 -1
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_processor/index.py +173 -26
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/k8s_client.py +6 -1
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda.tf +2 -2
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/.gitignore +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/CLAUDE.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/PROGRESS.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/TODO.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/admin/README.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/post.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/setup.cfg +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -1349,6 +1349,205 @@ def reserve(
|
|
|
1349
1349
|
rprint(f"[red]❌ Error: {str(e)}[/red]")
|
|
1350
1350
|
|
|
1351
1351
|
|
|
1352
|
+
_SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1353
|
+
"h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
|
|
1354
|
+
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
|
|
1355
|
+
|
|
1356
|
+
|
|
1357
|
+
@main.command(context_settings={"ignore_unknown_options": True})
@click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
@click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation duration ceiling (job auto-cancels on exit).")
@click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
@click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
@click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
              help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
@click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
@click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
@click.option("--name", type=str, default=None, help="Reservation name.")
@click.option("--timeout", type=int, default=20, show_default=True, help="Minutes to wait for the reservation to become active.")
@click.argument("command", nargs=-1, required=True)
@click.pass_context
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pull, keep_alive, name, timeout, command):
    """Submit a job: reserve, sync code, run, sync results back, auto-cancel.

    \b
    Examples:
      gpu-dev submit --runtime ./ -- python train.py
      gpu-dev submit --gpus 16 --gpu-type h100 --runtime . -- bash run.sh
      gpu-dev submit --keep-alive -- nvidia-smi

    The job runs on rank 0 (master pod). For multinode jobs, MULTINODE_HOSTS / RANK /
    SIZE / MASTER_ADDR / MASTER_PORT are exported on every pod so torchrun and friends
    work without manual wiring. Exit code mirrors the remote command's exit code.
    """
    # NOTE: the docstring above is rendered by click as the --help text, so
    # maintainer-facing notes live in comments instead.
    #
    # Process exit codes produced by this command:
    #   2    usage/validation error, auth failure, reservation could not be
    #        created, workspace mkdir/upload failed, or unexpected exception
    #   1    reservation never became active / no connection info / no FQDN
    #   130  interrupted by the user (Ctrl-C)
    #   else the exit status of the remote command as reported by ssh
    #
    # Once a reservation exists, every failure path goes through maybe_cancel()
    # so the reservation is released unless --keep-alive was given.
    import subprocess
    import shlex
    import sys
    from pathlib import Path

    # click already enforces required=True; kept as a defensive guard.
    if not command:
        rprint("[red]❌ Provide a command after --, e.g. gpu-dev submit --runtime ./ -- python train.py[/red]")
        sys.exit(2)

    gt = gpu_type.lower()
    # Per-type max GPUs (mirrors gpu_configs in reserve flow)
    max_per_node = {
        "t4": 4, "l4": 4, "a10g": 4, "rtxpro6000": 4, "t4-small": 1,
        "a100": 8, "h100": 8, "h200": 8, "b200": 8,
        "h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8,
        "b200-mig-1g": 4, "b200-mig-2g": 2, "b200-mig-3g": 2,
        "cpu-arm": 0, "cpu-x86": 0,
    }.get(gt)
    if max_per_node is None:
        rprint(f"[red]❌ Unknown gpu-type '{gpu_type}'[/red]")
        sys.exit(2)

    is_cpu = gt in ("cpu-arm", "cpu-x86")

    # Validate numeric options BEFORE creating a reservation so that a typo
    # such as `--gpus 0` or `--hours -1` never reaches the backend.
    # NOTE(review): CPU types keep whatever --gpus value was passed (default 1),
    # exactly as before; confirm whether the backend expects gpu_count=0 there.
    min_gpus = 0 if is_cpu else 1
    if gpus < min_gpus:
        rprint(f"[red]❌ --gpus must be at least {min_gpus} for {gt}[/red]")
        sys.exit(2)
    if not (0 < hours < float("inf")):  # also rejects NaN
        rprint("[red]❌ --hours must be a positive number[/red]")
        sys.exit(2)
    if timeout <= 0:
        rprint("[red]❌ --timeout must be a positive number of minutes[/red]")
        sys.exit(2)

    # CPU types never go multinode; GPU types do once the request exceeds one node.
    is_multinode = not is_cpu and gpus > max_per_node
    if is_multinode and gpus % max_per_node != 0:
        rprint(f"[red]❌ For multinode {gt}, --gpus must be a multiple of {max_per_node}[/red]")
        sys.exit(2)

    config = load_config()
    try:
        user_info = authenticate_user(config)
    except RuntimeError as e:
        rprint(f"[red]❌ {str(e)}[/red]")
        sys.exit(2)

    rm = ReservationManager(config)

    # Determine effective disk handling. Multinode: only master gets persistent disk; we always
    # SSH into rank 0, so passing --disk is fine.
    if no_persistent_disk and disk:
        # Previously the disk name was dropped silently; tell the user.
        rprint("[yellow]⚠️ --disk is ignored because --no-persistent-disk was given[/yellow]")
    disk_name = None if no_persistent_disk else disk

    rprint(f"[cyan]🎫 Reserving {gpus}x {gpu_type.upper()} for up to {hours}h...[/cyan]")
    if is_multinode:
        reservation_ids = rm.create_multinode_reservation(
            user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
            duration_hours=hours, name=name, github_user=user_info["github_user"],
            no_persistent_disk=no_persistent_disk, disk_name=disk_name)
        if not reservation_ids:
            rprint("[red]❌ Failed to create multinode reservation[/red]")
            sys.exit(2)
        primary_id = reservation_ids[0]
    else:
        primary_id = rm.create_reservation(
            user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
            duration_hours=hours, name=name, github_user=user_info["github_user"],
            no_persistent_disk=no_persistent_disk, disk_name=disk_name)
        if not primary_id:
            rprint("[red]❌ Failed to create reservation[/red]")
            sys.exit(2)
        reservation_ids = [primary_id]

    short_id = primary_id[:8]
    already_cancelled = False

    def maybe_cancel(reason: str):
        """Cancel every reservation of this job once, unless --keep-alive is set."""
        nonlocal already_cancelled
        if already_cancelled or keep_alive:
            return
        already_cancelled = True
        rprint(f"[yellow]🛑 Cancelling reservation {short_id} ({reason})[/yellow]")
        for rid in reservation_ids:
            # Best effort: one failed cancel must not stop the others.
            try:
                rm.cancel_reservation(rid, user_info["user_id"])
            except Exception as ce:
                rprint(f"[dim] cancel {rid[:8]} failed: {ce}[/dim]")

    try:
        rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active (up to {timeout}m)...[/cyan]")
        if is_multinode:
            results = rm.wait_for_multinode_reservation_completion(reservation_ids, timeout_minutes=timeout)
        else:
            single = rm.wait_for_reservation_completion(primary_id, timeout_minutes=timeout)
            results = [single] if single else None
        if not results:
            rprint("[red]❌ Reservation never became active[/red]")
            maybe_cancel("activation timeout")
            sys.exit(1)

        # Resolve master pod (rank 0)
        conn = rm.get_connection_info(primary_id, user_info["user_id"])
        if not conn:
            rprint("[red]❌ Could not fetch connection info[/red]")
            maybe_cancel("no connection info")
            sys.exit(1)
        if conn.get("is_multinode"):
            # Lowest node_index is treated as the master.
            nodes = sorted(conn["nodes"], key=lambda n: n.get("node_index", 0))
            master = nodes[0]
            master_id, master_pod, master_fqdn, master_name = (
                master["reservation_id"], master["pod_name"],
                master.get("fqdn"), master.get("name"))
        else:
            master_id, master_pod, master_fqdn, master_name = (
                primary_id, conn["pod_name"], conn.get("fqdn"), conn.get("name"))

        # Ensure SSH config exists
        gpu_dev_dir = Path.home() / ".gpu-dev"
        config_file = gpu_dev_dir / f"{master_id[:8]}-sshconfig"
        if not config_file.exists():
            if not (master_fqdn and master_pod):
                rprint("[red]❌ Master pod has no FQDN yet — can't SSH[/red]")
                maybe_cancel("no fqdn")
                sys.exit(1)
            create_ssh_config_for_reservation(master_fqdn, master_pod, master_id, master_name)

        ssh_alias = master_pod
        ssh_base = ["ssh", "-F", str(config_file), "-o", "StrictHostKeyChecking=accept-new"]
        # rsync takes its transport as ONE shell-style string, so quote each part.
        rsync_e = " ".join(shlex.quote(x) for x in ssh_base)

        # Working directory and rsync up
        if runtime:
            workdir = f"/workspace/submit-{master_id[:8]}"
            rprint(f"[cyan]📦 Syncing {runtime} → {ssh_alias}:{workdir}[/cyan]")
            r = subprocess.run(ssh_base + [ssh_alias, f"mkdir -p {shlex.quote(workdir)}"])
            if r.returncode != 0:
                rprint("[red]❌ Failed to create remote workspace[/red]")
                maybe_cancel("mkdir failed")
                sys.exit(2)
            # Trailing slashes: copy the directory CONTENTS, not the directory itself.
            r = subprocess.run([
                "rsync", "-az", "--delete", "-e", rsync_e,
                f"{runtime.rstrip('/')}/", f"{ssh_alias}:{workdir}/",
            ])
            if r.returncode != 0:
                rprint("[red]❌ Upload rsync failed[/red]")
                maybe_cancel("upload failed")
                sys.exit(2)
        else:
            workdir = "/home/dev"

        # Run remote command via login shell so MULTINODE_* etc. are loaded
        remote_cmd = " ".join(shlex.quote(c) for c in command)
        rprint(f"[cyan]🚀 Running on {ssh_alias}: {remote_cmd}[/cyan]\n")
        ssh_run = ssh_base + [ssh_alias,
                              f"cd {shlex.quote(workdir)} && bash -lc {shlex.quote(remote_cmd)}"]
        rc = subprocess.call(ssh_run)
        if rc < 0:
            # subprocess reports "killed by signal N" as -N; sys.exit(-N) would
            # yield an arbitrary wrapped status, so use the shell's 128+N form.
            rc = 128 - rc
        rprint(f"\n[dim]Job exited with code {rc}[/dim]")

        # Sync back results before cancelling
        if runtime and not no_pull:
            rprint(f"[cyan]📥 Syncing {ssh_alias}:{workdir}/ → {runtime}[/cyan]")
            # No --delete on the way back: never remove local files.
            pull = subprocess.run([
                "rsync", "-az", "-e", rsync_e,
                f"{ssh_alias}:{workdir}/", f"{runtime.rstrip('/')}/",
            ])
            if pull.returncode != 0:
                rprint(f"[yellow]⚠️ Result rsync exited with {pull.returncode} — your output may be incomplete[/yellow]")

        maybe_cancel("job complete")
        sys.exit(rc)

    except KeyboardInterrupt:
        rprint("\n[yellow]Interrupted — cancelling[/yellow]")
        maybe_cancel("user interrupt")
        sys.exit(130)
    except SystemExit:
        # Deliberate exits above have already handled cancellation.
        raise
    except Exception as e:
        rprint(f"[red]❌ Submit error: {e}[/red]")
        maybe_cancel("submit error")
        sys.exit(2)
|
|
1549
|
+
|
|
1550
|
+
|
|
1352
1551
|
@main.command()
|
|
1353
1552
|
@click.option(
|
|
1354
1553
|
"--user",
|
|
@@ -1542,13 +1741,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1542
1741
|
if "@" in user_id:
|
|
1543
1742
|
user_display = user_id.split("@")[0]
|
|
1544
1743
|
|
|
1545
|
-
# Format GPU information
|
|
1744
|
+
# Format GPU information (MIG-friendly via _format_gpu_display)
|
|
1546
1745
|
if gpu_type and gpu_type not in ["unknown", "Unknown"]:
|
|
1547
|
-
# For CPU nodes (gpu_count = 0), show just the type
|
|
1548
1746
|
if gpu_count == 0:
|
|
1549
1747
|
gpu_display = gpu_type
|
|
1550
1748
|
else:
|
|
1551
|
-
gpu_display =
|
|
1749
|
+
gpu_display = _format_gpu_display(gpu_count, gpu_type)
|
|
1552
1750
|
else:
|
|
1553
1751
|
gpu_display = str(gpu_count)
|
|
1554
1752
|
|
|
@@ -1844,7 +2042,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1844
2042
|
if gpu_count == 0:
|
|
1845
2043
|
gpu_display = gpu_type
|
|
1846
2044
|
else:
|
|
1847
|
-
gpu_display =
|
|
2045
|
+
gpu_display = _format_gpu_display(gpu_count, gpu_type)
|
|
1848
2046
|
else:
|
|
1849
2047
|
gpu_display = str(gpu_count)
|
|
1850
2048
|
|
|
@@ -2417,6 +2615,9 @@ def _format_gpu_display(gpu_count, gpu_type):
|
|
|
2417
2615
|
"h100-mig-3g": "40GB H100 (MIG)",
|
|
2418
2616
|
"h100-mig-4g": "40GB H100 (MIG)",
|
|
2419
2617
|
"h100-mig-7g": "80GB H100 (MIG)",
|
|
2618
|
+
"b200-mig-1g": "23GB B200 (MIG)",
|
|
2619
|
+
"b200-mig-2g": "45GB B200 (MIG)",
|
|
2620
|
+
"b200-mig-3g": "90GB B200 (MIG)",
|
|
2420
2621
|
}
|
|
2421
2622
|
if gt_lower in mig_friendly:
|
|
2422
2623
|
return f"{gpu_count}× {mig_friendly[gt_lower]}"
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.
|
|
7
|
+
version = "0.5.18"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -308,30 +308,35 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
|
|
|
308
308
|
f"Node {node.metadata.name} in {node_az}: {available_gpus} available GPUs")
|
|
309
309
|
|
|
310
310
|
if candidate_nodes:
|
|
311
|
-
#
|
|
311
|
+
# Binpacking: pack into the most-loaded node that still fits the request.
|
|
312
|
+
# Sort by free GPUs ASC so the fullest node comes first; ties broken by node name
|
|
313
|
+
# so the choice is deterministic across Lambda invocations.
|
|
314
|
+
candidate_nodes.sort(key=lambda n: (n['available_gpus'], n['node_name']))
|
|
312
315
|
selected_node = candidate_nodes[0]
|
|
313
316
|
target_az = selected_node['az']
|
|
317
|
+
target_node = selected_node['node_name']
|
|
314
318
|
logger.info(
|
|
315
|
-
f"
|
|
316
|
-
|
|
319
|
+
f"Binpacked target for {gpu_type} {gpus_requested}gpu: "
|
|
320
|
+
f"node={target_node} az={target_az} free={selected_node['available_gpus']} "
|
|
321
|
+
f"(candidates considered: {len(candidate_nodes)})")
|
|
322
|
+
return target_az, target_node
|
|
317
323
|
|
|
318
324
|
if all_ready_nodes:
|
|
319
|
-
# No single node has enough GPUs
|
|
320
|
-
#
|
|
325
|
+
# No single node has enough GPUs — return AZ of the node with the most available GPUs
|
|
326
|
+
# so disk lands in the right AZ. No node hint (pod will Pending until something frees up).
|
|
321
327
|
best_node = max(all_ready_nodes, key=lambda n: n['available_gpus'])
|
|
322
328
|
target_az = best_node['az']
|
|
323
329
|
logger.info(
|
|
324
330
|
f"No single node has {gpus_requested} {gpu_type} GPUs, "
|
|
325
331
|
f"but {len(all_ready_nodes)} nodes exist. Using AZ {target_az} "
|
|
326
332
|
f"from node {best_node['node_name']} ({best_node['available_gpus']} GPUs available)")
|
|
327
|
-
return target_az
|
|
333
|
+
return target_az, None
|
|
328
334
|
|
|
329
335
|
logger.warning(f"No ready/schedulable {gpu_type} nodes found in cluster")
|
|
330
336
|
return None, None
|
|
331
337
|
|
|
332
338
|
except Exception as e:
|
|
333
339
|
logger.error(f"Error determining target AZ for {gpu_type}: {str(e)}")
|
|
334
|
-
# Fallback to primary AZ if detection fails (no node hint — let k8s pick).
|
|
335
340
|
return PRIMARY_AVAILABILITY_ZONE, None
|
|
336
341
|
|
|
337
342
|
|
|
@@ -1418,6 +1423,11 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
|
|
|
1418
1423
|
logger.info(
|
|
1419
1424
|
f"Starting parallel processing for {total_nodes} nodes")
|
|
1420
1425
|
|
|
1426
|
+
# Deterministic peer pod names by node_index so MULTINODE_RANK aligns with the
|
|
1427
|
+
# position of this pod in MULTINODE_HOSTS across all replicas.
|
|
1428
|
+
nodes_sorted = sorted(nodes, key=lambda n: int(n.get("node_index", 0)))
|
|
1429
|
+
peer_pod_names = [f"gpu-dev-{n['reservation_id'][:8]}" for n in nodes_sorted]
|
|
1430
|
+
|
|
1421
1431
|
def process_single_node(node_data):
|
|
1422
1432
|
"""Process a single node - to be run in parallel"""
|
|
1423
1433
|
i, node = node_data
|
|
@@ -1430,7 +1440,8 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
|
|
|
1430
1440
|
'action': 'process_multinode_individual',
|
|
1431
1441
|
'node_index': int(node_index),
|
|
1432
1442
|
'total_nodes': int(total_nodes),
|
|
1433
|
-
'master_reservation_id': str(master_reservation_id)
|
|
1443
|
+
'master_reservation_id': str(master_reservation_id),
|
|
1444
|
+
'multinode_peer_pods': peer_pod_names,
|
|
1434
1445
|
}
|
|
1435
1446
|
|
|
1436
1447
|
logger.info(
|
|
@@ -1536,6 +1547,12 @@ def process_multinode_individual_node(message_body: dict) -> bool:
|
|
|
1536
1547
|
|
|
1537
1548
|
node_data = response["Item"]
|
|
1538
1549
|
|
|
1550
|
+
# Forward peer pod list from coordinator into request dict so create_pod can
|
|
1551
|
+
# bake MULTINODE_HOSTS / MASTER_ADDR / MULTINODE_RANK env vars into the pod.
|
|
1552
|
+
peer_pods = message_body.get("multinode_peer_pods")
|
|
1553
|
+
if peer_pods:
|
|
1554
|
+
node_data["multinode_peer_pods"] = peer_pods
|
|
1555
|
+
|
|
1539
1556
|
# Update status to preparing pod
|
|
1540
1557
|
update_multinode_pod_status(
|
|
1541
1558
|
reservation_id, "preparing pod", node_index, total_nodes)
|
|
@@ -2722,6 +2739,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2722
2739
|
persistent_volume_id = None
|
|
2723
2740
|
device_name = None
|
|
2724
2741
|
target_az = None # Initialize target_az for use in connection info update
|
|
2742
|
+
target_node = None # Initialize target_node (binpacking hostname pin) for create_pod
|
|
2725
2743
|
is_new_disk = False # Initialize is_new_disk for all code paths
|
|
2726
2744
|
|
|
2727
2745
|
# If we're using persistent disk, immediately mark this reservation as having a volume
|
|
@@ -2749,8 +2767,8 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2749
2767
|
detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else "")
|
|
2750
2768
|
)
|
|
2751
2769
|
|
|
2752
|
-
# Determine target AZ for this reservation
|
|
2753
|
-
target_az = get_target_az_for_reservation(gpu_type, gpu_count)
|
|
2770
|
+
# Determine target AZ + node for this reservation (binpacking)
|
|
2771
|
+
target_az, target_node = get_target_az_for_reservation(gpu_type, gpu_count)
|
|
2754
2772
|
if not target_az:
|
|
2755
2773
|
raise ValueError(f"No {gpu_type} nodes found in cluster")
|
|
2756
2774
|
|
|
@@ -2881,6 +2899,9 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2881
2899
|
dockerfile_base64_data=dockerfile_base64_data,
|
|
2882
2900
|
dockerimage=dockerimage,
|
|
2883
2901
|
target_az=target_az,
|
|
2902
|
+
target_node=target_node,
|
|
2903
|
+
multinode_peer_pods=request.get("multinode_peer_pods"),
|
|
2904
|
+
multinode_rank=int(request.get("node_index", 0)) if is_multinode else 0,
|
|
2884
2905
|
preserve_entrypoint=preserve_entrypoint,
|
|
2885
2906
|
node_labels=node_labels,
|
|
2886
2907
|
trace_data=trace_data,
|
|
@@ -3421,6 +3442,9 @@ def create_kubernetes_resources(
|
|
|
3421
3442
|
recreate_env: bool = False,
|
|
3422
3443
|
efs_filesystem_id: str = None,
|
|
3423
3444
|
is_multinode: bool = False,
|
|
3445
|
+
target_node: str = None,
|
|
3446
|
+
multinode_peer_pods: list = None,
|
|
3447
|
+
multinode_rank: int = 0,
|
|
3424
3448
|
dockerfile_base64_data: str = None,
|
|
3425
3449
|
dockerimage: str = None,
|
|
3426
3450
|
target_az: str = None,
|
|
@@ -3524,6 +3548,9 @@ def create_kubernetes_resources(
|
|
|
3524
3548
|
dockerfile_base64_data=dockerfile_base64_data,
|
|
3525
3549
|
dockerimage=dockerimage,
|
|
3526
3550
|
target_az=target_az,
|
|
3551
|
+
target_node=target_node,
|
|
3552
|
+
multinode_peer_pods=multinode_peer_pods,
|
|
3553
|
+
multinode_rank=multinode_rank,
|
|
3527
3554
|
preserve_entrypoint=preserve_entrypoint,
|
|
3528
3555
|
node_labels=node_labels,
|
|
3529
3556
|
trace_data=trace_data,
|
|
@@ -3610,6 +3637,9 @@ def create_kubernetes_resources(
|
|
|
3610
3637
|
dockerfile_base64_data=dockerfile_base64_data,
|
|
3611
3638
|
dockerimage=dockerimage,
|
|
3612
3639
|
target_az=target_az,
|
|
3640
|
+
target_node=target_node,
|
|
3641
|
+
multinode_peer_pods=multinode_peer_pods,
|
|
3642
|
+
multinode_rank=multinode_rank,
|
|
3613
3643
|
preserve_entrypoint=preserve_entrypoint,
|
|
3614
3644
|
node_labels=node_labels,
|
|
3615
3645
|
trace_data=trace_data,
|
|
@@ -3712,6 +3742,30 @@ def find_available_node_port(k8s_client) -> int:
|
|
|
3712
3742
|
return random.randint(30000, 32767)
|
|
3713
3743
|
|
|
3714
3744
|
|
|
3745
|
+
def _mig_slice_fraction(gpu_type: str) -> float:
|
|
3746
|
+
"""For MIG SKUs return slice fraction of a single GPU (1g=1/7, 2g=2/7, ..., 7g=1).
|
|
3747
|
+
|
|
3748
|
+
Slice naming counts GPCs (compute slices). H100 and B200 both have 7 GPCs per GPU
|
|
3749
|
+
in the typical all-balanced profile, so a 1g slice is 1/7 of a GPU regardless of
|
|
3750
|
+
family. Used to size CPU/memory requests proportional to the GPU fraction the pod
|
|
3751
|
+
actually consumes — the older `gpu_count/max_gpus` ratio over-claimed node resources
|
|
3752
|
+
(a 1g slice would claim 1/4 or 1/16 of the host instead of 1/56).
|
|
3753
|
+
"""
|
|
3754
|
+
if "mig" not in gpu_type:
|
|
3755
|
+
return 1.0
|
|
3756
|
+
try:
|
|
3757
|
+
slices = int(gpu_type.split("-mig-")[1].rstrip("g"))
|
|
3758
|
+
except (IndexError, ValueError):
|
|
3759
|
+
return 1.0
|
|
3760
|
+
return slices / 7.0
|
|
3761
|
+
|
|
3762
|
+
|
|
3763
|
+
# Number of full GPUs on the underlying instance — used to convert the slice fraction
|
|
3764
|
+
# into a fraction of the host's CPU/memory. Both p5.48xlarge (H100) and p6-b200.48xlarge
|
|
3765
|
+
# (B200) have 8 GPUs, which matches every MIG-capable instance type we currently run.
|
|
3766
|
+
_FULL_GPUS_PER_MIG_NODE = 8
|
|
3767
|
+
|
|
3768
|
+
|
|
3715
3769
|
def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool = False) -> dict:
|
|
3716
3770
|
"""Get resource limits for pod based on GPU type and deployment mode"""
|
|
3717
3771
|
gpu_count = int(gpu_count)
|
|
@@ -3731,13 +3785,19 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
|
|
|
3731
3785
|
resource_name = config.get("k8s_resource", "nvidia.com/gpu")
|
|
3732
3786
|
limits[resource_name] = str(gpu_count)
|
|
3733
3787
|
|
|
3734
|
-
|
|
3735
|
-
|
|
3736
|
-
|
|
3737
|
-
|
|
3738
|
-
|
|
3739
|
-
|
|
3740
|
-
|
|
3788
|
+
if "mig" in gpu_type:
|
|
3789
|
+
# Scale by GPC fraction (slice of one GPU), not slice count over max slices.
|
|
3790
|
+
slice_fraction = _mig_slice_fraction(gpu_type)
|
|
3791
|
+
cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
|
|
3792
|
+
mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
|
|
3793
|
+
fractional_cpu = cpu_per_full_gpu * slice_fraction * gpu_count
|
|
3794
|
+
proportional_cpu_limit = max(1, min(config["cpus"], int(fractional_cpu * 1.5)))
|
|
3795
|
+
proportional_memory_limit = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count))
|
|
3796
|
+
else:
|
|
3797
|
+
gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
|
|
3798
|
+
fractional_cpu = config["cpus"] * gpu_ratio
|
|
3799
|
+
proportional_cpu_limit = min(config["cpus"], int(fractional_cpu * 1.5))
|
|
3800
|
+
proportional_memory_limit = int(config["memory_gb"] * gpu_ratio)
|
|
3741
3801
|
|
|
3742
3802
|
limits.update({
|
|
3743
3803
|
"cpu": str(proportional_cpu_limit),
|
|
@@ -3777,13 +3837,16 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
|
|
|
3777
3837
|
if gpu_count > 0:
|
|
3778
3838
|
resource_name = config.get("k8s_resource", "nvidia.com/gpu")
|
|
3779
3839
|
requests[resource_name] = str(gpu_count)
|
|
3780
|
-
|
|
3781
|
-
|
|
3782
|
-
|
|
3783
|
-
|
|
3784
|
-
|
|
3785
|
-
|
|
3786
|
-
|
|
3840
|
+
if "mig" in gpu_type:
|
|
3841
|
+
slice_fraction = _mig_slice_fraction(gpu_type)
|
|
3842
|
+
cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
|
|
3843
|
+
mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
|
|
3844
|
+
proportional_cpu_request = max(1, int(cpu_per_full_gpu * slice_fraction * gpu_count * 0.9))
|
|
3845
|
+
proportional_memory_request = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count * 0.9))
|
|
3846
|
+
else:
|
|
3847
|
+
gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
|
|
3848
|
+
proportional_cpu_request = int(config["cpus"] * gpu_ratio * 0.9)
|
|
3849
|
+
proportional_memory_request = int(config["memory_gb"] * gpu_ratio * 0.9)
|
|
3787
3850
|
|
|
3788
3851
|
requests.update({
|
|
3789
3852
|
"cpu": str(proportional_cpu_request),
|
|
@@ -3886,6 +3949,30 @@ def get_nccl_env_vars(gpu_type: str) -> list:
|
|
|
3886
3949
|
return env_vars
|
|
3887
3950
|
|
|
3888
3951
|
|
|
3952
|
+
def _get_multinode_env_vars(peer_pods: list, rank: int) -> list:
|
|
3953
|
+
"""Build env vars exposing peer hostnames/rank/master to the pod.
|
|
3954
|
+
|
|
3955
|
+
Hostnames use the per-pod headless service we already create elsewhere, so they
|
|
3956
|
+
resolve to the current pod IP via cluster DNS even if a pod is recreated. We
|
|
3957
|
+
don\'t inject IPs at pod-creation time (they aren\'t known until kube schedules
|
|
3958
|
+
everyone) — the bashrc/zshrc helper resolves and exports MULTINODE_IPS at shell
|
|
3959
|
+
start, and a /usr/local/bin/multinode-ips helper is available for non-interactive
|
|
3960
|
+
callers.
|
|
3961
|
+
"""
|
|
3962
|
+
if not peer_pods or len(peer_pods) <= 1:
|
|
3963
|
+
return []
|
|
3964
|
+
namespace = "gpu-dev"
|
|
3965
|
+
hosts = [f"{p}-headless.{namespace}.svc.cluster.local" for p in peer_pods]
|
|
3966
|
+
return [
|
|
3967
|
+
client.V1EnvVar(name="MULTINODE_HOSTS", value=",".join(hosts)),
|
|
3968
|
+
client.V1EnvVar(name="MULTINODE_PEER_PODS", value=",".join(peer_pods)),
|
|
3969
|
+
client.V1EnvVar(name="MULTINODE_RANK", value=str(rank)),
|
|
3970
|
+
client.V1EnvVar(name="MULTINODE_SIZE", value=str(len(peer_pods))),
|
|
3971
|
+
client.V1EnvVar(name="MASTER_ADDR", value=hosts[0]),
|
|
3972
|
+
client.V1EnvVar(name="MASTER_PORT", value="29500"),
|
|
3973
|
+
]
|
|
3974
|
+
|
|
3975
|
+
|
|
3889
3976
|
def create_pod(
|
|
3890
3977
|
k8s_client,
|
|
3891
3978
|
pod_name: str,
|
|
@@ -3902,6 +3989,9 @@ def create_pod(
|
|
|
3902
3989
|
dockerfile_base64_data: str = None,
|
|
3903
3990
|
dockerimage: str = None,
|
|
3904
3991
|
target_az: str = None,
|
|
3992
|
+
target_node: str = None,
|
|
3993
|
+
multinode_peer_pods: list = None,
|
|
3994
|
+
multinode_rank: int = 0,
|
|
3905
3995
|
preserve_entrypoint: bool = False,
|
|
3906
3996
|
node_labels: dict = None,
|
|
3907
3997
|
trace_data: dict = None,
|
|
@@ -4386,6 +4476,16 @@ EOF_PROFILE
|
|
|
4386
4476
|
# User identification
|
|
4387
4477
|
export GPU_DEV_USER_ID="{user_id or 'dev'}"
|
|
4388
4478
|
|
|
4479
|
+
# Multinode peer info — inlined from container env at pod startup. sshd strips
|
|
4480
|
+
# container env vars from login shells, so we materialize the values into rc files.
|
|
4481
|
+
# Skipped (empty exports) for single-node reservations where MULTINODE_* aren't set.
|
|
4482
|
+
export MULTINODE_HOSTS="$MULTINODE_HOSTS"
|
|
4483
|
+
export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
|
|
4484
|
+
export MULTINODE_RANK="$MULTINODE_RANK"
|
|
4485
|
+
export MULTINODE_SIZE="$MULTINODE_SIZE"
|
|
4486
|
+
export MASTER_ADDR="$MASTER_ADDR"
|
|
4487
|
+
export MASTER_PORT="$MASTER_PORT"
|
|
4488
|
+
|
|
4389
4489
|
# Function to check for GPU reservation expiry warnings and startup script status
|
|
4390
4490
|
check_warnings() {{
|
|
4391
4491
|
# Check for startup script still running
|
|
@@ -4404,6 +4504,22 @@ check_warnings() {{
|
|
|
4404
4504
|
|
|
4405
4505
|
# Run warning check before every command prompt
|
|
4406
4506
|
PROMPT_COMMAND="check_warnings; \$PROMPT_COMMAND"
|
|
4507
|
+
|
|
4508
|
+
# Multinode peer IP resolution: MULTINODE_HOSTS is baked at pod creation, but per-pod
|
|
4509
|
+
# IPs are only known once kube schedules them. Resolve at shell start so users can do
|
|
4510
|
+
# torchrun --master_addr=\$MASTER_ADDR or mpirun -H "\$MULTINODE_IPS" without extra steps.
|
|
4511
|
+
if [ -n "\$MULTINODE_HOSTS" ]; then
|
|
4512
|
+
_MULTINODE_IPS=""
|
|
4513
|
+
for _h in \$(echo "\$MULTINODE_HOSTS" | tr ',' ' '); do
|
|
4514
|
+
_ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
|
|
4515
|
+
if [ -n "\$_ip" ]; then
|
|
4516
|
+
_MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
|
|
4517
|
+
fi
|
|
4518
|
+
done
|
|
4519
|
+
export MULTINODE_IPS="\$_MULTINODE_IPS"
|
|
4520
|
+
[ -n "\$MULTINODE_IPS" ] && export MASTER_IP=\$(echo "\$MULTINODE_IPS" | cut -d, -f1)
|
|
4521
|
+
unset _MULTINODE_IPS _h _ip
|
|
4522
|
+
fi
|
|
4407
4523
|
EOF_BASHRC_EXT
|
|
4408
4524
|
|
|
4409
4525
|
cat > /home/dev/.zshrc_ext << EOF_ZSHRC_EXT
|
|
@@ -4414,6 +4530,15 @@ EOF_BASHRC_EXT
|
|
|
4414
4530
|
# User identification
|
|
4415
4531
|
export GPU_DEV_USER_ID="{user_id or 'dev'}"
|
|
4416
4532
|
|
|
4533
|
+
# Multinode peer info — inlined from container env at pod startup. sshd strips
|
|
4534
|
+
# container env vars from login shells, so we materialize the values into rc files.
|
|
4535
|
+
export MULTINODE_HOSTS="$MULTINODE_HOSTS"
|
|
4536
|
+
export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
|
|
4537
|
+
export MULTINODE_RANK="$MULTINODE_RANK"
|
|
4538
|
+
export MULTINODE_SIZE="$MULTINODE_SIZE"
|
|
4539
|
+
export MASTER_ADDR="$MASTER_ADDR"
|
|
4540
|
+
export MASTER_PORT="$MASTER_PORT"
|
|
4541
|
+
|
|
4417
4542
|
# Function to check for GPU reservation expiry warnings and startup script status
|
|
4418
4543
|
check_warnings() {{
|
|
4419
4544
|
# Check for startup script still running
|
|
@@ -4433,6 +4558,20 @@ check_warnings() {{
|
|
|
4433
4558
|
|
|
4434
4559
|
# Run warning check before every command prompt (zsh hook)
|
|
4435
4560
|
precmd() {{ check_warnings }}
|
|
4561
|
+
|
|
4562
|
+
# Multinode peer IP resolution (see .bashrc_ext for rationale)
|
|
4563
|
+
if [[ -n "\$MULTINODE_HOSTS" ]]; then
|
|
4564
|
+
_MULTINODE_IPS=""
|
|
4565
|
+
for _h in \${{(s:,:)MULTINODE_HOSTS}}; do
|
|
4566
|
+
_ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
|
|
4567
|
+
if [[ -n "\$_ip" ]]; then
|
|
4568
|
+
_MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
|
|
4569
|
+
fi
|
|
4570
|
+
done
|
|
4571
|
+
export MULTINODE_IPS="\$_MULTINODE_IPS"
|
|
4572
|
+
[[ -n "\$MULTINODE_IPS" ]] && export MASTER_IP="\${{MULTINODE_IPS%%,*}}"
|
|
4573
|
+
unset _MULTINODE_IPS _h _ip
|
|
4574
|
+
fi
|
|
4436
4575
|
EOF_ZSHRC_EXT
|
|
4437
4576
|
|
|
4438
4577
|
chown 1081:1081 /home/dev/.bashrc_ext /home/dev/.zshrc_ext
|
|
@@ -5163,7 +5302,7 @@ EOF
|
|
|
5163
5302
|
client.V1EnvVar(
|
|
5164
5303
|
name="NVIDIA_DRIVER_CAPABILITIES", value="compute,utility"
|
|
5165
5304
|
)
|
|
5166
|
-
] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type),
|
|
5305
|
+
] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
|
|
5167
5306
|
resources=client.V1ResourceRequirements(
|
|
5168
5307
|
limits=get_pod_resource_limits(
|
|
5169
5308
|
gpu_count, gpu_type, is_multinode),
|
|
@@ -5309,7 +5448,12 @@ EOF
|
|
|
5309
5448
|
] if _pod_uses_efa(gpu_count, gpu_type, is_multinode) else []),
|
|
5310
5449
|
node_selector={
|
|
5311
5450
|
"GpuType": get_node_gpu_type(gpu_type),
|
|
5312
|
-
**({} if target_az is None else {"topology.kubernetes.io/zone": target_az})
|
|
5451
|
+
**({} if target_az is None else {"topology.kubernetes.io/zone": target_az}),
|
|
5452
|
+
# Hard-pin to the binpacked node when Lambda picked one. Lambda runs
|
|
5453
|
+
# serialized (reserved_concurrent_executions=1), so allocations seen by the
|
|
5454
|
+
# next invocation include this pod. If the node is unavailable, the pod
|
|
5455
|
+
# stays Pending and surfaces the error rather than spreading.
|
|
5456
|
+
**({} if target_node is None else {"kubernetes.io/hostname": target_node}),
|
|
5313
5457
|
},
|
|
5314
5458
|
# Node affinity for profiling-dedicated preference
|
|
5315
5459
|
# If user requests nsight=true, prefer profiling-dedicated nodes
|
|
@@ -6303,6 +6447,9 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
|
|
|
6303
6447
|
"nvidia.com/mig-3g.40gb": "h100-mig-3g",
|
|
6304
6448
|
"nvidia.com/mig-4g.40gb": "h100-mig-4g",
|
|
6305
6449
|
"nvidia.com/mig-7g.80gb": "h100-mig-7g",
|
|
6450
|
+
"nvidia.com/mig-1g.23gb": "b200-mig-1g",
|
|
6451
|
+
"nvidia.com/mig-2g.45gb": "b200-mig-2g",
|
|
6452
|
+
"nvidia.com/mig-3g.90gb": "b200-mig-3g",
|
|
6306
6453
|
}
|
|
6307
6454
|
if pod.spec.containers:
|
|
6308
6455
|
for c in pod.spec.containers:
|
|
@@ -31,9 +31,14 @@ def get_bearer_token() -> str:
|
|
|
31
31
|
"""
|
|
32
32
|
Create a k8s-aws-v1 bearer token by presigning STS:GetCallerIdentity.
|
|
33
33
|
IMPORTANT: base64url-encode the FULL presigned URL, then strip padding.
|
|
34
|
+
|
|
35
|
+
expires_in must match _EFFECTIVE_TOKEN_TTL: previously this was 60s while the cache
|
|
36
|
+
held the token for 14 min, so warm Lambda containers handed EKS expired URLs and got
|
|
37
|
+
401s for ~13 min until the next refresh. 900s is the typical EKS get-token default
|
|
38
|
+
and the max for IAM-role-derived presigned URLs.
|
|
34
39
|
"""
|
|
35
40
|
logger.info("Starting bearer token generation")
|
|
36
|
-
STS_TOKEN_EXPIRES_IN = 60
|
|
41
|
+
STS_TOKEN_EXPIRES_IN = 900
|
|
37
42
|
session = boto3.session.Session(region_name=REGION)
|
|
38
43
|
logger.info(f"Created boto3 session for region {REGION}")
|
|
39
44
|
|
|
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
184
|
-
MIN_CLI_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.22"
|
|
184
|
+
MIN_CLI_VERSION = "0.5.16"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
187
|
}, local.alb_env_vars)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|