gpu-dev 0.5.16__tar.gz → 0.5.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +205 -4
  4. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/pyproject.toml +1 -1
  5. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_processor/index.py +173 -26
  6. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/k8s_client.py +6 -1
  7. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda.tf +2 -2
  8. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/.github/workflows/no-gitlinks.yml +0 -0
  9. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/.github/workflows/publish.yml +0 -0
  10. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/.gitignore +0 -0
  11. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/CLAUDE.md +0 -0
  12. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/PROGRESS.md +0 -0
  13. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/PR_DESCRIPTION.md +0 -0
  14. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/TODO.md +0 -0
  15. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/admin/README.md +0 -0
  16. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/admin/generate_stats.py +0 -0
  17. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/admin/requirements.txt +0 -0
  18. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/README.md +0 -0
  19. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  20. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  21. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  22. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  23. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  24. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  25. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  26. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  27. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  28. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  29. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  30. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  31. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  32. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  33. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  34. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  35. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/docs/USER_GUIDE.md +0 -0
  36. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/docs/devgpu-features.html +0 -0
  37. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/docs/docker-mark-blue.svg +0 -0
  38. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/docs/icons8-cursor-ai.svg +0 -0
  39. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/post.md +0 -0
  40. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/setup.cfg +0 -0
  41. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  42. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  43. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/README.md +0 -0
  44. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/alb.tf +0 -0
  45. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/availability.tf +0 -0
  46. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/backend.tf +0 -0
  47. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  48. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  49. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  50. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/bash_profile +0 -0
  51. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/bashrc +0 -0
  52. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  53. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  54. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  55. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  56. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/motd_script +0 -0
  57. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  58. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/profile +0 -0
  59. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  60. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  61. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  62. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/shell_env +0 -0
  63. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/ssh_config +0 -0
  64. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/zprofile +0 -0
  65. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/zshrc +0 -0
  66. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  67. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker-build.tf +0 -0
  68. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  69. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  70. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ecr.tf +0 -0
  71. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/efs.tf +0 -0
  72. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/eks.tf +0 -0
  73. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/expiry.tf +0 -0
  74. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/git-cache.tf +0 -0
  75. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/kubernetes.tf +0 -0
  76. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  77. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  78. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  79. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  80. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  81. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  82. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  83. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  84. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  85. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  86. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  87. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  88. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  89. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/main.tf +0 -0
  90. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/mig-config.tf +0 -0
  91. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  92. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  93. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  94. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  95. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  96. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  97. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/monitoring.tf +0 -0
  98. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/outputs.tf +0 -0
  99. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/pyproject.toml +0 -0
  100. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/queue.tf +0 -0
  101. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/route53.tf +0 -0
  102. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  103. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  104. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  105. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  106. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  107. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  108. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  109. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  110. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  111. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  112. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/switch-to.sh +0 -0
  113. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  114. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  115. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  116. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  117. {gpu_dev-0.5.16 → gpu_dev-0.5.18}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.16
3
+ Version: 0.5.18
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.16
3
+ Version: 0.5.18
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1349,6 +1349,205 @@ def reserve(
1349
1349
  rprint(f"[red]❌ Error: {str(e)}[/red]")
1350
1350
 
1351
1351
 
1352
# Per-node GPU capacity for every type `submit` accepts (mirrors gpu_configs in the
# reserve flow). This is the single source of truth: the `--gpu-type` choices below are
# derived from it, so the two can never drift apart. Insertion order is the order shown
# in `--help`. CPU-only types have 0 GPUs and are never treated as multinode.
_SUBMIT_MAX_GPUS_PER_NODE = {
    "b200": 8, "b200-mig-1g": 4, "b200-mig-2g": 2, "b200-mig-3g": 2,
    "h200": 8, "h100": 8,
    "h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8,
    "a100": 8, "rtxpro6000": 4,
    "a10g": 4, "t4": 4, "l4": 4, "t4-small": 1,
    "cpu-arm": 0, "cpu-x86": 0,
}
_SUBMIT_GPU_TYPES = list(_SUBMIT_MAX_GPUS_PER_NODE)


def _submit_exit_status(returncode: int) -> int:
    """Map a subprocess return code to a shell-style exit status.

    subprocess reports "killed by signal N" as -N; shells report it as 128+N.
    Non-negative codes are passed through unchanged.
    """
    return 128 - returncode if returncode < 0 else returncode


# NOTE: the docstring of `submit` is rendered by click as the command's --help text, so
# parameter documentation lives in the option `help=` strings and in comments instead.
@main.command(context_settings={"ignore_unknown_options": True})
@click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
@click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation duration ceiling (job auto-cancels on exit).")
@click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
@click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
@click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
              help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
@click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
@click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
@click.option("--name", type=str, default=None, help="Reservation name.")
@click.option("--timeout", type=int, default=20, show_default=True, help="Minutes to wait for the reservation to become active.")
@click.argument("command", nargs=-1, required=True)
@click.pass_context
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pull, keep_alive, name, timeout, command):
    """Submit a job: reserve, sync code, run, sync results back, auto-cancel.

    \b
    Examples:
      gpu-dev submit --runtime ./ -- python train.py
      gpu-dev submit --gpus 16 --gpu-type h100 --runtime . -- bash run.sh
      gpu-dev submit --keep-alive -- nvidia-smi

    The job runs on rank 0 (master pod). For multinode jobs, MULTINODE_HOSTS / RANK /
    SIZE / MASTER_ADDR / MASTER_PORT are exported on every pod so torchrun and friends
    work without manual wiring. Exit code mirrors the remote command's exit code.
    """
    # Exit statuses used below:
    #   2   - bad arguments, auth failure, reservation creation or upload failure,
    #         or an unexpected error
    #   1   - reservation never became active / master pod not reachable
    #   130 - interrupted with Ctrl-C
    #   otherwise the remote command's own status (128+N if the local ssh was
    #   killed by signal N)
    import shlex
    import subprocess
    import sys
    from pathlib import Path

    if not command:
        rprint("[red]❌ Provide a command after --, e.g. gpu-dev submit --runtime ./ -- python train.py[/red]")
        sys.exit(2)

    gt = gpu_type.lower()
    max_per_node = _SUBMIT_MAX_GPUS_PER_NODE.get(gt)
    if max_per_node is None:
        rprint(f"[red]❌ Unknown gpu-type '{gpu_type}'[/red]")
        sys.exit(2)

    # Reject nonsensical sizes *before* a reservation is created, so a typo never
    # costs the user a reservation that must be cleaned up afterwards.
    if gpus < 0 or (max_per_node > 0 and gpus == 0):
        rprint(f"[red]❌ --gpus must be at least 1 for {gt}[/red]")
        sys.exit(2)
    if hours <= 0:
        rprint("[red]❌ --hours must be greater than 0[/red]")
        sys.exit(2)

    # CPU types have max_per_node == 0 and are always single-node; the > 0 guard
    # also keeps the modulo below safe.
    is_multinode = max_per_node > 0 and gpus > max_per_node
    if is_multinode and gpus % max_per_node != 0:
        rprint(f"[red]❌ For multinode {gt}, --gpus must be a multiple of {max_per_node}[/red]")
        sys.exit(2)

    config = load_config()
    try:
        user_info = authenticate_user(config)
    except RuntimeError as e:
        rprint(f"[red]❌ {str(e)}[/red]")
        sys.exit(2)

    rm = ReservationManager(config)

    # Determine effective disk handling. Multinode: only master gets persistent disk; we always
    # SSH into rank 0, so passing --disk is fine.
    disk_name = None if no_persistent_disk else disk

    rprint(f"[cyan]🎫 Reserving {gpus}x {gpu_type.upper()} for up to {hours}h...[/cyan]")
    if is_multinode:
        reservation_ids = rm.create_multinode_reservation(
            user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
            duration_hours=hours, name=name, github_user=user_info["github_user"],
            no_persistent_disk=no_persistent_disk, disk_name=disk_name)
        if not reservation_ids:
            rprint("[red]❌ Failed to create multinode reservation[/red]")
            sys.exit(2)
        primary_id = reservation_ids[0]
    else:
        primary_id = rm.create_reservation(
            user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
            duration_hours=hours, name=name, github_user=user_info["github_user"],
            no_persistent_disk=no_persistent_disk, disk_name=disk_name)
        if not primary_id:
            rprint("[red]❌ Failed to create reservation[/red]")
            sys.exit(2)
        reservation_ids = [primary_id]

    short_id = primary_id[:8]
    cancelled = False

    def maybe_cancel(reason: str):
        """Cancel every reservation of this job once; no-op with --keep-alive."""
        nonlocal cancelled
        if cancelled or keep_alive:
            return
        cancelled = True
        rprint(f"[yellow]🛑 Cancelling reservation {short_id} ({reason})[/yellow]")
        for rid in reservation_ids:
            try:
                rm.cancel_reservation(rid, user_info["user_id"])
            except Exception as ce:
                # Best effort: keep cancelling the remaining nodes.
                rprint(f"[dim] cancel {rid[:8]} failed: {ce}[/dim]")

    def abort(message: str, reason: str, code: int):
        """Report a fatal error, release the reservation, and exit with `code`."""
        rprint(f"[red]❌ {message}[/red]")
        maybe_cancel(reason)
        sys.exit(code)

    # SystemExit raised inside this block propagates untouched: it is not an
    # Exception subclass, so only real errors reach the generic handler.
    try:
        rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active (up to {timeout}m)...[/cyan]")
        if is_multinode:
            active = rm.wait_for_multinode_reservation_completion(reservation_ids, timeout_minutes=timeout)
        else:
            active = rm.wait_for_reservation_completion(primary_id, timeout_minutes=timeout)
        if not active:
            abort("Reservation never became active", "activation timeout", 1)

        # Resolve master pod (rank 0)
        conn = rm.get_connection_info(primary_id, user_info["user_id"])
        if not conn:
            abort("Could not fetch connection info", "no connection info", 1)
        if conn.get("is_multinode"):
            nodes = sorted(conn["nodes"], key=lambda n: n.get("node_index", 0))
            master = nodes[0]
            master_id, master_pod = master["reservation_id"], master["pod_name"]
            master_fqdn, master_name = master.get("fqdn"), master.get("name")
        else:
            master_id, master_pod = primary_id, conn["pod_name"]
            master_fqdn, master_name = conn.get("fqdn"), conn.get("name")

        # Ensure SSH config exists
        config_file = Path.home() / ".gpu-dev" / f"{master_id[:8]}-sshconfig"
        if not config_file.exists():
            if not (master_fqdn and master_pod):
                abort("Master pod has no FQDN yet — can't SSH", "no fqdn", 1)
            create_ssh_config_for_reservation(master_fqdn, master_pod, master_id, master_name)

        ssh_alias = master_pod
        ssh_base = ["ssh", "-F", str(config_file), "-o", "StrictHostKeyChecking=accept-new"]
        # rsync takes its remote shell as ONE argument, so quote each piece.
        rsync_e = " ".join(shlex.quote(x) for x in ssh_base)

        # Working directory and rsync up
        if runtime:
            workdir = f"/workspace/submit-{master_id[:8]}"
            # Trailing slash on both sides: sync directory *contents*, not the directory.
            local_dir = f"{runtime.rstrip('/')}/"
            remote_dir = f"{ssh_alias}:{workdir}/"
            rprint(f"[cyan]📦 Syncing {runtime} → {ssh_alias}:{workdir}[/cyan]")
            made = subprocess.run(ssh_base + [ssh_alias, f"mkdir -p {shlex.quote(workdir)}"])
            if made.returncode != 0:
                abort("Failed to create remote workspace", "mkdir failed", 2)
            pushed = subprocess.run(["rsync", "-az", "--delete", "-e", rsync_e, local_dir, remote_dir])
            if pushed.returncode != 0:
                abort("Upload rsync failed", "upload failed", 2)
        else:
            workdir = "/home/dev"

        # Run remote command via login shell so MULTINODE_* etc. are loaded
        remote_cmd = " ".join(shlex.quote(c) for c in command)
        rprint(f"[cyan]🚀 Running on {ssh_alias}: {remote_cmd}[/cyan]\n")
        ssh_run = ssh_base + [ssh_alias,
                              f"cd {shlex.quote(workdir)} && bash -lc {shlex.quote(remote_cmd)}"]
        rc = _submit_exit_status(subprocess.call(ssh_run))
        rprint(f"\n[dim]Job exited with code {rc}[/dim]")

        # Sync back results before cancelling
        if runtime and not no_pull:
            rprint(f"[cyan]📥 Syncing {ssh_alias}:{workdir}/ → {runtime}[/cyan]")
            pull = subprocess.run(["rsync", "-az", "-e", rsync_e, remote_dir, local_dir])
            if pull.returncode != 0:
                rprint(f"[yellow]⚠️ Result rsync exited with {pull.returncode} — your output may be incomplete[/yellow]")

        maybe_cancel("job complete")
        sys.exit(rc)

    except KeyboardInterrupt:
        rprint("\n[yellow]Interrupted — cancelling[/yellow]")
        maybe_cancel("user interrupt")
        sys.exit(130)
    except Exception as e:
        rprint(f"[red]❌ Submit error: {e}[/red]")
        maybe_cancel("submit error")
        sys.exit(2)
1549
+
1550
+
1352
1551
  @main.command()
1353
1552
  @click.option(
1354
1553
  "--user",
@@ -1542,13 +1741,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1542
1741
  if "@" in user_id:
1543
1742
  user_display = user_id.split("@")[0]
1544
1743
 
1545
- # Format GPU information
1744
+ # Format GPU information (MIG-friendly via _format_gpu_display)
1546
1745
  if gpu_type and gpu_type not in ["unknown", "Unknown"]:
1547
- # For CPU nodes (gpu_count = 0), show just the type
1548
1746
  if gpu_count == 0:
1549
1747
  gpu_display = gpu_type
1550
1748
  else:
1551
- gpu_display = f"{gpu_count}x {gpu_type}"
1749
+ gpu_display = _format_gpu_display(gpu_count, gpu_type)
1552
1750
  else:
1553
1751
  gpu_display = str(gpu_count)
1554
1752
 
@@ -1844,7 +2042,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1844
2042
  if gpu_count == 0:
1845
2043
  gpu_display = gpu_type
1846
2044
  else:
1847
- gpu_display = f"{gpu_count}x {gpu_type}"
2045
+ gpu_display = _format_gpu_display(gpu_count, gpu_type)
1848
2046
  else:
1849
2047
  gpu_display = str(gpu_count)
1850
2048
 
@@ -2417,6 +2615,9 @@ def _format_gpu_display(gpu_count, gpu_type):
2417
2615
  "h100-mig-3g": "40GB H100 (MIG)",
2418
2616
  "h100-mig-4g": "40GB H100 (MIG)",
2419
2617
  "h100-mig-7g": "80GB H100 (MIG)",
2618
+ "b200-mig-1g": "23GB B200 (MIG)",
2619
+ "b200-mig-2g": "45GB B200 (MIG)",
2620
+ "b200-mig-3g": "90GB B200 (MIG)",
2420
2621
  }
2421
2622
  if gt_lower in mig_friendly:
2422
2623
  return f"{gpu_count}× {mig_friendly[gt_lower]}"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.16"
7
+ version = "0.5.18"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -308,30 +308,35 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
308
308
  f"Node {node.metadata.name} in {node_az}: {available_gpus} available GPUs")
309
309
 
310
310
  if candidate_nodes:
311
- # Return the AZ of the first suitable node (Kubernetes scheduler will make the final decision)
311
+ # Binpacking: pack into the most-loaded node that still fits the request.
312
+ # Sort by free GPUs ASC so the fullest node comes first; ties broken by node name
313
+ # so the choice is deterministic across Lambda invocations.
314
+ candidate_nodes.sort(key=lambda n: (n['available_gpus'], n['node_name']))
312
315
  selected_node = candidate_nodes[0]
313
316
  target_az = selected_node['az']
317
+ target_node = selected_node['node_name']
314
318
  logger.info(
315
- f"Target AZ for {gpu_type} reservation: {target_az} (node: {selected_node['node_name']})")
316
- return target_az
319
+ f"Binpacked target for {gpu_type} {gpus_requested}gpu: "
320
+ f"node={target_node} az={target_az} free={selected_node['available_gpus']} "
321
+ f"(candidates considered: {len(candidate_nodes)})")
322
+ return target_az, target_node
317
323
 
318
324
  if all_ready_nodes:
319
- # No single node has enough GPUs, but nodes exist — return AZ of the node
320
- # with the most available GPUs so the disk is created in the right AZ
325
+ # No single node has enough GPUs — return AZ of the node with the most available GPUs
326
+ # so disk lands in the right AZ. No node hint (pod will Pending until something frees up).
321
327
  best_node = max(all_ready_nodes, key=lambda n: n['available_gpus'])
322
328
  target_az = best_node['az']
323
329
  logger.info(
324
330
  f"No single node has {gpus_requested} {gpu_type} GPUs, "
325
331
  f"but {len(all_ready_nodes)} nodes exist. Using AZ {target_az} "
326
332
  f"from node {best_node['node_name']} ({best_node['available_gpus']} GPUs available)")
327
- return target_az
333
+ return target_az, None
328
334
 
329
335
  logger.warning(f"No ready/schedulable {gpu_type} nodes found in cluster")
330
336
  return None, None
331
337
 
332
338
  except Exception as e:
333
339
  logger.error(f"Error determining target AZ for {gpu_type}: {str(e)}")
334
- # Fallback to primary AZ if detection fails (no node hint — let k8s pick).
335
340
  return PRIMARY_AVAILABILITY_ZONE, None
336
341
 
337
342
 
@@ -1418,6 +1423,11 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
1418
1423
  logger.info(
1419
1424
  f"Starting parallel processing for {total_nodes} nodes")
1420
1425
 
1426
+ # Deterministic peer pod names by node_index so MULTINODE_RANK aligns with the
1427
+ # position of this pod in MULTINODE_HOSTS across all replicas.
1428
+ nodes_sorted = sorted(nodes, key=lambda n: int(n.get("node_index", 0)))
1429
+ peer_pod_names = [f"gpu-dev-{n['reservation_id'][:8]}" for n in nodes_sorted]
1430
+
1421
1431
  def process_single_node(node_data):
1422
1432
  """Process a single node - to be run in parallel"""
1423
1433
  i, node = node_data
@@ -1430,7 +1440,8 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
1430
1440
  'action': 'process_multinode_individual',
1431
1441
  'node_index': int(node_index),
1432
1442
  'total_nodes': int(total_nodes),
1433
- 'master_reservation_id': str(master_reservation_id)
1443
+ 'master_reservation_id': str(master_reservation_id),
1444
+ 'multinode_peer_pods': peer_pod_names,
1434
1445
  }
1435
1446
 
1436
1447
  logger.info(
@@ -1536,6 +1547,12 @@ def process_multinode_individual_node(message_body: dict) -> bool:
1536
1547
 
1537
1548
  node_data = response["Item"]
1538
1549
 
1550
+ # Forward peer pod list from coordinator into request dict so create_pod can
1551
+ # bake MULTINODE_HOSTS / MASTER_ADDR / MULTINODE_RANK env vars into the pod.
1552
+ peer_pods = message_body.get("multinode_peer_pods")
1553
+ if peer_pods:
1554
+ node_data["multinode_peer_pods"] = peer_pods
1555
+
1539
1556
  # Update status to preparing pod
1540
1557
  update_multinode_pod_status(
1541
1558
  reservation_id, "preparing pod", node_index, total_nodes)
@@ -2722,6 +2739,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2722
2739
  persistent_volume_id = None
2723
2740
  device_name = None
2724
2741
  target_az = None # Initialize target_az for use in connection info update
2742
+ target_node = None # Initialize target_node (binpacking hostname pin) for create_pod
2725
2743
  is_new_disk = False # Initialize is_new_disk for all code paths
2726
2744
 
2727
2745
  # If we're using persistent disk, immediately mark this reservation as having a volume
@@ -2749,8 +2767,8 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2749
2767
  detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else "")
2750
2768
  )
2751
2769
 
2752
- # Determine target AZ for this reservation
2753
- target_az = get_target_az_for_reservation(gpu_type, gpu_count)
2770
+ # Determine target AZ + node for this reservation (binpacking)
2771
+ target_az, target_node = get_target_az_for_reservation(gpu_type, gpu_count)
2754
2772
  if not target_az:
2755
2773
  raise ValueError(f"No {gpu_type} nodes found in cluster")
2756
2774
 
@@ -2881,6 +2899,9 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2881
2899
  dockerfile_base64_data=dockerfile_base64_data,
2882
2900
  dockerimage=dockerimage,
2883
2901
  target_az=target_az,
2902
+ target_node=target_node,
2903
+ multinode_peer_pods=request.get("multinode_peer_pods"),
2904
+ multinode_rank=int(request.get("node_index", 0)) if is_multinode else 0,
2884
2905
  preserve_entrypoint=preserve_entrypoint,
2885
2906
  node_labels=node_labels,
2886
2907
  trace_data=trace_data,
@@ -3421,6 +3442,9 @@ def create_kubernetes_resources(
3421
3442
  recreate_env: bool = False,
3422
3443
  efs_filesystem_id: str = None,
3423
3444
  is_multinode: bool = False,
3445
+ target_node: str = None,
3446
+ multinode_peer_pods: list = None,
3447
+ multinode_rank: int = 0,
3424
3448
  dockerfile_base64_data: str = None,
3425
3449
  dockerimage: str = None,
3426
3450
  target_az: str = None,
@@ -3524,6 +3548,9 @@ def create_kubernetes_resources(
3524
3548
  dockerfile_base64_data=dockerfile_base64_data,
3525
3549
  dockerimage=dockerimage,
3526
3550
  target_az=target_az,
3551
+ target_node=target_node,
3552
+ multinode_peer_pods=multinode_peer_pods,
3553
+ multinode_rank=multinode_rank,
3527
3554
  preserve_entrypoint=preserve_entrypoint,
3528
3555
  node_labels=node_labels,
3529
3556
  trace_data=trace_data,
@@ -3610,6 +3637,9 @@ def create_kubernetes_resources(
3610
3637
  dockerfile_base64_data=dockerfile_base64_data,
3611
3638
  dockerimage=dockerimage,
3612
3639
  target_az=target_az,
3640
+ target_node=target_node,
3641
+ multinode_peer_pods=multinode_peer_pods,
3642
+ multinode_rank=multinode_rank,
3613
3643
  preserve_entrypoint=preserve_entrypoint,
3614
3644
  node_labels=node_labels,
3615
3645
  trace_data=trace_data,
@@ -3712,6 +3742,30 @@ def find_available_node_port(k8s_client) -> int:
3712
3742
  return random.randint(30000, 32767)
3713
3743
 
3714
3744
 
3745
+ def _mig_slice_fraction(gpu_type: str) -> float:
3746
+ """For MIG SKUs return slice fraction of a single GPU (1g=1/7, 2g=2/7, ..., 7g=1).
3747
+
3748
+ Slice naming counts GPCs (compute slices). H100 and B200 both have 7 GPCs per GPU
3749
+ in the typical all-balanced profile, so a 1g slice is 1/7 of a GPU regardless of
3750
+ family. Used to size CPU/memory requests proportional to the GPU fraction the pod
3751
+ actually consumes — the older `gpu_count/max_gpus` ratio over-claimed node resources
3752
+ (a 1g slice would claim 1/4 or 1/16 of the host instead of 1/56).
3753
+ """
3754
+ if "mig" not in gpu_type:
3755
+ return 1.0
3756
+ try:
3757
+ slices = int(gpu_type.split("-mig-")[1].rstrip("g"))
3758
+ except (IndexError, ValueError):
3759
+ return 1.0
3760
+ return slices / 7.0
3761
+
3762
+
3763
+ # Number of full GPUs on the underlying instance — used to convert the slice fraction
3764
+ # into a fraction of the host's CPU/memory. Both p5.48xlarge (H100) and p6-b200.48xlarge
3765
+ # (B200) have 8 GPUs, which matches every MIG-capable instance type we currently run.
3766
+ _FULL_GPUS_PER_MIG_NODE = 8
3767
+
3768
+
3715
3769
  def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool = False) -> dict:
3716
3770
  """Get resource limits for pod based on GPU type and deployment mode"""
3717
3771
  gpu_count = int(gpu_count)
@@ -3731,13 +3785,19 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
3731
3785
  resource_name = config.get("k8s_resource", "nvidia.com/gpu")
3732
3786
  limits[resource_name] = str(gpu_count)
3733
3787
 
3734
- gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3735
-
3736
- # Calculate proportional limits with CPU overprovisioning for burst capacity
3737
- # Give 1.5x CPU limit to allow burst, capped at node total
3738
- fractional_cpu = config["cpus"] * gpu_ratio
3739
- proportional_cpu_limit = min(config["cpus"], int(fractional_cpu * 1.5))
3740
- proportional_memory_limit = int(config["memory_gb"] * gpu_ratio)
3788
+ if "mig" in gpu_type:
3789
+ # Scale by GPC fraction (slice of one GPU), not slice count over max slices.
3790
+ slice_fraction = _mig_slice_fraction(gpu_type)
3791
+ cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
3792
+ mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
3793
+ fractional_cpu = cpu_per_full_gpu * slice_fraction * gpu_count
3794
+ proportional_cpu_limit = max(1, min(config["cpus"], int(fractional_cpu * 1.5)))
3795
+ proportional_memory_limit = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count))
3796
+ else:
3797
+ gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3798
+ fractional_cpu = config["cpus"] * gpu_ratio
3799
+ proportional_cpu_limit = min(config["cpus"], int(fractional_cpu * 1.5))
3800
+ proportional_memory_limit = int(config["memory_gb"] * gpu_ratio)
3741
3801
 
3742
3802
  limits.update({
3743
3803
  "cpu": str(proportional_cpu_limit),
@@ -3777,13 +3837,16 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
3777
3837
  if gpu_count > 0:
3778
3838
  resource_name = config.get("k8s_resource", "nvidia.com/gpu")
3779
3839
  requests[resource_name] = str(gpu_count)
3780
- gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3781
-
3782
- # Calculate proportional requests (reserve 10% for system overhead)
3783
- # This ensures requests don't exceed node allocatable resources
3784
- # Limits can be higher for burst capacity (Burstable QoS)
3785
- proportional_cpu_request = int(config["cpus"] * gpu_ratio * 0.9)
3786
- proportional_memory_request = int(config["memory_gb"] * gpu_ratio * 0.9)
3840
+ if "mig" in gpu_type:
3841
+ slice_fraction = _mig_slice_fraction(gpu_type)
3842
+ cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
3843
+ mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
3844
+ proportional_cpu_request = max(1, int(cpu_per_full_gpu * slice_fraction * gpu_count * 0.9))
3845
+ proportional_memory_request = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count * 0.9))
3846
+ else:
3847
+ gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3848
+ proportional_cpu_request = int(config["cpus"] * gpu_ratio * 0.9)
3849
+ proportional_memory_request = int(config["memory_gb"] * gpu_ratio * 0.9)
3787
3850
 
3788
3851
  requests.update({
3789
3852
  "cpu": str(proportional_cpu_request),
@@ -3886,6 +3949,30 @@ def get_nccl_env_vars(gpu_type: str) -> list:
3886
3949
  return env_vars
3887
3950
 
3888
3951
 
3952
def _get_multinode_env_vars(
    peer_pods: list,
    rank: int,
    namespace: str = "gpu-dev",
    master_port: int = 29500,
) -> list:
    """Build env vars exposing peer hostnames/rank/master to the pod.

    Hostnames use the per-pod headless service we already create elsewhere, so they
    resolve to the current pod IP via cluster DNS even if a pod is recreated. We
    don't inject IPs at pod-creation time (they aren't known until kube schedules
    everyone) — the bashrc/zshrc helper resolves and exports MULTINODE_IPS at shell
    start, and a /usr/local/bin/multinode-ips helper is available for non-interactive
    callers.

    Args:
        peer_pods: Pod names of every member of the multinode reservation, in rank
            order; the first entry is used as the master.
        rank: Rank of the pod these env vars are being built for.
        namespace: Kubernetes namespace used to build the headless-service DNS names.
        master_port: Rendezvous port exported as MASTER_PORT.

    Returns:
        A list of ``client.V1EnvVar`` objects, or an empty list when fewer than two
        peer pods are given (single-node reservation).
    """
    # None, empty, and single-pod reservations get no multinode environment at all.
    if not peer_pods or len(peer_pods) < 2:
        return []

    hosts = [f"{pod}-headless.{namespace}.svc.cluster.local" for pod in peer_pods]

    # Ordered (name, value) pairs; every value must be a string for the k8s API.
    env_pairs = [
        ("MULTINODE_HOSTS", ",".join(hosts)),
        ("MULTINODE_PEER_PODS", ",".join(peer_pods)),
        ("MULTINODE_RANK", str(rank)),
        ("MULTINODE_SIZE", str(len(peer_pods))),
        ("MASTER_ADDR", hosts[0]),
        ("MASTER_PORT", str(master_port)),
    ]
    return [client.V1EnvVar(name=name, value=value) for name, value in env_pairs]
3974
+
3975
+
3889
3976
  def create_pod(
3890
3977
  k8s_client,
3891
3978
  pod_name: str,
@@ -3902,6 +3989,9 @@ def create_pod(
3902
3989
  dockerfile_base64_data: str = None,
3903
3990
  dockerimage: str = None,
3904
3991
  target_az: str = None,
3992
+ target_node: str = None,
3993
+ multinode_peer_pods: list = None,
3994
+ multinode_rank: int = 0,
3905
3995
  preserve_entrypoint: bool = False,
3906
3996
  node_labels: dict = None,
3907
3997
  trace_data: dict = None,
@@ -4386,6 +4476,16 @@ EOF_PROFILE
4386
4476
  # User identification
4387
4477
  export GPU_DEV_USER_ID="{user_id or 'dev'}"
4388
4478
 
4479
+ # Multinode peer info — inlined from container env at pod startup. sshd strips
4480
+ # container env vars from login shells, so we materialize the values into rc files.
4481
+ # Skipped (empty exports) for single-node reservations where MULTINODE_* aren't set.
4482
+ export MULTINODE_HOSTS="$MULTINODE_HOSTS"
4483
+ export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
4484
+ export MULTINODE_RANK="$MULTINODE_RANK"
4485
+ export MULTINODE_SIZE="$MULTINODE_SIZE"
4486
+ export MASTER_ADDR="$MASTER_ADDR"
4487
+ export MASTER_PORT="$MASTER_PORT"
4488
+
4389
4489
  # Function to check for GPU reservation expiry warnings and startup script status
4390
4490
  check_warnings() {{
4391
4491
  # Check for startup script still running
@@ -4404,6 +4504,22 @@ check_warnings() {{
4404
4504
 
4405
4505
  # Run warning check before every command prompt
4406
4506
  PROMPT_COMMAND="check_warnings; \$PROMPT_COMMAND"
4507
+
4508
+ # Multinode peer IP resolution: MULTINODE_HOSTS is baked at pod creation, but per-pod
4509
+ # IPs are only known once kube schedules them. Resolve at shell start so users can do
4510
+ # torchrun --master_addr=\$MASTER_ADDR or mpirun -H "\$MULTINODE_IPS" without extra steps.
4511
+ if [ -n "\$MULTINODE_HOSTS" ]; then
4512
+ _MULTINODE_IPS=""
4513
+ for _h in \$(echo "\$MULTINODE_HOSTS" | tr ',' ' '); do
4514
+ _ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
4515
+ if [ -n "\$_ip" ]; then
4516
+ _MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
4517
+ fi
4518
+ done
4519
+ export MULTINODE_IPS="\$_MULTINODE_IPS"
4520
+ [ -n "\$MULTINODE_IPS" ] && export MASTER_IP=\$(echo "\$MULTINODE_IPS" | cut -d, -f1)
4521
+ unset _MULTINODE_IPS _h _ip
4522
+ fi
4407
4523
  EOF_BASHRC_EXT
4408
4524
 
4409
4525
  cat > /home/dev/.zshrc_ext << EOF_ZSHRC_EXT
@@ -4414,6 +4530,15 @@ EOF_BASHRC_EXT
4414
4530
  # User identification
4415
4531
  export GPU_DEV_USER_ID="{user_id or 'dev'}"
4416
4532
 
4533
+ # Multinode peer info — inlined from container env at pod startup. sshd strips
4534
+ # container env vars from login shells, so we materialize the values into rc files.
4535
+ export MULTINODE_HOSTS="$MULTINODE_HOSTS"
4536
+ export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
4537
+ export MULTINODE_RANK="$MULTINODE_RANK"
4538
+ export MULTINODE_SIZE="$MULTINODE_SIZE"
4539
+ export MASTER_ADDR="$MASTER_ADDR"
4540
+ export MASTER_PORT="$MASTER_PORT"
4541
+
4417
4542
  # Function to check for GPU reservation expiry warnings and startup script status
4418
4543
  check_warnings() {{
4419
4544
  # Check for startup script still running
@@ -4433,6 +4558,20 @@ check_warnings() {{
4433
4558
 
4434
4559
  # Run warning check before every command prompt (zsh hook)
4435
4560
  precmd() {{ check_warnings }}
4561
+
4562
+ # Multinode peer IP resolution (see .bashrc_ext for rationale)
4563
+ if [[ -n "\$MULTINODE_HOSTS" ]]; then
4564
+ _MULTINODE_IPS=""
4565
+ for _h in \${{(s:,:)MULTINODE_HOSTS}}; do
4566
+ _ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
4567
+ if [[ -n "\$_ip" ]]; then
4568
+ _MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
4569
+ fi
4570
+ done
4571
+ export MULTINODE_IPS="\$_MULTINODE_IPS"
4572
+ [[ -n "\$MULTINODE_IPS" ]] && export MASTER_IP="\${{MULTINODE_IPS%%,*}}"
4573
+ unset _MULTINODE_IPS _h _ip
4574
+ fi
4436
4575
  EOF_ZSHRC_EXT
4437
4576
 
4438
4577
  chown 1081:1081 /home/dev/.bashrc_ext /home/dev/.zshrc_ext
@@ -5163,7 +5302,7 @@ EOF
5163
5302
  client.V1EnvVar(
5164
5303
  name="NVIDIA_DRIVER_CAPABILITIES", value="compute,utility"
5165
5304
  )
5166
- ] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type),
5305
+ ] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
5167
5306
  resources=client.V1ResourceRequirements(
5168
5307
  limits=get_pod_resource_limits(
5169
5308
  gpu_count, gpu_type, is_multinode),
@@ -5309,7 +5448,12 @@ EOF
5309
5448
  ] if _pod_uses_efa(gpu_count, gpu_type, is_multinode) else []),
5310
5449
  node_selector={
5311
5450
  "GpuType": get_node_gpu_type(gpu_type),
5312
- **({} if target_az is None else {"topology.kubernetes.io/zone": target_az})
5451
+ **({} if target_az is None else {"topology.kubernetes.io/zone": target_az}),
5452
+ # Hard-pin to the binpacked node when Lambda picked one. Lambda runs
5453
+ # serialized (reserved_concurrent_executions=1), so allocations seen by the
5454
+ # next invocation include this pod. If the node is unavailable, the pod
5455
+ # stays Pending and surfaces the error rather than spreading.
5456
+ **({} if target_node is None else {"kubernetes.io/hostname": target_node}),
5313
5457
  },
5314
5458
  # Node affinity for profiling-dedicated preference
5315
5459
  # If user requests nsight=true, prefer profiling-dedicated nodes
@@ -6303,6 +6447,9 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
6303
6447
  "nvidia.com/mig-3g.40gb": "h100-mig-3g",
6304
6448
  "nvidia.com/mig-4g.40gb": "h100-mig-4g",
6305
6449
  "nvidia.com/mig-7g.80gb": "h100-mig-7g",
6450
+ "nvidia.com/mig-1g.23gb": "b200-mig-1g",
6451
+ "nvidia.com/mig-2g.45gb": "b200-mig-2g",
6452
+ "nvidia.com/mig-3g.90gb": "b200-mig-3g",
6306
6453
  }
6307
6454
  if pod.spec.containers:
6308
6455
  for c in pod.spec.containers:
@@ -31,9 +31,14 @@ def get_bearer_token() -> str:
31
31
  """
32
32
  Create a k8s-aws-v1 bearer token by presigning STS:GetCallerIdentity.
33
33
  IMPORTANT: base64url-encode the FULL presigned URL, then strip padding.
34
+
35
+ expires_in must match _EFFECTIVE_TOKEN_TTL: previously this was 60s while the cache
36
+ held the token for 14 min, so warm Lambda containers handed EKS expired URLs and got
37
+ 401s for ~13 min until the next refresh. 900s is the typical EKS get-token default
38
+ and the max for IAM-role-derived presigned URLs.
34
39
  """
35
40
  logger.info("Starting bearer token generation")
36
- STS_TOKEN_EXPIRES_IN = 60
41
+ STS_TOKEN_EXPIRES_IN = 900
37
42
  session = boto3.session.Session(region_name=REGION)
38
43
  logger.info(f"Created boto3 session for region {REGION}")
39
44
 
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.13"
184
- MIN_CLI_VERSION = "0.5.9"
183
+ LAMBDA_VERSION = "0.5.22"
184
+ MIN_CLI_VERSION = "0.5.16"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
187
187
  }, local.alb_env_vars)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes