gpu-dev 0.5.17__tar.gz → 0.5.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +5 -1
  4. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +248 -0
  5. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/pyproject.toml +1 -1
  6. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/index.py +147 -16
  7. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_client.py +6 -1
  8. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda.tf +1 -1
  9. gpu_dev-0.5.19/tests/submit/README.md +63 -0
  10. gpu_dev-0.5.19/tests/submit/fail/run.sh +20 -0
  11. gpu_dev-0.5.19/tests/submit/multinode/run.sh +65 -0
  12. gpu_dev-0.5.19/tests/submit/success/run.sh +23 -0
  13. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/.github/workflows/no-gitlinks.yml +0 -0
  14. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/.github/workflows/publish.yml +0 -0
  15. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/.gitignore +0 -0
  16. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/CLAUDE.md +0 -0
  17. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/PROGRESS.md +0 -0
  18. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/PR_DESCRIPTION.md +0 -0
  19. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/TODO.md +0 -0
  20. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/admin/README.md +0 -0
  21. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/admin/generate_stats.py +0 -0
  22. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/admin/requirements.txt +0 -0
  23. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/README.md +0 -0
  24. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  25. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  26. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  27. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  28. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  29. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  30. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  31. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  32. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  33. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  34. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  35. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  36. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  37. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  38. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  39. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/USER_GUIDE.md +0 -0
  40. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/devgpu-features.html +0 -0
  41. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/docker-mark-blue.svg +0 -0
  42. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/icons8-cursor-ai.svg +0 -0
  43. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/post.md +0 -0
  44. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/setup.cfg +0 -0
  45. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  46. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  47. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/README.md +0 -0
  48. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/alb.tf +0 -0
  49. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/availability.tf +0 -0
  50. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/backend.tf +0 -0
  51. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  52. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  53. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  54. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bash_profile +0 -0
  55. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc +0 -0
  56. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  57. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  58. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  59. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  60. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/motd_script +0 -0
  61. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  62. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/profile +0 -0
  63. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  64. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  65. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  66. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/shell_env +0 -0
  67. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/ssh_config +0 -0
  68. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zprofile +0 -0
  69. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc +0 -0
  70. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  71. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-build.tf +0 -0
  72. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  73. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  74. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ecr.tf +0 -0
  75. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/efs.tf +0 -0
  76. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/eks.tf +0 -0
  77. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/expiry.tf +0 -0
  78. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/git-cache.tf +0 -0
  79. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/kubernetes.tf +0 -0
  80. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  81. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  82. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  83. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  84. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  85. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  86. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  87. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  88. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  89. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  90. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  91. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  92. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  93. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/main.tf +0 -0
  94. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-config.tf +0 -0
  95. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  96. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  97. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  98. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  99. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  100. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  101. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/monitoring.tf +0 -0
  102. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/outputs.tf +0 -0
  103. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/pyproject.toml +0 -0
  104. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/queue.tf +0 -0
  105. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/route53.tf +0 -0
  106. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  107. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  108. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  109. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  110. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  111. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  112. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  113. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  114. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  115. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  116. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/switch-to.sh +0 -0
  117. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  118. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  119. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  120. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  121. {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.17
3
+ Version: 0.5.19
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.17
3
+ Version: 0.5.19
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -112,4 +112,8 @@ terraform-gpu-devservers/ssh-proxy/requirements.txt
112
112
  terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
113
113
  terraform-gpu-devservers/templates/al2023-user-data.sh
114
114
  terraform-gpu-devservers/templates/user-data-self-managed.sh
115
- terraform-gpu-devservers/templates/user-data.sh
115
+ terraform-gpu-devservers/templates/user-data.sh
116
+ tests/submit/README.md
117
+ tests/submit/fail/run.sh
118
+ tests/submit/multinode/run.sh
119
+ tests/submit/success/run.sh
@@ -1349,6 +1349,254 @@ def reserve(
1349
1349
  rprint(f"[red]❌ Error: {str(e)}[/red]")
1350
1350
 
1351
1351
 
1352
# GPU/CPU SKUs accepted by `gpu-dev submit --gpu-type` (matched case-insensitively).
_SUBMIT_GPU_TYPES = [
    "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
    "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
    "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86",
]


@main.command(context_settings={"ignore_unknown_options": True})
@click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
@click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling — job auto-cancels well before this if it finishes.")
@click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
@click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
@click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
              help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
@click.option("--dockerimage", type=str, default=None,
              help="Pre-built container image reference (e.g. ghcr.io/me/img:tag) to run instead of the default.")
@click.option("--preserve-entrypoint", is_flag=True,
              help="Keep the custom image's ENTRYPOINT/CMD instead of letting gpu-dev wrap with the SSH harness. Note: submit needs SSH to work.")
@click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
              help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
@click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
@click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
@click.option("--name", type=str, default=None, help="Reservation name.")
@click.option("--timeout", type=int, default=24 * 60, show_default=True,
              help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
@click.argument("command", nargs=-1, required=True)
@click.pass_context
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
           runtime, no_pull, keep_alive, name, timeout, command):
    """Submit a job: reserve, sync code, run, sync results back, auto-cancel.

    \b
    Examples:
      gpu-dev submit --runtime ./ -- python train.py
      gpu-dev submit --gpus 16 --gpu-type h100 --runtime . -- bash run.sh
      gpu-dev submit --keep-alive -- nvidia-smi

    The job runs on rank 0 (master pod). For multinode jobs, MULTINODE_HOSTS / RANK /
    SIZE / MASTER_ADDR / MASTER_PORT are exported on every pod so torchrun and friends
    work without manual wiring. Exit code mirrors the remote command's exit code.
    """
    # NOTE: the docstring above is rendered by click as the command's help text, so
    # maintainer notes live in comments instead.
    #
    # Flow: validate args -> authenticate -> (optionally) pack the Dockerfile build
    # context -> create the reservation(s) -> wait until active -> resolve the master
    # pod -> rsync --runtime up -> run the command over SSH -> rsync results back ->
    # cancel the reservation(s) unless --keep-alive.
    #
    # Exit codes produced here: 2 for argument/auth/reservation-creation/upload
    # failures and unexpected errors, 1 when the reservation never becomes active or
    # the master pod cannot be reached, 130 on Ctrl-C, otherwise the remote command's
    # own return code.
    import base64
    import os
    import shlex
    import subprocess
    import sys
    import tarfile
    import tempfile
    from pathlib import Path

    if not command:
        rprint("[red]❌ Provide a command after --, e.g. gpu-dev submit --runtime ./ -- python train.py[/red]")
        sys.exit(2)

    gt = gpu_type.lower()
    # Per-type max GPUs (mirrors gpu_configs in reserve flow)
    max_per_node = {
        "t4": 4, "l4": 4, "a10g": 4, "rtxpro6000": 4, "t4-small": 1,
        "a100": 8, "h100": 8, "h200": 8, "b200": 8,
        "h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8,
        "b200-mig-1g": 4, "b200-mig-2g": 2, "b200-mig-3g": 2,
        "cpu-arm": 0, "cpu-x86": 0,
    }.get(gt)
    if max_per_node is None:
        rprint(f"[red]❌ Unknown gpu-type '{gpu_type}'[/red]")
        sys.exit(2)

    # CPU SKUs have a per-node max of 0, so they are excluded explicitly: they are
    # never multinode and must never reach the modulo check below.
    is_multinode = gt not in ("cpu-arm", "cpu-x86") and gpus > max_per_node
    if is_multinode and gpus % max_per_node != 0:
        rprint(f"[red]❌ For multinode {gt}, --gpus must be a multiple of {max_per_node}[/red]")
        sys.exit(2)

    config = load_config()
    try:
        user_info = authenticate_user(config)
    except RuntimeError as e:
        rprint(f"[red]❌ {str(e)}[/red]")
        sys.exit(2)

    rm = ReservationManager(config)

    # Determine effective disk handling. Multinode: only master gets persistent disk; we always
    # SSH into rank 0, so passing --disk is fine.
    disk_name = None if no_persistent_disk else disk

    # Build dockerfile context if provided (mirrors the reserve-flow logic)
    dockerfile_payload = None
    if dockerfile:
        if os.path.getsize(dockerfile) > 512 * 1024:
            rprint("[red]❌ Dockerfile too large (max 512KB)[/red]")
            sys.exit(2)
        ctx_dir = os.path.dirname(os.path.abspath(dockerfile))
        rprint(f"[cyan]📦 Building tar.gz context from {ctx_dir}[/cyan]")
        # Reserve a path, then close the handle before tarfile reopens it by name;
        # the try/finally guarantees the temp file is removed on every exit path
        # (including sys.exit and unexpected I/O errors).
        with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            with tarfile.open(tmp_path, "w:gz") as tar:
                for root, _, files in os.walk(ctx_dir):
                    for f in files:
                        full = os.path.join(root, f)
                        tar.add(full, arcname=os.path.relpath(full, ctx_dir))
                # A non-standard file name is additionally stored as "Dockerfile" so
                # the archive always carries an entry under the conventional name.
                if os.path.basename(dockerfile).lower() != "dockerfile":
                    tar.add(dockerfile, arcname="Dockerfile")
            tar_size = os.path.getsize(tmp_path)
            if tar_size > 700 * 1024:
                rprint(f"[red]❌ Build context too large: {tar_size}B (max ~700KB compressed)[/red]")
                sys.exit(2)
            with open(tmp_path, "rb") as fh:
                dockerfile_payload = base64.b64encode(fh.read()).decode("utf-8")
        finally:
            os.unlink(tmp_path)
        rprint(f"[green]✅ Dockerfile context: {tar_size}B compressed[/green]")

    if dockerimage and not preserve_entrypoint:
        rprint("[dim]Note: passing --dockerimage without --preserve-entrypoint, so gpu-dev wraps the image with the SSH harness.[/dim]")
    if preserve_entrypoint and not (dockerfile or dockerimage):
        rprint("[red]❌ --preserve-entrypoint requires --dockerfile or --dockerimage[/red]")
        sys.exit(2)

    rprint(f"[cyan]🎫 Reserving {gpus}x {gpu_type.upper()} for up to {hours}h...[/cyan]")
    if is_multinode:
        reservation_ids = rm.create_multinode_reservation(
            user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
            duration_hours=hours, name=name, github_user=user_info["github_user"],
            no_persistent_disk=no_persistent_disk, disk_name=disk_name,
            dockerfile=dockerfile_payload, dockerimage=dockerimage,
            preserve_entrypoint=preserve_entrypoint)
        if not reservation_ids:
            rprint("[red]❌ Failed to create multinode reservation[/red]")
            sys.exit(2)
        primary_id = reservation_ids[0]
    else:
        primary_id = rm.create_reservation(
            user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
            duration_hours=hours, name=name, github_user=user_info["github_user"],
            no_persistent_disk=no_persistent_disk, disk_name=disk_name,
            dockerfile=dockerfile_payload, dockerimage=dockerimage,
            preserve_entrypoint=preserve_entrypoint)
        if not primary_id:
            rprint("[red]❌ Failed to create reservation[/red]")
            sys.exit(2)
        reservation_ids = [primary_id]

    short_id = primary_id[:8]
    cancelled = False

    def maybe_cancel(reason: str):
        """Cancel every reservation of this submit once; no-op with --keep-alive."""
        nonlocal cancelled
        if cancelled or keep_alive:
            return
        cancelled = True
        rprint(f"[yellow]🛑 Cancelling reservation {short_id} ({reason})[/yellow]")
        for rid in reservation_ids:
            # Best effort: one failed cancel must not stop the remaining ones.
            try:
                rm.cancel_reservation(rid, user_info["user_id"])
            except Exception as ce:
                rprint(f"[dim] cancel {rid[:8]} failed: {ce}[/dim]")

    try:
        # Built without reusing the enclosing quote character inside an f-string
        # replacement field: that form is a SyntaxError before Python 3.12, and the
        # package supports 3.10+.
        if timeout >= 60:
            wait_hours, wait_minutes = divmod(timeout, 60)
            wait_str = f"up to {wait_hours}h"
            if wait_minutes:
                wait_str += f" {wait_minutes}m"
        else:
            wait_str = f"up to {timeout}m"
        rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active ({wait_str}; can queue when cluster is full)...[/cyan]")
        if is_multinode:
            results = rm.wait_for_multinode_reservation_completion(reservation_ids, timeout_minutes=timeout)
        else:
            single = rm.wait_for_reservation_completion(primary_id, timeout_minutes=timeout)
            results = [single] if single else None
        if not results:
            rprint("[red]❌ Reservation never became active[/red]")
            maybe_cancel("activation timeout")
            sys.exit(1)

        # Resolve master pod (rank 0)
        conn = rm.get_connection_info(primary_id, user_info["user_id"])
        if not conn:
            rprint("[red]❌ Could not fetch connection info[/red]")
            maybe_cancel("no connection info")
            sys.exit(1)
        if conn.get("is_multinode"):
            nodes = sorted(conn["nodes"], key=lambda n: n.get("node_index", 0))
            master = nodes[0]
            master_id, master_pod, master_fqdn, master_name = (
                master["reservation_id"], master["pod_name"],
                master.get("fqdn"), master.get("name"))
        else:
            master_id, master_pod, master_fqdn, master_name = (
                primary_id, conn["pod_name"], conn.get("fqdn"), conn.get("name"))

        # Ensure SSH config exists
        gpu_dev_dir = Path.home() / ".gpu-dev"
        config_file = gpu_dev_dir / f"{master_id[:8]}-sshconfig"
        if not config_file.exists():
            if not (master_fqdn and master_pod):
                rprint("[red]❌ Master pod has no FQDN yet — can't SSH[/red]")
                maybe_cancel("no fqdn")
                sys.exit(1)
            create_ssh_config_for_reservation(master_fqdn, master_pod, master_id, master_name)

        ssh_alias = master_pod
        ssh_base = ["ssh", "-F", str(config_file), "-o", "StrictHostKeyChecking=accept-new"]
        # rsync takes its remote shell as one string, so quote each ssh argument.
        rsync_e = " ".join(shlex.quote(x) for x in ssh_base)

        # Working directory and rsync up
        if runtime:
            workdir = f"/workspace/submit-{master_id[:8]}"
            rprint(f"[cyan]📦 Syncing {runtime} → {ssh_alias}:{workdir}[/cyan]")
            r = subprocess.run(ssh_base + [ssh_alias, f"mkdir -p {shlex.quote(workdir)}"])
            if r.returncode != 0:
                rprint("[red]❌ Failed to create remote workspace[/red]")
                maybe_cancel("mkdir failed")
                sys.exit(2)
            r = subprocess.run([
                "rsync", "-az", "--delete", "-e", rsync_e,
                f"{runtime.rstrip('/')}/", f"{ssh_alias}:{workdir}/",
            ])
            if r.returncode != 0:
                rprint("[red]❌ Upload rsync failed[/red]")
                maybe_cancel("upload failed")
                sys.exit(2)
        else:
            workdir = "/home/dev"

        # Run remote command via login shell so MULTINODE_* etc. are loaded
        remote_cmd = " ".join(shlex.quote(c) for c in command)
        rprint(f"[cyan]🚀 Running on {ssh_alias}: {remote_cmd}[/cyan]\n")
        ssh_run = ssh_base + [ssh_alias,
                              f"cd {shlex.quote(workdir)} && bash -lc {shlex.quote(remote_cmd)}"]
        rc = subprocess.call(ssh_run)
        rprint(f"\n[dim]Job exited with code {rc}[/dim]")

        # Sync back results before cancelling
        if runtime and not no_pull:
            rprint(f"[cyan]📥 Syncing {ssh_alias}:{workdir}/ → {runtime}[/cyan]")
            pull = subprocess.run([
                "rsync", "-az", "-e", rsync_e,
                f"{ssh_alias}:{workdir}/", f"{runtime.rstrip('/')}/",
            ])
            if pull.returncode != 0:
                rprint(f"[yellow]⚠️ Result rsync exited with {pull.returncode} — your output may be incomplete[/yellow]")

        maybe_cancel("job complete")
        sys.exit(rc)

    except KeyboardInterrupt:
        rprint("\n[yellow]Interrupted — cancelling[/yellow]")
        maybe_cancel("user interrupt")
        sys.exit(130)
    # SystemExit derives from BaseException, so the sys.exit() calls above pass
    # through this handler untouched; only genuine errors are reported here.
    except Exception as e:
        rprint(f"[red]❌ Submit error: {e}[/red]")
        maybe_cancel("submit error")
        sys.exit(2)
1598
+
1599
+
1352
1600
  @main.command()
1353
1601
  @click.option(
1354
1602
  "--user",
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.17"
7
+ version = "0.5.19"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -1423,6 +1423,11 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
1423
1423
  logger.info(
1424
1424
  f"Starting parallel processing for {total_nodes} nodes")
1425
1425
 
1426
+ # Deterministic peer pod names by node_index so MULTINODE_RANK aligns with the
1427
+ # position of this pod in MULTINODE_HOSTS across all replicas.
1428
+ nodes_sorted = sorted(nodes, key=lambda n: int(n.get("node_index", 0)))
1429
+ peer_pod_names = [f"gpu-dev-{n['reservation_id'][:8]}" for n in nodes_sorted]
1430
+
1426
1431
  def process_single_node(node_data):
1427
1432
  """Process a single node - to be run in parallel"""
1428
1433
  i, node = node_data
@@ -1435,7 +1440,8 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
1435
1440
  'action': 'process_multinode_individual',
1436
1441
  'node_index': int(node_index),
1437
1442
  'total_nodes': int(total_nodes),
1438
- 'master_reservation_id': str(master_reservation_id)
1443
+ 'master_reservation_id': str(master_reservation_id),
1444
+ 'multinode_peer_pods': peer_pod_names,
1439
1445
  }
1440
1446
 
1441
1447
  logger.info(
@@ -1541,6 +1547,12 @@ def process_multinode_individual_node(message_body: dict) -> bool:
1541
1547
 
1542
1548
  node_data = response["Item"]
1543
1549
 
1550
+ # Forward peer pod list from coordinator into request dict so create_pod can
1551
+ # bake MULTINODE_HOSTS / MASTER_ADDR / MULTINODE_RANK env vars into the pod.
1552
+ peer_pods = message_body.get("multinode_peer_pods")
1553
+ if peer_pods:
1554
+ node_data["multinode_peer_pods"] = peer_pods
1555
+
1544
1556
  # Update status to preparing pod
1545
1557
  update_multinode_pod_status(
1546
1558
  reservation_id, "preparing pod", node_index, total_nodes)
@@ -2888,6 +2900,8 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2888
2900
  dockerimage=dockerimage,
2889
2901
  target_az=target_az,
2890
2902
  target_node=target_node,
2903
+ multinode_peer_pods=request.get("multinode_peer_pods"),
2904
+ multinode_rank=int(request.get("node_index", 0)) if is_multinode else 0,
2891
2905
  preserve_entrypoint=preserve_entrypoint,
2892
2906
  node_labels=node_labels,
2893
2907
  trace_data=trace_data,
@@ -3429,6 +3443,8 @@ def create_kubernetes_resources(
3429
3443
  efs_filesystem_id: str = None,
3430
3444
  is_multinode: bool = False,
3431
3445
  target_node: str = None,
3446
+ multinode_peer_pods: list = None,
3447
+ multinode_rank: int = 0,
3432
3448
  dockerfile_base64_data: str = None,
3433
3449
  dockerimage: str = None,
3434
3450
  target_az: str = None,
@@ -3533,6 +3549,8 @@ def create_kubernetes_resources(
3533
3549
  dockerimage=dockerimage,
3534
3550
  target_az=target_az,
3535
3551
  target_node=target_node,
3552
+ multinode_peer_pods=multinode_peer_pods,
3553
+ multinode_rank=multinode_rank,
3536
3554
  preserve_entrypoint=preserve_entrypoint,
3537
3555
  node_labels=node_labels,
3538
3556
  trace_data=trace_data,
@@ -3620,6 +3638,8 @@ def create_kubernetes_resources(
3620
3638
  dockerimage=dockerimage,
3621
3639
  target_az=target_az,
3622
3640
  target_node=target_node,
3641
+ multinode_peer_pods=multinode_peer_pods,
3642
+ multinode_rank=multinode_rank,
3623
3643
  preserve_entrypoint=preserve_entrypoint,
3624
3644
  node_labels=node_labels,
3625
3645
  trace_data=trace_data,
@@ -3722,6 +3742,30 @@ def find_available_node_port(k8s_client) -> int:
3722
3742
  return random.randint(30000, 32767)
3723
3743
 
3724
3744
 
3745
+ def _mig_slice_fraction(gpu_type: str) -> float:
3746
+ """For MIG SKUs return slice fraction of a single GPU (1g=1/7, 2g=2/7, ..., 7g=1).
3747
+
3748
+ Slice naming counts GPCs (compute slices). H100 and B200 both have 7 GPCs per GPU
3749
+ in the typical all-balanced profile, so a 1g slice is 1/7 of a GPU regardless of
3750
+ family. Used to size CPU/memory requests proportional to the GPU fraction the pod
3751
+ actually consumes — the older `gpu_count/max_gpus` ratio over-claimed node resources
3752
+ (a 1g slice would claim 1/4 or 1/16 of the host instead of 1/56).
3753
+ """
3754
+ if "mig" not in gpu_type:
3755
+ return 1.0
3756
+ try:
3757
+ slices = int(gpu_type.split("-mig-")[1].rstrip("g"))
3758
+ except (IndexError, ValueError):
3759
+ return 1.0
3760
+ return slices / 7.0
3761
+
3762
+
3763
+ # Number of full GPUs on the underlying instance — used to convert the slice fraction
3764
+ # into a fraction of the host's CPU/memory. Both p5.48xlarge (H100) and p6-b200.48xlarge
3765
+ # (B200) have 8 GPUs, which matches every MIG-capable instance type we currently run.
3766
+ _FULL_GPUS_PER_MIG_NODE = 8
3767
+
3768
+
3725
3769
  def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool = False) -> dict:
3726
3770
  """Get resource limits for pod based on GPU type and deployment mode"""
3727
3771
  gpu_count = int(gpu_count)
@@ -3741,13 +3785,19 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
3741
3785
  resource_name = config.get("k8s_resource", "nvidia.com/gpu")
3742
3786
  limits[resource_name] = str(gpu_count)
3743
3787
 
3744
- gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3745
-
3746
- # Calculate proportional limits with CPU overprovisioning for burst capacity
3747
- # Give 1.5x CPU limit to allow burst, capped at node total
3748
- fractional_cpu = config["cpus"] * gpu_ratio
3749
- proportional_cpu_limit = min(config["cpus"], int(fractional_cpu * 1.5))
3750
- proportional_memory_limit = int(config["memory_gb"] * gpu_ratio)
3788
+ if "mig" in gpu_type:
3789
+ # Scale by GPC fraction (slice of one GPU), not slice count over max slices.
3790
+ slice_fraction = _mig_slice_fraction(gpu_type)
3791
+ cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
3792
+ mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
3793
+ fractional_cpu = cpu_per_full_gpu * slice_fraction * gpu_count
3794
+ proportional_cpu_limit = max(1, min(config["cpus"], int(fractional_cpu * 1.5)))
3795
+ proportional_memory_limit = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count))
3796
+ else:
3797
+ gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3798
+ fractional_cpu = config["cpus"] * gpu_ratio
3799
+ proportional_cpu_limit = min(config["cpus"], int(fractional_cpu * 1.5))
3800
+ proportional_memory_limit = int(config["memory_gb"] * gpu_ratio)
3751
3801
 
3752
3802
  limits.update({
3753
3803
  "cpu": str(proportional_cpu_limit),
@@ -3787,13 +3837,16 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
3787
3837
  if gpu_count > 0:
3788
3838
  resource_name = config.get("k8s_resource", "nvidia.com/gpu")
3789
3839
  requests[resource_name] = str(gpu_count)
3790
- gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3791
-
3792
- # Calculate proportional requests (reserve 10% for system overhead)
3793
- # This ensures requests don't exceed node allocatable resources
3794
- # Limits can be higher for burst capacity (Burstable QoS)
3795
- proportional_cpu_request = int(config["cpus"] * gpu_ratio * 0.9)
3796
- proportional_memory_request = int(config["memory_gb"] * gpu_ratio * 0.9)
3840
+ if "mig" in gpu_type:
3841
+ slice_fraction = _mig_slice_fraction(gpu_type)
3842
+ cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
3843
+ mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
3844
+ proportional_cpu_request = max(1, int(cpu_per_full_gpu * slice_fraction * gpu_count * 0.9))
3845
+ proportional_memory_request = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count * 0.9))
3846
+ else:
3847
+ gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3848
+ proportional_cpu_request = int(config["cpus"] * gpu_ratio * 0.9)
3849
+ proportional_memory_request = int(config["memory_gb"] * gpu_ratio * 0.9)
3797
3850
 
3798
3851
  requests.update({
3799
3852
  "cpu": str(proportional_cpu_request),
@@ -3896,6 +3949,30 @@ def get_nccl_env_vars(gpu_type: str) -> list:
3896
3949
  return env_vars
3897
3950
 
3898
3951
 
3952
+ def _get_multinode_env_vars(peer_pods: list, rank: int) -> list:
3953
+ """Build env vars exposing peer hostnames/rank/master to the pod.
3954
+
3955
+ Hostnames use the per-pod headless service we already create elsewhere, so they
3956
+ resolve to the current pod IP via cluster DNS even if a pod is recreated. We
3957
+ don\'t inject IPs at pod-creation time (they aren\'t known until kube schedules
3958
+ everyone) — the bashrc/zshrc helper resolves and exports MULTINODE_IPS at shell
3959
+ start, and a /usr/local/bin/multinode-ips helper is available for non-interactive
3960
+ callers.
3961
+ """
3962
+ if not peer_pods or len(peer_pods) <= 1:
3963
+ return []
3964
+ namespace = "gpu-dev"
3965
+ hosts = [f"{p}-headless.{namespace}.svc.cluster.local" for p in peer_pods]
3966
+ return [
3967
+ client.V1EnvVar(name="MULTINODE_HOSTS", value=",".join(hosts)),
3968
+ client.V1EnvVar(name="MULTINODE_PEER_PODS", value=",".join(peer_pods)),
3969
+ client.V1EnvVar(name="MULTINODE_RANK", value=str(rank)),
3970
+ client.V1EnvVar(name="MULTINODE_SIZE", value=str(len(peer_pods))),
3971
+ client.V1EnvVar(name="MASTER_ADDR", value=hosts[0]),
3972
+ client.V1EnvVar(name="MASTER_PORT", value="29500"),
3973
+ ]
3974
+
3975
+
3899
3976
  def create_pod(
3900
3977
  k8s_client,
3901
3978
  pod_name: str,
@@ -3913,6 +3990,8 @@ def create_pod(
3913
3990
  dockerimage: str = None,
3914
3991
  target_az: str = None,
3915
3992
  target_node: str = None,
3993
+ multinode_peer_pods: list = None,
3994
+ multinode_rank: int = 0,
3916
3995
  preserve_entrypoint: bool = False,
3917
3996
  node_labels: dict = None,
3918
3997
  trace_data: dict = None,
@@ -4397,6 +4476,16 @@ EOF_PROFILE
4397
4476
  # User identification
4398
4477
  export GPU_DEV_USER_ID="{user_id or 'dev'}"
4399
4478
 
4479
+ # Multinode peer info — inlined from container env at pod startup. sshd strips
4480
+ # container env vars from login shells, so we materialize the values into rc files.
4481
+ # Skipped (empty exports) for single-node reservations where MULTINODE_* aren't set.
4482
+ export MULTINODE_HOSTS="$MULTINODE_HOSTS"
4483
+ export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
4484
+ export MULTINODE_RANK="$MULTINODE_RANK"
4485
+ export MULTINODE_SIZE="$MULTINODE_SIZE"
4486
+ export MASTER_ADDR="$MASTER_ADDR"
4487
+ export MASTER_PORT="$MASTER_PORT"
4488
+
4400
4489
  # Function to check for GPU reservation expiry warnings and startup script status
4401
4490
  check_warnings() {{
4402
4491
  # Check for startup script still running
@@ -4415,6 +4504,22 @@ check_warnings() {{
4415
4504
 
4416
4505
  # Run warning check before every command prompt
4417
4506
  PROMPT_COMMAND="check_warnings; \$PROMPT_COMMAND"
4507
+
4508
+ # Multinode peer IP resolution: MULTINODE_HOSTS is baked at pod creation, but per-pod
4509
+ # IPs are only known once kube schedules them. Resolve at shell start so users can do
4510
+ # torchrun --master_addr=\$MASTER_ADDR or mpirun -H "\$MULTINODE_IPS" without extra steps.
4511
+ if [ -n "\$MULTINODE_HOSTS" ]; then
4512
+ _MULTINODE_IPS=""
4513
+ for _h in \$(echo "\$MULTINODE_HOSTS" | tr ',' ' '); do
4514
+ _ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
4515
+ if [ -n "\$_ip" ]; then
4516
+ _MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
4517
+ fi
4518
+ done
4519
+ export MULTINODE_IPS="\$_MULTINODE_IPS"
4520
+ [ -n "\$MULTINODE_IPS" ] && export MASTER_IP=\$(echo "\$MULTINODE_IPS" | cut -d, -f1)
4521
+ unset _MULTINODE_IPS _h _ip
4522
+ fi
4418
4523
  EOF_BASHRC_EXT
4419
4524
 
4420
4525
  cat > /home/dev/.zshrc_ext << EOF_ZSHRC_EXT
@@ -4425,6 +4530,15 @@ EOF_BASHRC_EXT
4425
4530
  # User identification
4426
4531
  export GPU_DEV_USER_ID="{user_id or 'dev'}"
4427
4532
 
4533
+ # Multinode peer info — inlined from container env at pod startup. sshd strips
4534
+ # container env vars from login shells, so we materialize the values into rc files.
4535
+ export MULTINODE_HOSTS="$MULTINODE_HOSTS"
4536
+ export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
4537
+ export MULTINODE_RANK="$MULTINODE_RANK"
4538
+ export MULTINODE_SIZE="$MULTINODE_SIZE"
4539
+ export MASTER_ADDR="$MASTER_ADDR"
4540
+ export MASTER_PORT="$MASTER_PORT"
4541
+
4428
4542
  # Function to check for GPU reservation expiry warnings and startup script status
4429
4543
  check_warnings() {{
4430
4544
  # Check for startup script still running
@@ -4444,6 +4558,20 @@ check_warnings() {{
4444
4558
 
4445
4559
  # Run warning check before every command prompt (zsh hook)
4446
4560
  precmd() {{ check_warnings }}
4561
+
4562
+ # Multinode peer IP resolution (see .bashrc_ext for rationale)
4563
+ if [[ -n "\$MULTINODE_HOSTS" ]]; then
4564
+ _MULTINODE_IPS=""
4565
+ for _h in \${{(s:,:)MULTINODE_HOSTS}}; do
4566
+ _ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
4567
+ if [[ -n "\$_ip" ]]; then
4568
+ _MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
4569
+ fi
4570
+ done
4571
+ export MULTINODE_IPS="\$_MULTINODE_IPS"
4572
+ [[ -n "\$MULTINODE_IPS" ]] && export MASTER_IP="\${{MULTINODE_IPS%%,*}}"
4573
+ unset _MULTINODE_IPS _h _ip
4574
+ fi
4447
4575
  EOF_ZSHRC_EXT
4448
4576
 
4449
4577
  chown 1081:1081 /home/dev/.bashrc_ext /home/dev/.zshrc_ext
@@ -5174,7 +5302,7 @@ EOF
5174
5302
  client.V1EnvVar(
5175
5303
  name="NVIDIA_DRIVER_CAPABILITIES", value="compute,utility"
5176
5304
  )
5177
- ] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type),
5305
+ ] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
5178
5306
  resources=client.V1ResourceRequirements(
5179
5307
  limits=get_pod_resource_limits(
5180
5308
  gpu_count, gpu_type, is_multinode),
@@ -6319,6 +6447,9 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
6319
6447
  "nvidia.com/mig-3g.40gb": "h100-mig-3g",
6320
6448
  "nvidia.com/mig-4g.40gb": "h100-mig-4g",
6321
6449
  "nvidia.com/mig-7g.80gb": "h100-mig-7g",
6450
+ "nvidia.com/mig-1g.23gb": "b200-mig-1g",
6451
+ "nvidia.com/mig-2g.45gb": "b200-mig-2g",
6452
+ "nvidia.com/mig-3g.90gb": "b200-mig-3g",
6322
6453
  }
6323
6454
  if pod.spec.containers:
6324
6455
  for c in pod.spec.containers:
@@ -31,9 +31,14 @@ def get_bearer_token() -> str:
31
31
  """
32
32
  Create a k8s-aws-v1 bearer token by presigning STS:GetCallerIdentity.
33
33
  IMPORTANT: base64url-encode the FULL presigned URL, then strip padding.
34
+
35
+ expires_in must match _EFFECTIVE_TOKEN_TTL: previously this was 60s while the cache
36
+ held the token for 14 min, so warm Lambda containers handed EKS expired URLs and got
37
+ 401s for ~13 min until the next refresh. 900s is the typical EKS get-token default
38
+ and the max for IAM-role-derived presigned URLs.
34
39
  """
35
40
  logger.info("Starting bearer token generation")
36
- STS_TOKEN_EXPIRES_IN = 60
41
+ STS_TOKEN_EXPIRES_IN = 900
37
42
  session = boto3.session.Session(region_name=REGION)
38
43
  logger.info(f"Created boto3 session for region {REGION}")
39
44
 
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.17"
183
+ LAMBDA_VERSION = "0.5.22"
184
184
  MIN_CLI_VERSION = "0.5.16"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
@@ -0,0 +1,63 @@
1
+ # `gpu-dev submit` smoke tests
2
+
3
+ Three tests that exercise `gpu-dev submit` end-to-end. Each test lives in its
4
+ own folder so you can pass it to `--runtime` directly. Output files written by the
5
+ script are pulled back into the same folder via the post-run rsync.
6
+
7
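+ > Requires `gpu-dev >= 0.5.19`. The single-node tests need no Lambda update; the multinode test also needs a reservation-processor Lambda that injects the `MULTINODE_*` env vars.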
8
+
9
+ ## 1. success — single T4 GPU, exit 0
10
+
11
+ ```bash
12
+ cd tests/submit/success
13
+ gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
14
+ echo $? # 0
15
+ ls # nvidia-info.txt, compute.txt, status.txt all created
16
+ ```
17
+
18
+ ## 2. fail — single T4 GPU, exit 7
19
+
20
+ Writes a partial file before exploding so you can confirm rsync still pulls
21
+ output on failure and the local exit code is the remote's.
22
+
23
+ ```bash
24
+ cd tests/submit/fail
25
+ gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
26
+ echo $? # 7
27
+ ls # step1.txt, step2.txt, gpus-before-fail.txt — but no step3.txt
28
+ ```
29
+
30
+ ## 3. multinode — 2x H100 nodes, exit 0
31
+
32
+ Reserves 16 H100s (= 2 nodes), verifies env vars + peer ssh + NCCL all_reduce
33
+ across the whole cluster via mpirun (orchestrated entirely from rank 0).
34
+
35
+ ```bash
36
+ cd tests/submit/multinode
37
+ gpu-dev submit --gpu-type h100 --gpus 16 --runtime ./ -- bash run.sh
38
+ echo $? # 0
39
+ cat multinode-env.txt resolved-ips.txt peer-ssh.txt nccl-all_reduce.log
40
+ ```
41
+
42
+ ## What each test proves
43
+
44
+ | Test | Proves |
45
+ |------------|-------------------------------------------------------------------------------|
46
+ | success | reserve → rsync up → exec → rsync back → cancel → exit 0 |
47
+ | fail | exit code propagation; rsync-back still runs on non-zero exit; cancel fires |
48
+ | multinode | MULTINODE_* env vars; peer DNS / passwordless ssh; cross-node NCCL via mpirun |
49
+
50
+ After every run, `gpu-dev list` should show no leftover reservation — each one is auto-cancelled.
51
+ Use `--keep-alive` on any of them if you want to debug interactively afterward.
52
+
53
+ ## Other submit flags (forwarded to `reserve`)
54
+
55
+ - `--hours N` — reservation lifetime ceiling (default 1.0)
56
+ - `--disk NAME` — attach a persistent disk to the master node
57
+ - `--no-persistent-disk` — skip persistent disk
58
+ - `--dockerfile PATH` — build a custom image from this Dockerfile
59
+ - `--dockerimage REF` — use a pre-built container image
60
+ - `--preserve-entrypoint` — keep the custom image's ENTRYPOINT (you must run sshd yourself for submit to work)
61
+ - `--timeout MINUTES` — wait-for-active timeout (default 1440 = 24h, since reservations may queue)
62
+ - `--no-pull` — skip the post-run sync-back
63
+ - `--keep-alive` — skip auto-cancel
@@ -0,0 +1,20 @@
1
#!/usr/bin/env bash
# Failure-path fixture for `gpu-dev submit`.
#
# Leaves a few partial output files behind and then terminates with exit
# status 7, so the caller can check that:
#   * the post-run rsync still brings the partial files back,
#   * the auto-cancel still fires on a non-zero exit,
#   * the local exit code equals the remote one.
set -e

banner() {
  echo "=== $1 ==="
}

banner "host"
hostname
date -u

# Partial artifacts that must survive the failure and be synced back.
printf 'step1 done at %s\n' "$(date -u)" > step1.txt
nvidia-smi -L > gpus-before-fail.txt

# Last file written before the deliberate failure.
printf '%s\n' "About to fail..." > step2.txt

# errexit turns this non-zero status into the script's own exit status (7).
python3 -c "import sys; sys.exit(7)"

# Unreachable: step3.txt showing up locally means the failure was swallowed.
printf '%s\n' "should-not-appear" > step3.txt
@@ -0,0 +1,65 @@
1
#!/usr/bin/env bash
# Multinode test for `gpu-dev submit`: runs on rank 0 only and orchestrates the
# whole cluster via mpirun (uses passwordless ssh + the headless service DNS we
# already set up). Verifies env vars, peer connectivity, and an actual NCCL
# all_reduce across all nodes.
#
# Exit codes:
#   2  MULTINODE_HOSTS is unset/empty (not a multinode reservation)
#   3  none of the peer hostnames resolved to an IP
#   *  anything else is the status of the failing command (errexit)
set -euo pipefail
cd "$(dirname "$0")"

echo "=== rank 0 host: $(hostname) at $(date -u) ==="

echo "=== multinode env ==="
# Every expansion uses ${VAR:-}: with `set -u` a bare $VAR would abort right
# here with "unbound variable" when the MULTINODE_* vars are missing, and the
# explanatory error below would never be printed.
{
  echo "MULTINODE_HOSTS=${MULTINODE_HOSTS:-}"
  echo "MULTINODE_PEER_PODS=${MULTINODE_PEER_PODS:-}"
  echo "MULTINODE_RANK=${MULTINODE_RANK:-}"
  echo "MULTINODE_SIZE=${MULTINODE_SIZE:-}"
  echo "MASTER_ADDR=${MASTER_ADDR:-}"
  echo "MASTER_PORT=${MASTER_PORT:-}"
  echo "MULTINODE_IPS=${MULTINODE_IPS:-(not set)}"
} | tee multinode-env.txt

if [[ -z "${MULTINODE_HOSTS:-}" ]]; then
  echo "ERROR: MULTINODE_HOSTS empty — submit with --gpus >= 16 on h100" >&2
  exit 2
fi

# Resolve IPs even if the bashrc helper didn't run (defensive)
IPS=""
for h in $(echo "$MULTINODE_HOSTS" | tr ',' ' '); do
  # getent exits non-zero for an unknown host; with pipefail + errexit that
  # would kill the script, so tolerate it and report the host instead.
  ip=$(getent hosts "$h" | awk '{print $1}' | head -1) || true
  if [[ -n "$ip" ]]; then
    IPS="${IPS:+$IPS,}$ip"
  else
    echo "WARNING: could not resolve $h" >&2
  fi
done
echo "Resolved IPS=$IPS" | tee resolved-ips.txt

# Without any IP the mpirun --host argument would be empty; fail with a clear
# message rather than a confusing mpirun error.
if [[ -z "$IPS" ]]; then
  echo "ERROR: none of the MULTINODE_HOSTS entries resolved to an IP" >&2
  exit 3
fi

echo "=== peer ssh check (port 2222 inside cluster) ==="
# Second entry of MULTINODE_HOSTS = first peer other than the master.
peer_host=$(echo "$MULTINODE_HOSTS" | cut -d, -f2)
ssh -o StrictHostKeyChecking=no -p 2222 "$peer_host" 'hostname; nvidia-smi -L | wc -l' \
  | tee peer-ssh.txt

GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
echo "GPUS_PER_NODE=$GPUS_PER_NODE" | tee gpus-per-node.txt

# Build --host arg: ip1:N,ip2:N,...
HOST_ARG=$(echo "$IPS" | awk -v g="$GPUS_PER_NODE" -F, '{out=""; for(i=1;i<=NF;i++){out=out ($i ":" g) (i<NF?",":"")}; print out}')
echo "HOST_ARG=$HOST_ARG"

echo "=== NCCL all_reduce_perf via mpirun ==="
# Note: -g 1 = 1 GPU per process, -n 20 iterations. Sweep 1M..1G in factor-of-2 steps.
mpirun --host "$HOST_ARG" \
  --mca plm_rsh_args "-p 2222 -o StrictHostKeyChecking=no" \
  -x PATH -x LD_LIBRARY_PATH \
  -x FI_PROVIDER -x FI_EFA_USE_DEVICE_RDMA \
  -x NCCL_NET_GDR_LEVEL -x NCCL_ALGO \
  -x NCCL_SOCKET_IFNAME -x NCCL_DEBUG -x NCCL_IB_HCA \
  /opt/nccl-tests/build/all_reduce_perf -b 1M -e 1G -f 2 -g 1 -n 20 \
  2>&1 | tee nccl-all_reduce.log

echo "=== summary ==="
{
  # Unset-safe so a missing rank/size cannot fail the run after the real work is done.
  echo "rank=${MULTINODE_RANK:-} size=${MULTINODE_SIZE:-}"
  echo "host_arg=$HOST_ARG"
  echo "completed at $(date -u)"
} | tee summary.txt

echo "DONE"
@@ -0,0 +1,23 @@
1
#!/usr/bin/env bash
# Happy-path fixture for `gpu-dev submit`: one GPU, expected exit status 0.
#
# Produces nvidia-info.txt, compute.txt and status.txt in the working
# directory so the caller can confirm they are synced back after the run.
set -euo pipefail

banner() {
  echo "=== $1 ==="
}

banner "host"
hostname
date -u

banner "nvidia-smi"
nvidia-smi | tee nvidia-info.txt

banner "compute"
# Tiny CUDA workload: sum 0..999_999 on the GPU and report the device count.
python3 - <<'PY' | tee compute.txt
import torch

assert torch.cuda.is_available(), "CUDA not available"
device_count = torch.cuda.device_count()
values = torch.arange(1_000_000, device="cuda", dtype=torch.float32)
total = values.sum().item()
print(f"devices={device_count} sum(0..999_999)={total}")
PY

# Marker file written only once everything above has succeeded.
printf 'ok at %s\n' "$(date -u)" > status.txt
banner_done="DONE"
echo "$banner_done"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes