gpu-dev 0.5.18__tar.gz → 0.5.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +5 -1
  4. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +64 -6
  5. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/pyproject.toml +1 -1
  6. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/Dockerfile +2 -1
  7. gpu_dev-0.5.20/tests/submit/README.md +63 -0
  8. gpu_dev-0.5.20/tests/submit/fail/run.sh +20 -0
  9. gpu_dev-0.5.20/tests/submit/multinode/run.sh +65 -0
  10. gpu_dev-0.5.20/tests/submit/success/run.sh +23 -0
  11. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/.github/workflows/no-gitlinks.yml +0 -0
  12. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/.github/workflows/publish.yml +0 -0
  13. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/.gitignore +0 -0
  14. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/CLAUDE.md +0 -0
  15. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/PROGRESS.md +0 -0
  16. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/PR_DESCRIPTION.md +0 -0
  17. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/TODO.md +0 -0
  18. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/admin/README.md +0 -0
  19. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/admin/generate_stats.py +0 -0
  20. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/admin/requirements.txt +0 -0
  21. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/README.md +0 -0
  22. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  23. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  24. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  25. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  26. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  27. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  28. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  29. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  30. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  31. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  32. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  33. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  34. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  35. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  36. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  37. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/docs/USER_GUIDE.md +0 -0
  38. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/docs/devgpu-features.html +0 -0
  39. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/docs/docker-mark-blue.svg +0 -0
  40. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/docs/icons8-cursor-ai.svg +0 -0
  41. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/post.md +0 -0
  42. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/setup.cfg +0 -0
  43. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  44. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  45. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/README.md +0 -0
  46. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/alb.tf +0 -0
  47. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/availability.tf +0 -0
  48. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/backend.tf +0 -0
  49. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  50. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  51. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/bash_profile +0 -0
  52. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/bashrc +0 -0
  53. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  54. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  55. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  56. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  57. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/motd_script +0 -0
  58. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  59. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/profile +0 -0
  60. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  61. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  62. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  63. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/shell_env +0 -0
  64. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/ssh_config +0 -0
  65. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/zprofile +0 -0
  66. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/zshrc +0 -0
  67. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  68. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker-build.tf +0 -0
  69. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  70. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  71. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ecr.tf +0 -0
  72. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/efs.tf +0 -0
  73. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/eks.tf +0 -0
  74. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/expiry.tf +0 -0
  75. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/git-cache.tf +0 -0
  76. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/kubernetes.tf +0 -0
  77. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  78. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  79. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  80. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  81. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  82. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  83. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  84. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  85. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  86. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  87. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  88. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  89. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  90. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  91. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  92. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda.tf +0 -0
  93. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/main.tf +0 -0
  94. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/mig-config.tf +0 -0
  95. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  96. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  97. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  98. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  99. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  100. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  101. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/monitoring.tf +0 -0
  102. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/outputs.tf +0 -0
  103. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/pyproject.toml +0 -0
  104. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/queue.tf +0 -0
  105. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/route53.tf +0 -0
  106. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  107. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  108. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  109. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  110. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  111. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  112. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  113. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  114. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  115. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  116. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/switch-to.sh +0 -0
  117. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  118. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  119. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  120. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  121. {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.18
3
+ Version: 0.5.20
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.18
3
+ Version: 0.5.20
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -112,4 +112,8 @@ terraform-gpu-devservers/ssh-proxy/requirements.txt
112
112
  terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
113
113
  terraform-gpu-devservers/templates/al2023-user-data.sh
114
114
  terraform-gpu-devservers/templates/user-data-self-managed.sh
115
- terraform-gpu-devservers/templates/user-data.sh
115
+ terraform-gpu-devservers/templates/user-data.sh
116
+ tests/submit/README.md
117
+ tests/submit/fail/run.sh
118
+ tests/submit/multinode/run.sh
119
+ tests/submit/success/run.sh
@@ -1357,18 +1357,26 @@ _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200"
1357
1357
  @main.command(context_settings={"ignore_unknown_options": True})
1358
1358
  @click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
1359
1359
  @click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
1360
- @click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation duration ceiling (job auto-cancels on exit).")
1360
+ @click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling job auto-cancels well before this if it finishes.")
1361
1361
  @click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
1362
1362
  @click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
1363
+ @click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
1364
+ help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
1365
+ @click.option("--dockerimage", type=str, default=None,
1366
+ help="Pre-built container image reference (e.g. ghcr.io/me/img:tag) to run instead of the default.")
1367
+ @click.option("--preserve-entrypoint", is_flag=True,
1368
+ help="Keep the custom image's ENTRYPOINT/CMD instead of letting gpu-dev wrap with the SSH harness. Note: submit needs SSH to work.")
1363
1369
  @click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
1364
1370
  help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
1365
1371
  @click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
1366
1372
  @click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
1367
1373
  @click.option("--name", type=str, default=None, help="Reservation name.")
1368
- @click.option("--timeout", type=int, default=20, show_default=True, help="Minutes to wait for the reservation to become active.")
1374
+ @click.option("--timeout", type=int, default=24 * 60, show_default=True,
1375
+ help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
1369
1376
  @click.argument("command", nargs=-1, required=True)
1370
1377
  @click.pass_context
1371
- def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pull, keep_alive, name, timeout, command):
1378
+ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
1379
+ runtime, no_pull, keep_alive, name, timeout, command):
1372
1380
  """Submit a job: reserve, sync code, run, sync results back, auto-cancel.
1373
1381
 
1374
1382
  \b
@@ -1390,6 +1398,15 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
1390
1398
  rprint("[red]❌ Provide a command after --, e.g. gpu-dev submit --runtime ./ -- python train.py[/red]")
1391
1399
  sys.exit(2)
1392
1400
 
1401
+ # rsync is on macOS by default and on virtually every Linux distro; bail early with a
1402
+ # readable message if the user has somehow uninstalled it locally rather than failing
1403
+ # mid-flight after the reservation has already been created.
1404
+ if runtime:
1405
+ import shutil
1406
+ if not shutil.which("rsync"):
1407
+ rprint("[red]❌ rsync not found on PATH locally. Install it (Mac: 'brew install rsync', Debian/Ubuntu: 'sudo apt install rsync') and retry.[/red]")
1408
+ sys.exit(2)
1409
+
1393
1410
  gt = gpu_type.lower()
1394
1411
  # Per-type max GPUs (mirrors gpu_configs in reserve flow)
1395
1412
  max_per_node = {
@@ -1421,12 +1438,47 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
1421
1438
  # SSH into rank 0, so passing --disk is fine.
1422
1439
  disk_name = None if no_persistent_disk else disk
1423
1440
 
1441
+ # Build dockerfile context if provided (mirrors the reserve-flow logic)
1442
+ dockerfile_payload = None
1443
+ if dockerfile:
1444
+ import os, tarfile, tempfile, base64
1445
+ if os.path.getsize(dockerfile) > 512 * 1024:
1446
+ rprint("[red]❌ Dockerfile too large (max 512KB)[/red]")
1447
+ sys.exit(2)
1448
+ ctx_dir = os.path.dirname(os.path.abspath(dockerfile))
1449
+ rprint(f"[cyan]📦 Building tar.gz context from {ctx_dir}[/cyan]")
1450
+ with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
1451
+ with tarfile.open(tmp.name, "w:gz") as tar:
1452
+ for root, _, files in os.walk(ctx_dir):
1453
+ for f in files:
1454
+ full = os.path.join(root, f)
1455
+ tar.add(full, arcname=os.path.relpath(full, ctx_dir))
1456
+ if os.path.basename(dockerfile).lower() != "dockerfile":
1457
+ tar.add(dockerfile, arcname="Dockerfile")
1458
+ tar_size = os.path.getsize(tmp.name)
1459
+ if tar_size > 700 * 1024:
1460
+ os.unlink(tmp.name)
1461
+ rprint(f"[red]❌ Build context too large: {tar_size}B (max ~700KB compressed)[/red]")
1462
+ sys.exit(2)
1463
+ with open(tmp.name, "rb") as fh:
1464
+ dockerfile_payload = base64.b64encode(fh.read()).decode("utf-8")
1465
+ os.unlink(tmp.name)
1466
+ rprint(f"[green]✅ Dockerfile context: {tar_size}B compressed[/green]")
1467
+
1468
+ if dockerimage and not preserve_entrypoint:
1469
+ rprint("[dim]Note: passing --dockerimage without --preserve-entrypoint, so gpu-dev wraps the image with the SSH harness.[/dim]")
1470
+ if preserve_entrypoint and not (dockerfile or dockerimage):
1471
+ rprint("[red]❌ --preserve-entrypoint requires --dockerfile or --dockerimage[/red]")
1472
+ sys.exit(2)
1473
+
1424
1474
  rprint(f"[cyan]🎫 Reserving {gpus}x {gpu_type.upper()} for up to {hours}h...[/cyan]")
1425
1475
  if is_multinode:
1426
1476
  reservation_ids = rm.create_multinode_reservation(
1427
1477
  user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
1428
1478
  duration_hours=hours, name=name, github_user=user_info["github_user"],
1429
- no_persistent_disk=no_persistent_disk, disk_name=disk_name)
1479
+ no_persistent_disk=no_persistent_disk, disk_name=disk_name,
1480
+ dockerfile=dockerfile_payload, dockerimage=dockerimage,
1481
+ preserve_entrypoint=preserve_entrypoint)
1430
1482
  if not reservation_ids:
1431
1483
  rprint("[red]❌ Failed to create multinode reservation[/red]")
1432
1484
  sys.exit(2)
@@ -1435,7 +1487,9 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
1435
1487
  primary_id = rm.create_reservation(
1436
1488
  user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
1437
1489
  duration_hours=hours, name=name, github_user=user_info["github_user"],
1438
- no_persistent_disk=no_persistent_disk, disk_name=disk_name)
1490
+ no_persistent_disk=no_persistent_disk, disk_name=disk_name,
1491
+ dockerfile=dockerfile_payload, dockerimage=dockerimage,
1492
+ preserve_entrypoint=preserve_entrypoint)
1439
1493
  if not primary_id:
1440
1494
  rprint("[red]❌ Failed to create reservation[/red]")
1441
1495
  sys.exit(2)
@@ -1456,7 +1510,11 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
1456
1510
  rprint(f"[dim] cancel {rid[:8]} failed: {ce}[/dim]")
1457
1511
 
1458
1512
  try:
1459
- rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active (up to {timeout}m)...[/cyan]")
1513
+ if timeout >= 60:
1514
+ wait_str = f"up to {timeout//60}h{(" " + str(timeout%60) + "m") if timeout%60 else ""}"
1515
+ else:
1516
+ wait_str = f"up to {timeout}m"
1517
+ rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active ({wait_str}; can queue when cluster is full)...[/cyan]")
1460
1518
  if is_multinode:
1461
1519
  results = rm.wait_for_multinode_reservation_completion(reservation_ids, timeout_minutes=timeout)
1462
1520
  else:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.18"
7
+ version = "0.5.20"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -36,7 +36,8 @@ RUN apt-get install -y --no-install-recommends \
36
36
  unzip \
37
37
  ccache \
38
38
  htop \
39
- tree
39
+ tree \
40
+ rsync
40
41
  # Install Node.js 20 from NodeSource (required for Claude CLI)
41
42
  RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
42
43
  apt-get install -y nodejs
@@ -0,0 +1,63 @@
1
+ # `gpu-dev submit` smoke tests
2
+
3
+ Three tests that exercise `gpu-dev submit` end-to-end. Each test lives in its
4
+ own folder so you can pass it to `--runtime` directly. Output files written by the
5
+ script are pulled back into the same folder via the post-run rsync.
6
+
7
+ > Requires `gpu-dev >= 0.5.19`. No Lambda update needed.
8
+
9
+ ## 1. success — single T4 GPU, exit 0
10
+
11
+ ```bash
12
+ cd tests/submit/success
13
+ gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
14
+ echo $? # 0
15
+ ls # nvidia-info.txt, compute.txt, status.txt all created
16
+ ```
17
+
18
+ ## 2. fail — single T4 GPU, exit 7
19
+
20
+ Writes a partial file before exploding so you can confirm rsync still pulls
21
+ output on failure and that the local exit code matches the remote's.
22
+
23
+ ```bash
24
+ cd tests/submit/fail
25
+ gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
26
+ echo $? # 7
27
+ ls # step1.txt, step2.txt, gpus-before-fail.txt — but no step3.txt
28
+ ```
29
+
30
+ ## 3. multinode — 2x H100 nodes, exit 0
31
+
32
+ Reserves 16 H100s (= 2 nodes), verifies env vars + peer ssh + NCCL all_reduce
33
+ across the whole cluster via mpirun (orchestrated entirely from rank 0).
34
+
35
+ ```bash
36
+ cd tests/submit/multinode
37
+ gpu-dev submit --gpu-type h100 --gpus 16 --runtime ./ -- bash run.sh
38
+ echo $? # 0
39
+ cat multinode-env.txt resolved-ips.txt peer-ssh.txt nccl-all_reduce.log
40
+ ```
41
+
42
+ ## What each test proves
43
+
44
+ | Test | Proves |
45
+ |------------|-------------------------------------------------------------------------------|
46
+ | success | reserve → rsync up → exec → rsync back → cancel → exit 0 |
47
+ | fail | exit code propagation; rsync-back still runs on non-zero exit; cancel fires |
48
+ | multinode | MULTINODE_* env vars; peer DNS / passwordless ssh; cross-node NCCL via mpirun |
49
+
50
+ After every run, `gpu-dev list` should show no leftover reservations — every one is auto-cancelled.
51
+ Use `--keep-alive` on any of them if you want to debug interactively afterward.
52
+
53
+ ## Other submit flags (reservation options are forwarded to `reserve`)
54
+
55
+ - `--hours N` — reservation lifetime ceiling (default 1.0)
56
+ - `--disk NAME` — attach a persistent disk to the master node
57
+ - `--no-persistent-disk` — skip persistent disk
58
+ - `--dockerfile PATH` — build a custom image from this Dockerfile
59
+ - `--dockerimage REF` — use a pre-built container image
60
+ - `--preserve-entrypoint` — keep the custom image's ENTRYPOINT (you must run sshd yourself for submit to work)
61
+ - `--timeout MINUTES` — wait-for-active timeout (default 1440 = 24h, since reservations may queue)
62
+ - `--no-pull` — skip the post-run sync-back
63
+ - `--keep-alive` — skip auto-cancel
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env bash
2
+ # Failure test for `gpu-dev submit`: writes a partial output, then exits 7.
3
+ # Verifies the post-run rsync still pulls the partial files even on failure,
4
+ # the auto-cancel runs on non-zero exit, and the local exit code is preserved.
5
+ set -e
6
+
7
+ echo "=== host ==="
8
+ hostname
9
+ date -u
10
+
11
+ # Write a partial file so we can verify it was synced back
12
+ echo "step1 done at $(date -u)" > step1.txt
13
+ nvidia-smi -L > gpus-before-fail.txt
14
+
15
+ # Now error out
16
+ echo "About to fail..." > step2.txt
17
+ python3 -c "import sys; sys.exit(7)"
18
+
19
+ # Should not reach here
20
+ echo "should-not-appear" > step3.txt
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env bash
2
+ # Multinode test for `gpu-dev submit`: runs on rank 0 only and orchestrates the
3
+ # whole cluster via mpirun (uses passwordless ssh + the headless service DNS we
4
+ # already set up). Verifies env vars, peer connectivity, and an actual NCCL
5
+ # all_reduce across all nodes.
6
+ set -euo pipefail
7
+ cd "$(dirname "$0")"
8
+
9
+ echo "=== rank 0 host: $(hostname) at $(date -u) ==="
10
+
11
+ echo "=== multinode env ==="
12
+ {
13
+ echo "MULTINODE_HOSTS=$MULTINODE_HOSTS"
14
+ echo "MULTINODE_PEER_PODS=$MULTINODE_PEER_PODS"
15
+ echo "MULTINODE_RANK=$MULTINODE_RANK"
16
+ echo "MULTINODE_SIZE=$MULTINODE_SIZE"
17
+ echo "MASTER_ADDR=$MASTER_ADDR"
18
+ echo "MASTER_PORT=$MASTER_PORT"
19
+ echo "MULTINODE_IPS=${MULTINODE_IPS:-(not set)}"
20
+ } | tee multinode-env.txt
21
+
22
+ if [[ -z "${MULTINODE_HOSTS:-}" ]]; then
23
+ echo "ERROR: MULTINODE_HOSTS empty — submit with --gpus >= 16 on h100" >&2
24
+ exit 2
25
+ fi
26
+
27
+ # Resolve IPs even if the bashrc helper didn't run (defensive)
28
+ IPS=""
29
+ for h in $(echo "$MULTINODE_HOSTS" | tr ',' ' '); do
30
+ ip=$(getent hosts "$h" | awk '{print $1}' | head -1)
31
+ [[ -n "$ip" ]] && IPS="${IPS:+$IPS,}$ip"
32
+ done
33
+ echo "Resolved IPS=$IPS" | tee resolved-ips.txt
34
+
35
+ echo "=== peer ssh check (port 2222 inside cluster) ==="
36
+ peer_host=$(echo "$MULTINODE_HOSTS" | cut -d, -f2)
37
+ ssh -o StrictHostKeyChecking=no -p 2222 "$peer_host" 'hostname; nvidia-smi -L | wc -l' \
38
+ | tee peer-ssh.txt
39
+
40
+ GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
41
+ echo "GPUS_PER_NODE=$GPUS_PER_NODE" | tee gpus-per-node.txt
42
+
43
+ # Build --host arg: ip1:N,ip2:N,...
44
+ HOST_ARG=$(echo "$IPS" | awk -v g="$GPUS_PER_NODE" -F, '{out=""; for(i=1;i<=NF;i++){out=out ($i ":" g) (i<NF?",":"")}; print out}')
45
+ echo "HOST_ARG=$HOST_ARG"
46
+
47
+ echo "=== NCCL all_reduce_perf via mpirun ==="
48
+ # Note: -g 1 = 1 GPU per process, -n 20 iterations. Sweep 1M..1G in factor-of-2 steps.
49
+ mpirun --host "$HOST_ARG" \
50
+ --mca plm_rsh_args "-p 2222 -o StrictHostKeyChecking=no" \
51
+ -x PATH -x LD_LIBRARY_PATH \
52
+ -x FI_PROVIDER -x FI_EFA_USE_DEVICE_RDMA \
53
+ -x NCCL_NET_GDR_LEVEL -x NCCL_ALGO \
54
+ -x NCCL_SOCKET_IFNAME -x NCCL_DEBUG -x NCCL_IB_HCA \
55
+ /opt/nccl-tests/build/all_reduce_perf -b 1M -e 1G -f 2 -g 1 -n 20 \
56
+ 2>&1 | tee nccl-all_reduce.log
57
+
58
+ echo "=== summary ==="
59
+ {
60
+ echo "rank=$MULTINODE_RANK size=$MULTINODE_SIZE"
61
+ echo "host_arg=$HOST_ARG"
62
+ echo "completed at $(date -u)"
63
+ } | tee summary.txt
64
+
65
+ echo "DONE"
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env bash
2
+ # Smoke test for `gpu-dev submit`: runs on a single GPU, expected exit 0.
3
+ set -euo pipefail
4
+
5
+ echo "=== host ==="
6
+ hostname
7
+ date -u
8
+
9
+ echo "=== nvidia-smi ==="
10
+ nvidia-smi | tee nvidia-info.txt
11
+
12
+ echo "=== compute ==="
13
+ python3 - <<'PY' | tee compute.txt
14
+ import torch
15
+ assert torch.cuda.is_available(), "CUDA not available"
16
+ n = torch.cuda.device_count()
17
+ x = torch.arange(1_000_000, device="cuda", dtype=torch.float32)
18
+ s = x.sum().item()
19
+ print(f"devices={n} sum(0..999_999)={s}")
20
+ PY
21
+
22
+ echo "ok at $(date -u)" > status.txt
23
+ echo "DONE"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes