gpu-dev 0.5.18__tar.gz → 0.5.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +5 -1
  4. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +55 -6
  5. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/pyproject.toml +1 -1
  6. gpu_dev-0.5.19/tests/submit/README.md +63 -0
  7. gpu_dev-0.5.19/tests/submit/fail/run.sh +20 -0
  8. gpu_dev-0.5.19/tests/submit/multinode/run.sh +65 -0
  9. gpu_dev-0.5.19/tests/submit/success/run.sh +23 -0
  10. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/.github/workflows/no-gitlinks.yml +0 -0
  11. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/.github/workflows/publish.yml +0 -0
  12. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/.gitignore +0 -0
  13. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/CLAUDE.md +0 -0
  14. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/PROGRESS.md +0 -0
  15. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/PR_DESCRIPTION.md +0 -0
  16. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/TODO.md +0 -0
  17. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/admin/README.md +0 -0
  18. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/admin/generate_stats.py +0 -0
  19. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/admin/requirements.txt +0 -0
  20. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/README.md +0 -0
  21. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  22. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  23. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  24. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  25. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  26. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  27. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  28. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  29. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  30. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  31. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  32. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  33. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  34. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  35. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  36. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/docs/USER_GUIDE.md +0 -0
  37. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/docs/devgpu-features.html +0 -0
  38. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/docs/docker-mark-blue.svg +0 -0
  39. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/docs/icons8-cursor-ai.svg +0 -0
  40. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/post.md +0 -0
  41. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/setup.cfg +0 -0
  42. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  43. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  44. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/README.md +0 -0
  45. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/alb.tf +0 -0
  46. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/availability.tf +0 -0
  47. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/backend.tf +0 -0
  48. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  49. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  50. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  51. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bash_profile +0 -0
  52. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc +0 -0
  53. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  54. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  55. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  56. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  57. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/motd_script +0 -0
  58. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  59. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/profile +0 -0
  60. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  61. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  62. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  63. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/shell_env +0 -0
  64. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/ssh_config +0 -0
  65. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zprofile +0 -0
  66. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc +0 -0
  67. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  68. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-build.tf +0 -0
  69. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  70. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  71. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ecr.tf +0 -0
  72. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/efs.tf +0 -0
  73. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/eks.tf +0 -0
  74. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/expiry.tf +0 -0
  75. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/git-cache.tf +0 -0
  76. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/kubernetes.tf +0 -0
  77. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  78. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  79. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  80. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  81. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  82. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  83. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  84. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  85. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  86. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  87. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  88. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  89. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  90. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  91. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  92. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda.tf +0 -0
  93. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/main.tf +0 -0
  94. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-config.tf +0 -0
  95. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  96. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  97. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  98. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  99. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  100. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  101. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/monitoring.tf +0 -0
  102. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/outputs.tf +0 -0
  103. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/pyproject.toml +0 -0
  104. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/queue.tf +0 -0
  105. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/route53.tf +0 -0
  106. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  107. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  108. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  109. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  110. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  111. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  112. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  113. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  114. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  115. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  116. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/switch-to.sh +0 -0
  117. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  118. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  119. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  120. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  121. {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.18
3
+ Version: 0.5.19
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.18
3
+ Version: 0.5.19
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -112,4 +112,8 @@ terraform-gpu-devservers/ssh-proxy/requirements.txt
112
112
  terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
113
113
  terraform-gpu-devservers/templates/al2023-user-data.sh
114
114
  terraform-gpu-devservers/templates/user-data-self-managed.sh
115
- terraform-gpu-devservers/templates/user-data.sh
115
+ terraform-gpu-devservers/templates/user-data.sh
116
+ tests/submit/README.md
117
+ tests/submit/fail/run.sh
118
+ tests/submit/multinode/run.sh
119
+ tests/submit/success/run.sh
@@ -1357,18 +1357,26 @@ _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200"
1357
1357
  @main.command(context_settings={"ignore_unknown_options": True})
1358
1358
  @click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
1359
1359
  @click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
1360
- @click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation duration ceiling (job auto-cancels on exit).")
1360
+ @click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling job auto-cancels well before this if it finishes.")
1361
1361
  @click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
1362
1362
  @click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
1363
+ @click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
1364
+ help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
1365
+ @click.option("--dockerimage", type=str, default=None,
1366
+ help="Pre-built container image reference (e.g. ghcr.io/me/img:tag) to run instead of the default.")
1367
+ @click.option("--preserve-entrypoint", is_flag=True,
1368
+ help="Keep the custom image's ENTRYPOINT/CMD instead of letting gpu-dev wrap with the SSH harness. Note: submit needs SSH to work.")
1363
1369
  @click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
1364
1370
  help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
1365
1371
  @click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
1366
1372
  @click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
1367
1373
  @click.option("--name", type=str, default=None, help="Reservation name.")
1368
- @click.option("--timeout", type=int, default=20, show_default=True, help="Minutes to wait for the reservation to become active.")
1374
+ @click.option("--timeout", type=int, default=24 * 60, show_default=True,
1375
+ help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
1369
1376
  @click.argument("command", nargs=-1, required=True)
1370
1377
  @click.pass_context
1371
- def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pull, keep_alive, name, timeout, command):
1378
+ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
1379
+ runtime, no_pull, keep_alive, name, timeout, command):
1372
1380
  """Submit a job: reserve, sync code, run, sync results back, auto-cancel.
1373
1381
 
1374
1382
  \b
@@ -1421,12 +1429,47 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
1421
1429
  # SSH into rank 0, so passing --disk is fine.
1422
1430
  disk_name = None if no_persistent_disk else disk
1423
1431
 
1432
+ # Build dockerfile context if provided (mirrors the reserve-flow logic)
1433
+ dockerfile_payload = None
1434
+ if dockerfile:
1435
+ import os, tarfile, tempfile, base64
1436
+ if os.path.getsize(dockerfile) > 512 * 1024:
1437
+ rprint("[red]❌ Dockerfile too large (max 512KB)[/red]")
1438
+ sys.exit(2)
1439
+ ctx_dir = os.path.dirname(os.path.abspath(dockerfile))
1440
+ rprint(f"[cyan]📦 Building tar.gz context from {ctx_dir}[/cyan]")
1441
+ with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
1442
+ with tarfile.open(tmp.name, "w:gz") as tar:
1443
+ for root, _, files in os.walk(ctx_dir):
1444
+ for f in files:
1445
+ full = os.path.join(root, f)
1446
+ tar.add(full, arcname=os.path.relpath(full, ctx_dir))
1447
+ if os.path.basename(dockerfile).lower() != "dockerfile":
1448
+ tar.add(dockerfile, arcname="Dockerfile")
1449
+ tar_size = os.path.getsize(tmp.name)
1450
+ if tar_size > 700 * 1024:
1451
+ os.unlink(tmp.name)
1452
+ rprint(f"[red]❌ Build context too large: {tar_size}B (max ~700KB compressed)[/red]")
1453
+ sys.exit(2)
1454
+ with open(tmp.name, "rb") as fh:
1455
+ dockerfile_payload = base64.b64encode(fh.read()).decode("utf-8")
1456
+ os.unlink(tmp.name)
1457
+ rprint(f"[green]✅ Dockerfile context: {tar_size}B compressed[/green]")
1458
+
1459
+ if dockerimage and not preserve_entrypoint:
1460
+ rprint("[dim]Note: passing --dockerimage without --preserve-entrypoint, so gpu-dev wraps the image with the SSH harness.[/dim]")
1461
+ if preserve_entrypoint and not (dockerfile or dockerimage):
1462
+ rprint("[red]❌ --preserve-entrypoint requires --dockerfile or --dockerimage[/red]")
1463
+ sys.exit(2)
1464
+
1424
1465
  rprint(f"[cyan]🎫 Reserving {gpus}x {gpu_type.upper()} for up to {hours}h...[/cyan]")
1425
1466
  if is_multinode:
1426
1467
  reservation_ids = rm.create_multinode_reservation(
1427
1468
  user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
1428
1469
  duration_hours=hours, name=name, github_user=user_info["github_user"],
1429
- no_persistent_disk=no_persistent_disk, disk_name=disk_name)
1470
+ no_persistent_disk=no_persistent_disk, disk_name=disk_name,
1471
+ dockerfile=dockerfile_payload, dockerimage=dockerimage,
1472
+ preserve_entrypoint=preserve_entrypoint)
1430
1473
  if not reservation_ids:
1431
1474
  rprint("[red]❌ Failed to create multinode reservation[/red]")
1432
1475
  sys.exit(2)
@@ -1435,7 +1478,9 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
1435
1478
  primary_id = rm.create_reservation(
1436
1479
  user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
1437
1480
  duration_hours=hours, name=name, github_user=user_info["github_user"],
1438
- no_persistent_disk=no_persistent_disk, disk_name=disk_name)
1481
+ no_persistent_disk=no_persistent_disk, disk_name=disk_name,
1482
+ dockerfile=dockerfile_payload, dockerimage=dockerimage,
1483
+ preserve_entrypoint=preserve_entrypoint)
1439
1484
  if not primary_id:
1440
1485
  rprint("[red]❌ Failed to create reservation[/red]")
1441
1486
  sys.exit(2)
@@ -1456,7 +1501,11 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
1456
1501
  rprint(f"[dim] cancel {rid[:8]} failed: {ce}[/dim]")
1457
1502
 
1458
1503
  try:
1459
- rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active (up to {timeout}m)...[/cyan]")
1504
+ if timeout >= 60:
1505
+ wait_str = f"up to {timeout//60}h{(" " + str(timeout%60) + "m") if timeout%60 else ""}"
1506
+ else:
1507
+ wait_str = f"up to {timeout}m"
1508
+ rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active ({wait_str}; can queue when cluster is full)...[/cyan]")
1460
1509
  if is_multinode:
1461
1510
  results = rm.wait_for_multinode_reservation_completion(reservation_ids, timeout_minutes=timeout)
1462
1511
  else:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.18"
7
+ version = "0.5.19"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -0,0 +1,63 @@
1
+ # `gpu-dev submit` smoke tests
2
+
3
+ Three tests that exercise `gpu-dev submit` end-to-end. Each test lives in its
4
+ own folder so you can `--runtime` it directly. Output files written by the
5
+ script are pulled back into the same folder via the post-run rsync.
6
+
7
+ > Requires `gpu-dev >= 0.5.19`. No Lambda update needed.
8
+
9
+ ## 1. success — single T4 GPU, exit 0
10
+
11
+ ```bash
12
+ cd tests/submit/success
13
+ gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
14
+ echo $? # 0
15
+ ls # nvidia-info.txt, compute.txt, status.txt all created
16
+ ```
17
+
18
+ ## 2. fail — single T4 GPU, exit 7
19
+
20
+ Writes a partial file before exploding so you can confirm rsync still pulls
21
+ output on failure and the local exit code is the remote's.
22
+
23
+ ```bash
24
+ cd tests/submit/fail
25
+ gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
26
+ echo $? # 7
27
+ ls # step1.txt, step2.txt, gpus-before-fail.txt — but no step3.txt
28
+ ```
29
+
30
+ ## 3. multinode — 2x H100 nodes, exit 0
31
+
32
+ Reserves 16 H100s (= 2 nodes), verifies env vars + peer ssh + NCCL all_reduce
33
+ across the whole cluster via mpirun (orchestrated entirely from rank 0).
34
+
35
+ ```bash
36
+ cd tests/submit/multinode
37
+ gpu-dev submit --gpu-type h100 --gpus 16 --runtime ./ -- bash run.sh
38
+ echo $? # 0
39
+ cat multinode-env.txt resolved-ips.txt peer-ssh.txt nccl-all_reduce.log
40
+ ```
41
+
42
+ ## What each test proves
43
+
44
+ | Test | Proves |
45
+ |------------|-------------------------------------------------------------------------------|
46
+ | success | reserve → rsync up → exec → rsync back → cancel → exit 0 |
47
+ | fail | exit code propagation; rsync-back still runs on non-zero exit; cancel fires |
48
+ | multinode | MULTINODE_* env vars; peer DNS / passwordless ssh; cross-node NCCL via mpirun |
49
+
50
+ After every run, `gpu-dev list` should show no leftover reservations — every reservation the run created is auto-cancelled.
51
+ Use `--keep-alive` on any of them if you want to debug interactively afterward.
52
+
53
+ ## Other submit flags (forwarded to `reserve`)
54
+
55
+ - `--hours N` — reservation lifetime ceiling (default 1.0)
56
+ - `--disk NAME` — attach a persistent disk to the master node
57
+ - `--no-persistent-disk` — skip persistent disk
58
+ - `--dockerfile PATH` — build a custom image from this Dockerfile
59
+ - `--dockerimage REF` — use a pre-built container image
60
+ - `--preserve-entrypoint` — keep the custom image's ENTRYPOINT (you must run sshd yourself for submit to work)
61
+ - `--timeout MINUTES` — wait-for-active timeout (default 1440 = 24h, since reservations may queue)
62
+ - `--no-pull` — skip the post-run sync-back
63
+ - `--keep-alive` — skip auto-cancel
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env bash
2
+ # Failure test for `gpu-dev submit`: writes a partial output, then exits 7.
3
+ # Verifies the post-run rsync still pulls the partial files even on failure,
4
+ # the auto-cancel runs on non-zero exit, and the local exit code is preserved.
5
+ set -e
6
+
7
+ echo "=== host ==="
8
+ hostname
9
+ date -u
10
+
11
+ # Write a partial file so we can verify it was synced back
12
+ echo "step1 done at $(date -u)" > step1.txt
13
+ nvidia-smi -L > gpus-before-fail.txt
14
+
15
+ # Now error out
16
+ echo "About to fail..." > step2.txt
17
+ python3 -c "import sys; sys.exit(7)"
18
+
19
+ # Should not reach here
20
+ echo "should-not-appear" > step3.txt
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env bash
2
+ # Multinode test for `gpu-dev submit`: runs on rank 0 only and orchestrates the
3
+ # whole cluster via mpirun (uses passwordless ssh + the headless service DNS we
4
+ # already set up). Verifies env vars, peer connectivity, and an actual NCCL
5
+ # all_reduce across all nodes.
6
+ set -euo pipefail
7
+ cd "$(dirname "$0")"
8
+
9
+ echo "=== rank 0 host: $(hostname) at $(date -u) ==="
10
+
11
+ echo "=== multinode env ==="
12
+ {
13
+ echo "MULTINODE_HOSTS=$MULTINODE_HOSTS"
14
+ echo "MULTINODE_PEER_PODS=$MULTINODE_PEER_PODS"
15
+ echo "MULTINODE_RANK=$MULTINODE_RANK"
16
+ echo "MULTINODE_SIZE=$MULTINODE_SIZE"
17
+ echo "MASTER_ADDR=$MASTER_ADDR"
18
+ echo "MASTER_PORT=$MASTER_PORT"
19
+ echo "MULTINODE_IPS=${MULTINODE_IPS:-(not set)}"
20
+ } | tee multinode-env.txt
21
+
22
+ if [[ -z "${MULTINODE_HOSTS:-}" ]]; then
23
+ echo "ERROR: MULTINODE_HOSTS empty — submit with --gpus >= 16 on h100" >&2
24
+ exit 2
25
+ fi
26
+
27
+ # Resolve IPs even if the bashrc helper didn't run (defensive)
28
+ IPS=""
29
+ for h in $(echo "$MULTINODE_HOSTS" | tr ',' ' '); do
30
+ ip=$(getent hosts "$h" | awk '{print $1}' | head -1)
31
+ [[ -n "$ip" ]] && IPS="${IPS:+$IPS,}$ip"
32
+ done
33
+ echo "Resolved IPS=$IPS" | tee resolved-ips.txt
34
+
35
+ echo "=== peer ssh check (port 2222 inside cluster) ==="
36
+ peer_host=$(echo "$MULTINODE_HOSTS" | cut -d, -f2)
37
+ ssh -o StrictHostKeyChecking=no -p 2222 "$peer_host" 'hostname; nvidia-smi -L | wc -l' \
38
+ | tee peer-ssh.txt
39
+
40
+ GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
41
+ echo "GPUS_PER_NODE=$GPUS_PER_NODE" | tee gpus-per-node.txt
42
+
43
+ # Build --host arg: ip1:N,ip2:N,...
44
+ HOST_ARG=$(echo "$IPS" | awk -v g="$GPUS_PER_NODE" -F, '{out=""; for(i=1;i<=NF;i++){out=out ($i ":" g) (i<NF?",":"")}; print out}')
45
+ echo "HOST_ARG=$HOST_ARG"
46
+
47
+ echo "=== NCCL all_reduce_perf via mpirun ==="
48
+ # Note: -g 1 = 1 GPU per process, -n 20 iterations. Sweep 1M..1G in factor-of-2 steps.
49
+ mpirun --host "$HOST_ARG" \
50
+ --mca plm_rsh_args "-p 2222 -o StrictHostKeyChecking=no" \
51
+ -x PATH -x LD_LIBRARY_PATH \
52
+ -x FI_PROVIDER -x FI_EFA_USE_DEVICE_RDMA \
53
+ -x NCCL_NET_GDR_LEVEL -x NCCL_ALGO \
54
+ -x NCCL_SOCKET_IFNAME -x NCCL_DEBUG -x NCCL_IB_HCA \
55
+ /opt/nccl-tests/build/all_reduce_perf -b 1M -e 1G -f 2 -g 1 -n 20 \
56
+ 2>&1 | tee nccl-all_reduce.log
57
+
58
+ echo "=== summary ==="
59
+ {
60
+ echo "rank=$MULTINODE_RANK size=$MULTINODE_SIZE"
61
+ echo "host_arg=$HOST_ARG"
62
+ echo "completed at $(date -u)"
63
+ } | tee summary.txt
64
+
65
+ echo "DONE"
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env bash
2
+ # Smoke test for `gpu-dev submit`: runs on a single GPU, expected exit 0.
3
+ set -euo pipefail
4
+
5
+ echo "=== host ==="
6
+ hostname
7
+ date -u
8
+
9
+ echo "=== nvidia-smi ==="
10
+ nvidia-smi | tee nvidia-info.txt
11
+
12
+ echo "=== compute ==="
13
+ python3 - <<'PY' | tee compute.txt
14
+ import torch
15
+ assert torch.cuda.is_available(), "CUDA not available"
16
+ n = torch.cuda.device_count()
17
+ x = torch.arange(1_000_000, device="cuda", dtype=torch.float32)
18
+ s = x.sum().item()
19
+ print(f"devices={n} sum(0..999_999)={s}")
20
+ PY
21
+
22
+ echo "ok at $(date -u)" > status.txt
23
+ echo "DONE"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes