gpu-dev 0.5.19__tar.gz → 0.5.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +21 -0
  4. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/pyproject.toml +1 -1
  5. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/Dockerfile +7 -6
  6. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/kubernetes.tf +11 -0
  7. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/.github/workflows/no-gitlinks.yml +0 -0
  8. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/.github/workflows/publish.yml +0 -0
  9. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/.gitignore +0 -0
  10. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/CLAUDE.md +0 -0
  11. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/PROGRESS.md +0 -0
  12. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/PR_DESCRIPTION.md +0 -0
  13. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/TODO.md +0 -0
  14. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/admin/README.md +0 -0
  15. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/admin/generate_stats.py +0 -0
  16. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/admin/requirements.txt +0 -0
  17. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/README.md +0 -0
  18. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  19. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  20. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  21. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  22. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  23. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  24. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  25. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  26. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  27. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  28. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  29. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  30. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  31. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  32. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  33. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  34. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/docs/USER_GUIDE.md +0 -0
  35. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/docs/devgpu-features.html +0 -0
  36. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/docs/docker-mark-blue.svg +0 -0
  37. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/docs/icons8-cursor-ai.svg +0 -0
  38. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/post.md +0 -0
  39. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/setup.cfg +0 -0
  40. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  41. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  42. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/README.md +0 -0
  43. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/alb.tf +0 -0
  44. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/availability.tf +0 -0
  45. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/backend.tf +0 -0
  46. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  47. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  48. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/bash_profile +0 -0
  49. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/bashrc +0 -0
  50. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  51. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  52. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  53. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  54. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/motd_script +0 -0
  55. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  56. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/profile +0 -0
  57. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  58. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  59. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  60. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/shell_env +0 -0
  61. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/ssh_config +0 -0
  62. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/zprofile +0 -0
  63. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/zshrc +0 -0
  64. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  65. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker-build.tf +0 -0
  66. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  67. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  68. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/ecr.tf +0 -0
  69. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/efs.tf +0 -0
  70. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/eks.tf +0 -0
  71. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/expiry.tf +0 -0
  72. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/git-cache.tf +0 -0
  73. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  74. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  75. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  76. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  77. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  78. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  79. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  80. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  81. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  82. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  83. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  84. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  85. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  86. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  87. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  88. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda.tf +0 -0
  89. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/main.tf +0 -0
  90. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/mig-config.tf +0 -0
  91. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  92. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  93. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  94. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  95. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  96. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  97. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/monitoring.tf +0 -0
  98. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/outputs.tf +0 -0
  99. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/pyproject.toml +0 -0
  100. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/queue.tf +0 -0
  101. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/route53.tf +0 -0
  102. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  103. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  104. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  105. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  106. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  107. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  108. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  109. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  110. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  111. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  112. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/switch-to.sh +0 -0
  113. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  114. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  115. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  116. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  117. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/terraform-gpu-devservers/variables.tf +0 -0
  118. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/tests/submit/README.md +0 -0
  119. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/tests/submit/fail/run.sh +0 -0
  120. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/tests/submit/multinode/run.sh +0 -0
  121. {gpu_dev-0.5.19 → gpu_dev-0.5.21}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.19
3
+ Version: 0.5.21
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.19
3
+ Version: 0.5.21
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1398,6 +1398,27 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
1398
1398
  rprint("[red]❌ Provide a command after --, e.g. gpu-dev submit --runtime ./ -- python train.py[/red]")
1399
1399
  sys.exit(2)
1400
1400
 
1401
+ # Catch the common typo where the user drops the leading -- on an option name and
1402
+ # the option's value gets swept into the command (because submit accepts arbitrary
1403
+ # commands via ignore_unknown_options). Without this guard the remote shell happily
1404
+ # runs `gpus 1 bash run.sh` and the user wonders why.
1405
+ _submit_flag_names = {"gpus", "gpu-type", "hours", "disk", "no-persistent-disk",
1406
+ "dockerfile", "dockerimage", "preserve-entrypoint", "runtime",
1407
+ "no-pull", "keep-alive", "name", "timeout"}
1408
+ if command[0] in _submit_flag_names:
1409
+ rprint(f"[red]❌ '{command[0]}' looks like a missing '--'. Did you mean '--{command[0]}'? "
1410
+ f"Put your command after '--', e.g. gpu-dev submit --{command[0]} <value> ... -- bash run.sh[/red]")
1411
+ sys.exit(2)
1412
+
1413
+ # rsync is on macOS by default and on virtually every Linux distro; bail early with a
1414
+ # readable message if the user has somehow uninstalled it locally rather than failing
1415
+ # mid-flight after the reservation has already been created.
1416
+ if runtime:
1417
+ import shutil
1418
+ if not shutil.which("rsync"):
1419
+ rprint("[red]❌ rsync not found on PATH locally. Install it (Mac: 'brew install rsync', Debian/Ubuntu: 'sudo apt install rsync') and retry.[/red]")
1420
+ sys.exit(2)
1421
+
1401
1422
  gt = gpu_type.lower()
1402
1423
  # Per-type max GPUs (mirrors gpu_configs in reserve flow)
1403
1424
  max_per_node = {
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.19"
7
+ version = "0.5.21"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -6,15 +6,15 @@ FROM pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
6
6
  ENV DEBIAN_FRONTEND=noninteractive
7
7
  ENV TZ=UTC
8
8
 
9
- # Update package lists with retries and install essential packages
9
+ # Update apt and install base packages in a single layer so the package metadata
10
+ # is always fresh relative to the install. Splitting update from install causes
11
+ # Ubuntu to 404 on stale cached versions whenever security updates retire old .debs.
10
12
  RUN for attempt in 1 2 3; do \
11
13
  echo "Package update attempt $attempt..." && \
12
14
  apt-get update -qq && break || \
13
15
  ([ $attempt -lt 3 ] && echo "Update failed, waiting 30s..." && sleep 30) \
14
- done
15
-
16
- # Install system packages in layers for better caching
17
- RUN apt-get install -y --no-install-recommends \
16
+ done && \
17
+ apt-get install -y --no-install-recommends \
18
18
  openssh-server \
19
19
  sudo \
20
20
  curl \
@@ -36,7 +36,8 @@ RUN apt-get install -y --no-install-recommends \
36
36
  unzip \
37
37
  ccache \
38
38
  htop \
39
- tree
39
+ tree \
40
+ rsync
40
41
  # Install Node.js 20 from NodeSource (required for Claude CLI)
41
42
  RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
42
43
  apt-get install -y nodejs
@@ -416,6 +416,17 @@ resource "kubernetes_manifest" "image_prepuller_daemonset" {
416
416
  app = "image-prepuller"
417
417
  }
418
418
  }
419
+ # The prepuller does nothing but pull an image + sit in a pause container.
420
+ # Default maxUnavailable=1 makes a 32GB image roll out across 26 nodes take
421
+ # ~2.5h, so user pods landing on un-updated nodes still pull from ECR fresh.
422
+ # Parallel is safe here — restarting the prepuller doesn't disrupt anything,
423
+ # and the kubelet's image cache is independent of the prepuller pod's lifecycle.
424
+ updateStrategy = {
425
+ type = "RollingUpdate"
426
+ rollingUpdate = {
427
+ maxUnavailable = "100%"
428
+ }
429
+ }
419
430
  template = {
420
431
  metadata = {
421
432
  labels = {
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes