gpu-dev 0.5.20__tar.gz → 0.5.21__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +12 -0
  4. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/pyproject.toml +1 -1
  5. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/Dockerfile +5 -5
  6. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/kubernetes.tf +11 -0
  7. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/.github/workflows/no-gitlinks.yml +0 -0
  8. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/.github/workflows/publish.yml +0 -0
  9. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/.gitignore +0 -0
  10. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/CLAUDE.md +0 -0
  11. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/PROGRESS.md +0 -0
  12. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/PR_DESCRIPTION.md +0 -0
  13. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/TODO.md +0 -0
  14. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/admin/README.md +0 -0
  15. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/admin/generate_stats.py +0 -0
  16. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/admin/requirements.txt +0 -0
  17. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/README.md +0 -0
  18. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  19. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  20. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  21. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  22. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  23. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  24. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  25. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  26. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  27. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  28. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  29. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  30. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  31. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  32. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  33. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  34. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/docs/USER_GUIDE.md +0 -0
  35. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/docs/devgpu-features.html +0 -0
  36. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/docs/docker-mark-blue.svg +0 -0
  37. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/docs/icons8-cursor-ai.svg +0 -0
  38. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/post.md +0 -0
  39. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/setup.cfg +0 -0
  40. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  41. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  42. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/README.md +0 -0
  43. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/alb.tf +0 -0
  44. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/availability.tf +0 -0
  45. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/backend.tf +0 -0
  46. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  47. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  48. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/bash_profile +0 -0
  49. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/bashrc +0 -0
  50. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  51. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  52. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  53. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  54. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/motd_script +0 -0
  55. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  56. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/profile +0 -0
  57. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  58. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  59. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  60. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/shell_env +0 -0
  61. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/ssh_config +0 -0
  62. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/zprofile +0 -0
  63. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/zshrc +0 -0
  64. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  65. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker-build.tf +0 -0
  66. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  67. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  68. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/ecr.tf +0 -0
  69. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/efs.tf +0 -0
  70. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/eks.tf +0 -0
  71. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/expiry.tf +0 -0
  72. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/git-cache.tf +0 -0
  73. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  74. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  75. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  76. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  77. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  78. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  79. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  80. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  81. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  82. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  83. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  84. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  85. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  86. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  87. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  88. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/lambda.tf +0 -0
  89. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/main.tf +0 -0
  90. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/mig-config.tf +0 -0
  91. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  92. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  93. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  94. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  95. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  96. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  97. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/monitoring.tf +0 -0
  98. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/outputs.tf +0 -0
  99. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/pyproject.toml +0 -0
  100. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/queue.tf +0 -0
  101. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/route53.tf +0 -0
  102. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  103. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  104. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  105. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  106. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  107. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  108. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  109. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  110. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  111. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  112. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/switch-to.sh +0 -0
  113. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  114. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  115. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  116. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  117. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/terraform-gpu-devservers/variables.tf +0 -0
  118. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/tests/submit/README.md +0 -0
  119. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/tests/submit/fail/run.sh +0 -0
  120. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/tests/submit/multinode/run.sh +0 -0
  121. {gpu_dev-0.5.20 → gpu_dev-0.5.21}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.20
3
+ Version: 0.5.21
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.20
3
+ Version: 0.5.21
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1398,6 +1398,18 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
1398
1398
  rprint("[red]❌ Provide a command after --, e.g. gpu-dev submit --runtime ./ -- python train.py[/red]")
1399
1399
  sys.exit(2)
1400
1400
 
1401
+ # Catch the common typo where the user drops the leading -- on an option name and
1402
+ # the option's value gets swept into the command (because submit accepts arbitrary
1403
+ # commands via ignore_unknown_options). Without this guard the remote shell happily
1404
+ # runs `gpus 1 bash run.sh` and the user wonders why.
1405
+ _submit_flag_names = {"gpus", "gpu-type", "hours", "disk", "no-persistent-disk",
1406
+ "dockerfile", "dockerimage", "preserve-entrypoint", "runtime",
1407
+ "no-pull", "keep-alive", "name", "timeout"}
1408
+ if command[0] in _submit_flag_names:
1409
+ rprint(f"[red]❌ '{command[0]}' looks like a missing '--'. Did you mean '--{command[0]}'? "
1410
+ f"Put your command after '--', e.g. gpu-dev submit --{command[0]} <value> ... -- bash run.sh[/red]")
1411
+ sys.exit(2)
1412
+
1401
1413
  # rsync is on macOS by default and on virtually every Linux distro; bail early with a
1402
1414
  # readable message if the user has somehow uninstalled it locally rather than failing
1403
1415
  # mid-flight after the reservation has already been created.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.20"
7
+ version = "0.5.21"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -6,15 +6,15 @@ FROM pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
6
6
  ENV DEBIAN_FRONTEND=noninteractive
7
7
  ENV TZ=UTC
8
8
 
9
- # Update package lists with retries and install essential packages
9
+ # Update apt and install base packages in a single layer so the package metadata
10
+ # is always fresh relative to the install. Splitting update from install causes
11
+ # Ubuntu to 404 on stale cached versions whenever security updates retire old .debs.
10
12
  RUN for attempt in 1 2 3; do \
11
13
  echo "Package update attempt $attempt..." && \
12
14
  apt-get update -qq && break || \
13
15
  ([ $attempt -lt 3 ] && echo "Update failed, waiting 30s..." && sleep 30) \
14
- done
15
-
16
- # Install system packages in layers for better caching
17
- RUN apt-get install -y --no-install-recommends \
16
+ done && \
17
+ apt-get install -y --no-install-recommends \
18
18
  openssh-server \
19
19
  sudo \
20
20
  curl \
@@ -416,6 +416,17 @@ resource "kubernetes_manifest" "image_prepuller_daemonset" {
416
416
  app = "image-prepuller"
417
417
  }
418
418
  }
419
+ # The prepuller does nothing but pull an image + sit in a pause container.
420
+ # Default maxUnavailable=1 makes a 32GB image roll out across 26 nodes take
421
+ # ~2.5h, so user pods landing on un-updated nodes still pull from ECR fresh.
422
+ # Parallel is safe here — restarting the prepuller doesn't disrupt anything,
423
+ # and the kubelet's image cache is independent of the prepuller pod's lifecycle.
424
+ updateStrategy = {
425
+ type = "RollingUpdate"
426
+ rollingUpdate = {
427
+ maxUnavailable = "100%"
428
+ }
429
+ }
419
430
  template = {
420
431
  metadata = {
421
432
  labels = {
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes