gpu-dev 0.5.21__tar.gz → 0.5.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/PKG-INFO +1 -1
  2. gpu_dev-0.5.22/README.md +143 -0
  3. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  4. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +2 -0
  5. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +3 -0
  6. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/pyproject.toml +1 -1
  7. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/Dockerfile +4 -1
  8. gpu_dev-0.5.22/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +88 -0
  9. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_processor/index.py +18 -0
  10. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda.tf +1 -1
  11. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/.github/workflows/no-gitlinks.yml +0 -0
  12. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/.github/workflows/publish.yml +0 -0
  13. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/.gitignore +0 -0
  14. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/CLAUDE.md +0 -0
  15. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/PROGRESS.md +0 -0
  16. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/PR_DESCRIPTION.md +0 -0
  17. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/TODO.md +0 -0
  18. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/admin/README.md +0 -0
  19. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/admin/generate_stats.py +0 -0
  20. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/admin/requirements.txt +0 -0
  21. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/README.md +0 -0
  22. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  23. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  24. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  25. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  26. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  27. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  28. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  29. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
  30. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  31. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  32. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  33. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  34. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  35. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  36. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  37. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/docs/USER_GUIDE.md +0 -0
  38. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/docs/devgpu-features.html +0 -0
  39. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/docs/docker-mark-blue.svg +0 -0
  40. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/docs/icons8-cursor-ai.svg +0 -0
  41. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/post.md +0 -0
  42. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/setup.cfg +0 -0
  43. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  44. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  45. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/README.md +0 -0
  46. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/alb.tf +0 -0
  47. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/availability.tf +0 -0
  48. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/backend.tf +0 -0
  49. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  50. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  51. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/bash_profile +0 -0
  52. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/bashrc +0 -0
  53. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  54. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  55. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  56. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  57. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/motd_script +0 -0
  58. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  59. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/profile +0 -0
  60. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  61. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  62. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  63. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/shell_env +0 -0
  64. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/ssh_config +0 -0
  65. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/zprofile +0 -0
  66. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/zshrc +0 -0
  67. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  68. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker-build.tf +0 -0
  69. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  70. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  71. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ecr.tf +0 -0
  72. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/efs.tf +0 -0
  73. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/eks.tf +0 -0
  74. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/expiry.tf +0 -0
  75. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/git-cache.tf +0 -0
  76. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/kubernetes.tf +0 -0
  77. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  78. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  79. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  80. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  81. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  82. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  83. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  84. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  85. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  86. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  87. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  88. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  89. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  90. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  91. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/main.tf +0 -0
  92. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/mig-config.tf +0 -0
  93. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  94. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  95. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  96. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  97. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  98. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  99. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/monitoring.tf +0 -0
  100. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/outputs.tf +0 -0
  101. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/pyproject.toml +0 -0
  102. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/queue.tf +0 -0
  103. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/route53.tf +0 -0
  104. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  105. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  106. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  107. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  108. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  109. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  110. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  111. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  112. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  113. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  114. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/switch-to.sh +0 -0
  115. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  116. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  117. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  118. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  119. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/variables.tf +0 -0
  120. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/tests/submit/README.md +0 -0
  121. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/tests/submit/fail/run.sh +0 -0
  122. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/tests/submit/multinode/run.sh +0 -0
  123. {gpu_dev-0.5.21 → gpu_dev-0.5.22}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.21
3
+ Version: 0.5.22
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -0,0 +1,143 @@
1
+ # osdc — Open Source Developer Cloud
2
+
3
+ A self-hosted developer platform for GPU work. Devs ask for `1 / 2 / 4 / 8`
4
+ GPUs of a given type, the platform parks them on a Kubernetes pod with SSH
5
+ access, and tears it down when the reservation expires.
6
+
7
+ Built for PyTorch contributors — auth is via the GitHub public keys of users
8
+ with commit access — but the design is generic enough to plug into other
9
+ groups.
10
+
11
+ ## What you get
12
+
13
+ - **Python CLI** (`gpu-dev`) with `reserve`, `list`, `extend`, `cancel`, and
14
+ `config` commands. Real-time polling until your pod is ready.
15
+ - **GPU types**: T4, L4, A100, H100, B200. Pick the count (1, 2, 4, 8) and the
16
+ duration in hours (fractional is fine, e.g. `--hours 0.25`).
17
+ - **SSH** straight into the pod via NodePort, with **your own GitHub public
18
+ keys** injected — no separate credentials to manage.
19
+ - **Persistent disk** that survives between reservations (opt-in), backed by
20
+ EBS snapshots. Or run with `--no-persist` for a clean `EmptyDir` workspace.
21
+ - **20 TB shared EFS** mounted at `/shared` with per-user folders.
22
+ - **NVIDIA profiling** ready out of the box (`ncu` / `nsys` work without
23
+ manual driver tweaks), with one node per GPU type reserved as
24
+ profiling-dedicated.
25
+ - **Grafana** dashboard at `<node-ip>:30080` with NVIDIA DCGM exporter
26
+ metrics — utilization, memory, temp, power.
27
+ - **Multi-node NCCL** working over EFA with `OFI_NCCL_PROTOCOL=SENDRECV`.
28
+ Tree algo gets ~21 GB/s bus bandwidth across 2× p5.48xlarge (16 H100).
29
+
30
+ ## How it fits together
31
+
32
+ ```
33
+ ┌────────┐ reserve ┌────────┐ enqueue ┌────────────┐
34
+ │ CLI │ ───────────► │ API │ ────────► │ SQS │
35
+ └────────┘ └────────┘ └─────┬──────┘
36
+ ▲ poll │
37
+ │ ▼
38
+ │ ┌──────────────────────────────────────┐
39
+ │ │ Lambda reservation processor │
40
+ │ │ - pick a node with free GPUs │
41
+ │ │ - attach EBS, mount /shared (EFS) │
42
+ │ │ - create K8s pod, inject GH keys │
43
+ │ └────────────────┬─────────────────────┘
44
+ │ │
45
+ │ ▼
46
+ │ ┌──────────────────┐
47
+ │ │ EKS (k8s) │
48
+ │ SSH (NodePort) │ GPU node groups │
49
+ └─────────────────────┤ T4 / L4 / H100 │
50
+ │ B200 / ... │
51
+ └──────────────────┘
52
+
53
+ DynamoDB holds reservation state & history; CloudWatch logs the lambdas.
54
+ ```
55
+
56
+ ## Repository layout
57
+
58
+ ```
59
+ .
60
+ ├── cli-tools/ # `gpu-dev` Python CLI (pyproject.toml)
61
+ ├── terraform-gpu-devservers/
62
+ │ # OpenTofu modules for EKS, node groups,
63
+ │ # SQS, Lambda, DynamoDB, EFS, monitoring
64
+ ├── admin/ # operator scripts
65
+ ├── docs/ # user guide and architecture notes
66
+ └── tests/
67
+ ```
68
+
69
+ ## Getting started — as a user
70
+
71
+ You need: GitHub access to the configured org (PyTorch by default), and your
72
+ public keys uploaded to GitHub.
73
+
74
+ ```bash
75
+ # 1. Install the CLI
76
+ pip install -e ./cli-tools/gpu-dev-cli
77
+
78
+ # 2. Point it at your deployment
79
+ gpu-dev config # walks you through API URL + GitHub username
80
+
81
+ # 3. Reserve a GPU
82
+ gpu-dev reserve -g 1 -t h100 -h 2 # 1× H100 for 2 hours
83
+ gpu-dev reserve -g 8 -t b200 -h 24 # 8× B200 for a day
84
+ gpu-dev reserve -g 1 -t t4 -h 0.25 # 1× T4 for 15 minutes
85
+
86
+ # 4. Watch it come up; SSH instructions print when ready
87
+ gpu-dev list
88
+
89
+ # 5. Extend if you need more time (max total 48 h)
90
+ gpu-dev extend <reservation-id> --hours 12
91
+
92
+ # 6. Done? Free it up.
93
+ gpu-dev cancel <reservation-id>
94
+ ```
95
+
96
+ Each reservation drops an SSH config file at
97
+ `~/.devgpu/<reservation_id>-sshconfig`, so connecting is just:
98
+
99
+ ```bash
100
+ ssh -F ~/.devgpu/<reservation_id>-sshconfig gpu-dev
101
+ ```
102
+
103
+ ## Getting started — as an operator
104
+
105
+ You need: an AWS account with EC2 GPU capacity (reserved or on-demand), an
106
+ OpenTofu workstation, and credentials for whatever IAM role the modules
107
+ assume.
108
+
109
+ ```bash
110
+ cd terraform-gpu-devservers
111
+ tf init # `tf` is aliased to `opentofu` in this repo
112
+ tf plan # read-only — agents are restricted to this
113
+ tf apply # only on a real workstation, not via the agent
114
+ ```
115
+
116
+ Important variables to set in your `*.tfvars`:
117
+
118
+ - `aws_region` (defaults to `us-east-2`)
119
+ - node group sizing per GPU type (T4 / L4 / H100 / B200)
120
+ - `grafana_admin_password`
121
+ - the GitHub org/team that's allowed to reserve
122
+
123
+ Once nodes are up, label one per GPU type as profiling-dedicated so DCGM
124
+ doesn't fight Nsight for the device:
125
+
126
+ ```bash
127
+ kubectl label node <h100-node> gpu.monitoring/profiling-dedicated=true
128
+ kubectl label node <b200-node> gpu.monitoring/profiling-dedicated=true
129
+ ```
130
+
131
+ Grafana lands at `http://<node-ip>:30080` (admin / your configured password).
132
+ Pre-loaded dashboards: NVIDIA DCGM (community ID 12239) and a custom GPU
133
+ overview.
134
+
135
+ ## Status
136
+
137
+ Working end-to-end on T4 / L4 / H100. B200 supported with on-demand capacity.
138
+ Active development — see [`PROGRESS.md`](PROGRESS.md) and [`TODO.md`](TODO.md)
139
+ for what's in flight and what's queued.
140
+
141
+ ## License
142
+
143
+ See [`LICENSE`](LICENSE) once added. For now: ask before reusing.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.21
3
+ Version: 0.5.22
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -2,6 +2,7 @@
2
2
  CLAUDE.md
3
3
  PROGRESS.md
4
4
  PR_DESCRIPTION.md
5
+ README.md
5
6
  TODO.md
6
7
  post.md
7
8
  pyproject.toml
@@ -44,6 +45,7 @@ terraform-gpu-devservers/efs.tf
44
45
  terraform-gpu-devservers/eks.tf
45
46
  terraform-gpu-devservers/expiry.tf
46
47
  terraform-gpu-devservers/git-cache.tf
48
+ terraform-gpu-devservers/gpu-dev-pod-irsa.tf
47
49
  terraform-gpu-devservers/kubernetes.tf
48
50
  terraform-gpu-devservers/lambda.tf
49
51
  terraform-gpu-devservers/main.tf
@@ -89,6 +89,7 @@ def select_gpu_type_interactive(
89
89
  table = Table()
90
90
  table.add_column("GPU Type", style="cyan")
91
91
  table.add_column("Avail", style="green")
92
+ table.add_column("Max\nReservable", style="bright_green")
92
93
  table.add_column("Total", style="blue")
93
94
  table.add_column("Queue\nLength", style="yellow")
94
95
  table.add_column("Est. Wait Time", style="magenta")
@@ -96,6 +97,7 @@ def select_gpu_type_interactive(
96
97
  choices = []
97
98
  for gpu_type, info in visible_info.items():
98
99
  available = info.get("available", 0)
100
+ max_reservable = info.get("max_reservable", 0)
99
101
  total = info.get("total", 0)
100
102
  queue_length = info.get("queue_length", 0)
101
103
  est_wait = info.get("estimated_wait_minutes", 0)
@@ -134,6 +136,7 @@ def select_gpu_type_interactive(
134
136
  table.add_row(
135
137
  gpu_type.upper(),
136
138
  available_display,
139
+ "-" if is_maintenance else str(max_reservable),
137
140
  str(total),
138
141
  str(queue_length) if not is_maintenance else "-",
139
142
  wait_display,
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.21"
7
+ version = "0.5.22"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -103,6 +103,8 @@ ENV NCCL_ASYNC_ERROR_HANDLING=1
103
103
  ENV SUPPORTS_EFA=true
104
104
 
105
105
  # Install Python packages (Jupyter and common ML packages)
106
+ # gpu-dev itself is bundled so users can run `gpu-dev submit` from inside their pod
107
+ # (combined with IRSA on the pod's service account, no manual aws sso login needed).
106
108
  RUN pip install --no-cache-dir --break-system-packages \
107
109
  jupyterlab \
108
110
  ipywidgets \
@@ -112,7 +114,8 @@ RUN pip install --no-cache-dir --break-system-packages \
112
114
  numpy \
113
115
  scikit-learn \
114
116
  plotly \
115
- tensorboard
117
+ tensorboard \
118
+ gpu-dev
116
119
 
117
120
  # Create dev user with UID 1081 to avoid conflicts with common base image users (e.g., ubuntu=1000)
118
121
  RUN useradd -u 1081 -m -s /usr/bin/zsh dev && \
@@ -0,0 +1,88 @@
1
+ # IRSA wiring for user-facing gpu-dev pods.
2
+ #
3
+ # Goal: when a user SSHs into their CPU dev pod (or any gpu-dev pod) and runs
4
+ # `gpu-dev submit ...`, boto3 picks up temporary AWS credentials via the
5
+ # IAM-roles-for-service-accounts mechanism — no manual `aws sso login` needed.
6
+ #
7
+ # Identity preservation: Lambda sets AWS_ROLE_SESSION_NAME=<user identity>
8
+ # on the pod env, so STS GetCallerIdentity returns
9
+ # arn:aws:sts::<acct>:assumed-role/<role>/<user>
10
+ # and the existing `authenticate_user` ARN-tail parsing keeps working unchanged.
11
+
12
+ # Policy mirrors cli-tools/gpu-dev-cli/minimal-iam-policy.json — same scope a
13
+ # user gets when they `aws sso login` from their laptop.
14
+ resource "aws_iam_role" "gpu_dev_pod_role" {
15
+ name = "gpu-dev-pod-role-${local.current_config.environment}"
16
+
17
+ assume_role_policy = jsonencode({
18
+ Version = "2012-10-17"
19
+ Statement = [
20
+ {
21
+ Effect = "Allow"
22
+ Principal = {
23
+ Federated = aws_iam_openid_connect_provider.eks.arn
24
+ }
25
+ Action = "sts:AssumeRoleWithWebIdentity"
26
+ Condition = {
27
+ StringEquals = {
28
+ "${replace(aws_iam_openid_connect_provider.eks.url, "https://", "")}:sub" = "system:serviceaccount:gpu-dev:gpu-dev-pod-sa"
29
+ "${replace(aws_iam_openid_connect_provider.eks.url, "https://", "")}:aud" = "sts.amazonaws.com"
30
+ }
31
+ }
32
+ }
33
+ ]
34
+ })
35
+
36
+ tags = {
37
+ Name = "GPU Dev Pod IRSA Role"
38
+ Environment = local.current_config.environment
39
+ }
40
+ }
41
+
42
+ resource "aws_iam_role_policy" "gpu_dev_pod_policy" {
43
+ name = "gpu-dev-pod-policy"
44
+ role = aws_iam_role.gpu_dev_pod_role.id
45
+
46
+ policy = jsonencode({
47
+ Version = "2012-10-17"
48
+ Statement = [
49
+ {
50
+ Effect = "Allow"
51
+ Action = [
52
+ "sqs:SendMessage",
53
+ "sqs:GetQueueUrl",
54
+ "sqs:GetQueueAttributes"
55
+ ]
56
+ Resource = "arn:aws:sqs:*:*:pytorch-gpu-dev-reservation-queue"
57
+ },
58
+ {
59
+ Effect = "Allow"
60
+ Action = [
61
+ "dynamodb:GetItem",
62
+ "dynamodb:Query",
63
+ "dynamodb:Scan"
64
+ ]
65
+ Resource = [
66
+ "arn:aws:dynamodb:*:*:table/pytorch-gpu-dev-reservations",
67
+ "arn:aws:dynamodb:*:*:table/pytorch-gpu-dev-reservations/index/*",
68
+ "arn:aws:dynamodb:*:*:table/pytorch-gpu-dev-gpu-availability"
69
+ ]
70
+ },
71
+ {
72
+ Effect = "Allow"
73
+ Action = "sts:GetCallerIdentity"
74
+ Resource = "*"
75
+ }
76
+ ]
77
+ })
78
+ }
79
+
80
+ resource "kubernetes_service_account" "gpu_dev_pod" {
81
+ metadata {
82
+ name = "gpu-dev-pod-sa"
83
+ namespace = kubernetes_namespace.gpu_dev.metadata[0].name
84
+ annotations = {
85
+ "eks.amazonaws.com/role-arn" = aws_iam_role.gpu_dev_pod_role.arn
86
+ }
87
+ }
88
+ }
@@ -4577,6 +4577,16 @@ EOF_ZSHRC_EXT
4577
4577
  chown 1081:1081 /home/dev/.bashrc_ext /home/dev/.zshrc_ext
4578
4578
  echo "[STARTUP] ✓ Shell extension files written"
4579
4579
 
4580
+ # Background-refresh gpu-dev so older images / persistent disks pick up the
4581
+ # latest CLI without forcing the user to pip install it themselves. The
4582
+ # baseline gpu-dev is bundled in current images (this upgrades it); older images get a fresh install.
4583
+ (
4584
+ pip install --no-cache-dir --break-system-packages --upgrade gpu-dev \
4585
+ > /tmp/gpu-dev-upgrade.log 2>&1 \
4586
+ && echo "[STARTUP] gpu-dev upgraded to $(gpu-dev --version 2>&1 | tail -1)" \
4587
+ || echo "[STARTUP] gpu-dev upgrade failed (non-fatal); see /tmp/gpu-dev-upgrade.log"
4588
+ ) &
4589
+
4580
4590
  # Ensure existing rc files source the extensions (for persistent disks with old configs)
4581
4591
  for rcfile in /home/dev/.bashrc /home/dev/.zshrc; do
4582
4592
  if [ -f "$rcfile" ]; then
@@ -5301,6 +5311,9 @@ EOF
5301
5311
  ),
5302
5312
  client.V1EnvVar(
5303
5313
  name="NVIDIA_DRIVER_CAPABILITIES", value="compute,utility"
5314
+ ),
5315
+ client.V1EnvVar(
5316
+ name="AWS_ROLE_SESSION_NAME", value=(user_id or "gpu-dev-pod")[:64]
5304
5317
  )
5305
5318
  ] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
5306
5319
  resources=client.V1ResourceRequirements(
@@ -5483,6 +5496,11 @@ EOF
5483
5496
  ] if not gpu_type.startswith("cpu-") else [],
5484
5497
  # Faster pod deletion (default is 30s)
5485
5498
  termination_grace_period_seconds=10,
5499
+ # IRSA: bind the pod to the gpu-dev-pod-sa service account so boto3 inside
5500
+ # the pod gets temporary creds via STS AssumeRoleWithWebIdentity. Combined
5501
+ # with the AWS_ROLE_SESSION_NAME env var set on the container above, this lets users run
5502
+ # `gpu-dev submit` from inside their dev pod with no manual aws sso login.
5503
+ service_account_name="gpu-dev-pod-sa",
5486
5504
  # EFA requires host network namespace for RDMA access to efa0 interface
5487
5505
  **({
5488
5506
  "host_network": True,
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.22"
183
+ LAMBDA_VERSION = "0.5.23"
184
184
  MIN_CLI_VERSION = "0.5.16"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes