gpu-dev 0.5.21__tar.gz → 0.5.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/PKG-INFO +1 -1
- gpu_dev-0.5.23/README.md +143 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +2 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +11 -2
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +3 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/pyproject.toml +1 -1
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/Dockerfile +4 -1
- gpu_dev-0.5.23/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +88 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/reservation_processor/index.py +46 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/.gitignore +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/CLAUDE.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/PROGRESS.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/TODO.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/admin/README.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/post.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/setup.cfg +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.23}/tests/submit/success/run.sh +0 -0
gpu_dev-0.5.23/README.md
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# osdc — Open Source Developer Cloud
|
|
2
|
+
|
|
3
|
+
A self-hosted developer platform for GPU work. Devs ask for `1 / 2 / 4 / 8`
|
|
4
|
+
GPUs of a given type, the platform parks them on a Kubernetes pod with SSH
|
|
5
|
+
access, and tears it down when the reservation expires.
|
|
6
|
+
|
|
7
|
+
Built for PyTorch contributors — auth is via the GitHub public keys of users
|
|
8
|
+
with commit access — but the design is generic enough to plug into other
|
|
9
|
+
groups.
|
|
10
|
+
|
|
11
|
+
## What you get
|
|
12
|
+
|
|
13
|
+
- **Python CLI** (`gpu-dev`) with `reserve`, `list`, `extend`, `cancel`, and
|
|
14
|
+
`config` commands. Real-time polling until your pod is ready.
|
|
15
|
+
- **GPU types**: T4, L4, A100, H100, B200. Pick the count (1, 2, 4, 8) and the
|
|
16
|
+
duration in hours (fractional is fine, e.g. `--hours 0.25`).
|
|
17
|
+
- **SSH** straight into the pod via NodePort, with **your own GitHub public
|
|
18
|
+
keys** injected — no separate credentials to manage.
|
|
19
|
+
- **Persistent disk** that survives between reservations (opt-in), backed by
|
|
20
|
+
EBS snapshots. Or run with `--no-persist` for a clean `EmptyDir` workspace.
|
|
21
|
+
- **20 TB shared EFS** mounted at `/shared` with per-user folders.
|
|
22
|
+
- **NVIDIA profiling** ready out of the box (`ncu` / `nsys` work without
|
|
23
|
+
manual driver tweaks), with one node per GPU type reserved as
|
|
24
|
+
profiling-dedicated.
|
|
25
|
+
- **Grafana** dashboard at `<node-ip>:30080` with NVIDIA DCGM exporter
|
|
26
|
+
metrics — utilization, memory, temp, power.
|
|
27
|
+
- **Multi-node NCCL** working over EFA with `OFI_NCCL_PROTOCOL=SENDRECV`.
|
|
28
|
+
Tree algo gets ~21 GB/s bus bandwidth across 2× p5.48xlarge (16 H100).
|
|
29
|
+
|
|
30
|
+
## How it fits together
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
┌────────┐ reserve ┌────────┐ enqueue ┌────────────┐
|
|
34
|
+
│ CLI │ ───────────► │ API │ ────────► │ SQS │
|
|
35
|
+
└────────┘ └────────┘ └─────┬──────┘
|
|
36
|
+
▲ poll │
|
|
37
|
+
│ ▼
|
|
38
|
+
│ ┌──────────────────────────────────────┐
|
|
39
|
+
│ │ Lambda reservation processor │
|
|
40
|
+
│ │ - pick a node with free GPUs │
|
|
41
|
+
│ │ - attach EBS, mount /shared (EFS) │
|
|
42
|
+
│ │ - create K8s pod, inject GH keys │
|
|
43
|
+
│ └────────────────┬─────────────────────┘
|
|
44
|
+
│ │
|
|
45
|
+
│ ▼
|
|
46
|
+
│ ┌──────────────────┐
|
|
47
|
+
│ │ EKS (k8s) │
|
|
48
|
+
│ SSH (NodePort) │ GPU node groups │
|
|
49
|
+
└─────────────────────┤ T4 / L4 / H100 │
|
|
50
|
+
│ B200 / ... │
|
|
51
|
+
└──────────────────┘
|
|
52
|
+
|
|
53
|
+
DynamoDB holds reservation state & history; CloudWatch logs the lambdas.
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Repository layout
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
.
|
|
60
|
+
├── cli-tools/ # `gpu-dev` Python CLI (pyproject.toml)
|
|
61
|
+
├── terraform-gpu-devservers/
|
|
62
|
+
│ # OpenTofu modules for EKS, node groups,
|
|
63
|
+
│ # SQS, Lambda, DynamoDB, EFS, monitoring
|
|
64
|
+
├── admin/ # operator scripts
|
|
65
|
+
├── docs/ # user guide and architecture notes
|
|
66
|
+
└── tests/
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Getting started — as a user
|
|
70
|
+
|
|
71
|
+
You need: GitHub access to the configured org (PyTorch by default), and your
|
|
72
|
+
public keys uploaded to GitHub.
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# 1. Install the CLI
|
|
76
|
+
pip install -e ./cli-tools/gpu-dev-cli
|
|
77
|
+
|
|
78
|
+
# 2. Point it at your deployment
|
|
79
|
+
gpu-dev config # walks you through API URL + GitHub username
|
|
80
|
+
|
|
81
|
+
# 3. Reserve a GPU
|
|
82
|
+
gpu-dev reserve -g 1 -t h100 -h 2 # 1× H100 for 2 hours
|
|
83
|
+
gpu-dev reserve -g 8 -t b200 -h 24 # 8× B200 for a day
|
|
84
|
+
gpu-dev reserve -g 1 -t t4 -h 0.25 # 1× T4 for 15 minutes
|
|
85
|
+
|
|
86
|
+
# 4. Watch it come up; SSH instructions print when ready
|
|
87
|
+
gpu-dev list
|
|
88
|
+
|
|
89
|
+
# 5. Extend if you need more time (max total 48 h)
|
|
90
|
+
gpu-dev extend <reservation-id> --hours 12
|
|
91
|
+
|
|
92
|
+
# 6. Done? Free it up.
|
|
93
|
+
gpu-dev cancel <reservation-id>
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Each reservation drops an SSH config file at
|
|
97
|
+
`~/.devgpu/<reservation_id>-sshconfig`, so connecting is just:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
ssh -F ~/.devgpu/<reservation_id>-sshconfig gpu-dev
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Getting started — as an operator
|
|
104
|
+
|
|
105
|
+
You need: an AWS account with EC2 GPU capacity (reserved or on-demand), an
|
|
106
|
+
OpenTofu workstation, and credentials for whatever IAM role the modules
|
|
107
|
+
assume.
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
cd terraform-gpu-devservers
|
|
111
|
+
tf init # `tf` is aliased to `opentofu` in this repo
|
|
112
|
+
tf plan # read-only — agents are restricted to this
|
|
113
|
+
tf apply # only on a real workstation, not via the agent
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Important variables to set in your `*.tfvars`:
|
|
117
|
+
|
|
118
|
+
- `aws_region` (defaults to `us-east-2`)
|
|
119
|
+
- node group sizing per GPU type (T4 / L4 / H100 / B200)
|
|
120
|
+
- `grafana_admin_password`
|
|
121
|
+
- the GitHub org/team that's allowed to reserve
|
|
122
|
+
|
|
123
|
+
Once nodes are up, label one per GPU type as profiling-dedicated so DCGM
|
|
124
|
+
doesn't fight Nsight for the device:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
kubectl label node <h100-node> gpu.monitoring/profiling-dedicated=true
|
|
128
|
+
kubectl label node <b200-node> gpu.monitoring/profiling-dedicated=true
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Grafana lands at `http://<node-ip>:30080` (admin / your configured password).
|
|
132
|
+
Pre-loaded dashboards: NVIDIA DCGM (community ID 12239) and a custom GPU
|
|
133
|
+
overview.
|
|
134
|
+
|
|
135
|
+
## Status
|
|
136
|
+
|
|
137
|
+
Working end-to-end on T4 / L4 / H100. B200 supported with on-demand capacity.
|
|
138
|
+
Active development — see [`PROGRESS.md`](PROGRESS.md) and [`TODO.md`](TODO.md)
|
|
139
|
+
for what's in flight and what's queued.
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
See [`LICENSE`](LICENSE) once added. For now: ask before reusing.
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
CLAUDE.md
|
|
3
3
|
PROGRESS.md
|
|
4
4
|
PR_DESCRIPTION.md
|
|
5
|
+
README.md
|
|
5
6
|
TODO.md
|
|
6
7
|
post.md
|
|
7
8
|
pyproject.toml
|
|
@@ -44,6 +45,7 @@ terraform-gpu-devservers/efs.tf
|
|
|
44
45
|
terraform-gpu-devservers/eks.tf
|
|
45
46
|
terraform-gpu-devservers/expiry.tf
|
|
46
47
|
terraform-gpu-devservers/git-cache.tf
|
|
48
|
+
terraform-gpu-devservers/gpu-dev-pod-irsa.tf
|
|
47
49
|
terraform-gpu-devservers/kubernetes.tf
|
|
48
50
|
terraform-gpu-devservers/lambda.tf
|
|
49
51
|
terraform-gpu-devservers/main.tf
|
|
@@ -240,8 +240,17 @@ class Config:
|
|
|
240
240
|
return self.user_config.get(key)
|
|
241
241
|
|
|
242
242
|
def get_github_username(self) -> Optional[str]:
|
|
243
|
-
"""Get GitHub username
|
|
244
|
-
|
|
243
|
+
"""Get GitHub username, falling back to GPU_DEV_GITHUB_USER env var.
|
|
244
|
+
|
|
245
|
+
Lambda sets GPU_DEV_GITHUB_USER on every pod from the reservation's
|
|
246
|
+
github_user field, so a user running gpu-dev from inside their dev pod
|
|
247
|
+
doesn't have to `gpu-dev config set github_user <name>` first.
|
|
248
|
+
"""
|
|
249
|
+
v = self.user_config.get("github_user")
|
|
250
|
+
if v:
|
|
251
|
+
return v
|
|
252
|
+
v = os.environ.get("GPU_DEV_GITHUB_USER")
|
|
253
|
+
return v or None
|
|
245
254
|
|
|
246
255
|
|
|
247
256
|
def load_config() -> Config:
|
|
@@ -89,6 +89,7 @@ def select_gpu_type_interactive(
|
|
|
89
89
|
table = Table()
|
|
90
90
|
table.add_column("GPU Type", style="cyan")
|
|
91
91
|
table.add_column("Avail", style="green")
|
|
92
|
+
table.add_column("Max\nReservable", style="bright_green")
|
|
92
93
|
table.add_column("Total", style="blue")
|
|
93
94
|
table.add_column("Queue\nLength", style="yellow")
|
|
94
95
|
table.add_column("Est. Wait Time", style="magenta")
|
|
@@ -96,6 +97,7 @@ def select_gpu_type_interactive(
|
|
|
96
97
|
choices = []
|
|
97
98
|
for gpu_type, info in visible_info.items():
|
|
98
99
|
available = info.get("available", 0)
|
|
100
|
+
max_reservable = info.get("max_reservable", 0)
|
|
99
101
|
total = info.get("total", 0)
|
|
100
102
|
queue_length = info.get("queue_length", 0)
|
|
101
103
|
est_wait = info.get("estimated_wait_minutes", 0)
|
|
@@ -134,6 +136,7 @@ def select_gpu_type_interactive(
|
|
|
134
136
|
table.add_row(
|
|
135
137
|
gpu_type.upper(),
|
|
136
138
|
available_display,
|
|
139
|
+
"-" if is_maintenance else str(max_reservable),
|
|
137
140
|
str(total),
|
|
138
141
|
str(queue_length) if not is_maintenance else "-",
|
|
139
142
|
wait_display,
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.21"
|
|
7
|
+
version = "0.5.23"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -103,6 +103,8 @@ ENV NCCL_ASYNC_ERROR_HANDLING=1
|
|
|
103
103
|
ENV SUPPORTS_EFA=true
|
|
104
104
|
|
|
105
105
|
# Install Python packages (Jupyter and common ML packages)
|
|
106
|
+
# gpu-dev itself is bundled so users can run `gpu-dev submit` from inside their pod
|
|
107
|
+
# (combined with IRSA on the pod's service account, no manual aws sso login needed).
|
|
106
108
|
RUN pip install --no-cache-dir --break-system-packages \
|
|
107
109
|
jupyterlab \
|
|
108
110
|
ipywidgets \
|
|
@@ -112,7 +114,8 @@ RUN pip install --no-cache-dir --break-system-packages \
|
|
|
112
114
|
numpy \
|
|
113
115
|
scikit-learn \
|
|
114
116
|
plotly \
|
|
115
|
-
tensorboard
|
|
117
|
+
tensorboard \
|
|
118
|
+
gpu-dev
|
|
116
119
|
|
|
117
120
|
# Create dev user with UID 1081 to avoid conflicts with common base image users (e.g., ubuntu=1000)
|
|
118
121
|
RUN useradd -u 1081 -m -s /usr/bin/zsh dev && \
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# IRSA wiring for user-facing gpu-dev pods.
|
|
2
|
+
#
|
|
3
|
+
# Goal: when a user SSHs into their CPU dev pod (or any gpu-dev pod) and runs
|
|
4
|
+
# `gpu-dev submit ...`, boto3 picks up temporary AWS credentials via the
|
|
5
|
+
# IAM-roles-for-service-accounts mechanism — no manual `aws sso login` needed.
|
|
6
|
+
#
|
|
7
|
+
# Identity preservation: Lambda sets AWS_ROLE_SESSION_NAME=<user identity>
|
|
8
|
+
# on the pod env, so STS GetCallerIdentity returns
|
|
9
|
+
# arn:aws:sts::<acct>:assumed-role/<role>/<user>
|
|
10
|
+
# and the existing `authenticate_user` ARN-tail parsing keeps working unchanged.
|
|
11
|
+
|
|
12
|
+
# Policy mirrors cli-tools/gpu-dev-cli/minimal-iam-policy.json — same scope a
|
|
13
|
+
# user gets when they `aws sso login` from their laptop.
|
|
14
|
+
resource "aws_iam_role" "gpu_dev_pod_role" {
|
|
15
|
+
name = "gpu-dev-pod-role-${local.current_config.environment}"
|
|
16
|
+
|
|
17
|
+
assume_role_policy = jsonencode({
|
|
18
|
+
Version = "2012-10-17"
|
|
19
|
+
Statement = [
|
|
20
|
+
{
|
|
21
|
+
Effect = "Allow"
|
|
22
|
+
Principal = {
|
|
23
|
+
Federated = aws_iam_openid_connect_provider.eks.arn
|
|
24
|
+
}
|
|
25
|
+
Action = "sts:AssumeRoleWithWebIdentity"
|
|
26
|
+
Condition = {
|
|
27
|
+
StringEquals = {
|
|
28
|
+
"${replace(aws_iam_openid_connect_provider.eks.url, "https://", "")}:sub" = "system:serviceaccount:gpu-dev:gpu-dev-pod-sa"
|
|
29
|
+
"${replace(aws_iam_openid_connect_provider.eks.url, "https://", "")}:aud" = "sts.amazonaws.com"
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
]
|
|
34
|
+
})
|
|
35
|
+
|
|
36
|
+
tags = {
|
|
37
|
+
Name = "GPU Dev Pod IRSA Role"
|
|
38
|
+
Environment = local.current_config.environment
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
resource "aws_iam_role_policy" "gpu_dev_pod_policy" {
|
|
43
|
+
name = "gpu-dev-pod-policy"
|
|
44
|
+
role = aws_iam_role.gpu_dev_pod_role.id
|
|
45
|
+
|
|
46
|
+
policy = jsonencode({
|
|
47
|
+
Version = "2012-10-17"
|
|
48
|
+
Statement = [
|
|
49
|
+
{
|
|
50
|
+
Effect = "Allow"
|
|
51
|
+
Action = [
|
|
52
|
+
"sqs:SendMessage",
|
|
53
|
+
"sqs:GetQueueUrl",
|
|
54
|
+
"sqs:GetQueueAttributes"
|
|
55
|
+
]
|
|
56
|
+
Resource = "arn:aws:sqs:*:*:pytorch-gpu-dev-reservation-queue"
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
Effect = "Allow"
|
|
60
|
+
Action = [
|
|
61
|
+
"dynamodb:GetItem",
|
|
62
|
+
"dynamodb:Query",
|
|
63
|
+
"dynamodb:Scan"
|
|
64
|
+
]
|
|
65
|
+
Resource = [
|
|
66
|
+
"arn:aws:dynamodb:*:*:table/pytorch-gpu-dev-reservations",
|
|
67
|
+
"arn:aws:dynamodb:*:*:table/pytorch-gpu-dev-reservations/index/*",
|
|
68
|
+
"arn:aws:dynamodb:*:*:table/pytorch-gpu-dev-gpu-availability"
|
|
69
|
+
]
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
Effect = "Allow"
|
|
73
|
+
Action = "sts:GetCallerIdentity"
|
|
74
|
+
Resource = "*"
|
|
75
|
+
}
|
|
76
|
+
]
|
|
77
|
+
})
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
resource "kubernetes_service_account" "gpu_dev_pod" {
|
|
81
|
+
metadata {
|
|
82
|
+
name = "gpu-dev-pod-sa"
|
|
83
|
+
namespace = kubernetes_namespace.gpu_dev.metadata[0].name
|
|
84
|
+
annotations = {
|
|
85
|
+
"eks.amazonaws.com/role-arn" = aws_iam_role.gpu_dev_pod_role.arn
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -4486,6 +4486,18 @@ export MULTINODE_SIZE="$MULTINODE_SIZE"
|
|
|
4486
4486
|
export MASTER_ADDR="$MASTER_ADDR"
|
|
4487
4487
|
export MASTER_PORT="$MASTER_PORT"
|
|
4488
4488
|
|
|
4489
|
+
# IRSA + region — same reason as MULTINODE: sshd strips these from login shells, so
|
|
4490
|
+
# we bake the current container values into the rc file. Lets gpu-dev / aws / boto3
|
|
4491
|
+
# inside an SSH session pick up the gpu-dev-pod-sa IAM role automatically.
|
|
4492
|
+
export AWS_ROLE_ARN="$AWS_ROLE_ARN"
|
|
4493
|
+
export AWS_WEB_IDENTITY_TOKEN_FILE="$AWS_WEB_IDENTITY_TOKEN_FILE"
|
|
4494
|
+
export AWS_ROLE_SESSION_NAME="$AWS_ROLE_SESSION_NAME"
|
|
4495
|
+
export AWS_REGION="$AWS_REGION"
|
|
4496
|
+
export AWS_DEFAULT_REGION="$AWS_DEFAULT_REGION"
|
|
4497
|
+
export AWS_STS_REGIONAL_ENDPOINTS="$AWS_STS_REGIONAL_ENDPOINTS"
|
|
4498
|
+
# CLI falls back to this when ~/.config/gpu-dev/config.json has no github_user
|
|
4499
|
+
export GPU_DEV_GITHUB_USER="$GPU_DEV_GITHUB_USER"
|
|
4500
|
+
|
|
4489
4501
|
# Function to check for GPU reservation expiry warnings and startup script status
|
|
4490
4502
|
check_warnings() {{
|
|
4491
4503
|
# Check for startup script still running
|
|
@@ -4539,6 +4551,15 @@ export MULTINODE_SIZE="$MULTINODE_SIZE"
|
|
|
4539
4551
|
export MASTER_ADDR="$MASTER_ADDR"
|
|
4540
4552
|
export MASTER_PORT="$MASTER_PORT"
|
|
4541
4553
|
|
|
4554
|
+
# IRSA + region (see .bashrc_ext for rationale)
|
|
4555
|
+
export AWS_ROLE_ARN="$AWS_ROLE_ARN"
|
|
4556
|
+
export AWS_WEB_IDENTITY_TOKEN_FILE="$AWS_WEB_IDENTITY_TOKEN_FILE"
|
|
4557
|
+
export AWS_ROLE_SESSION_NAME="$AWS_ROLE_SESSION_NAME"
|
|
4558
|
+
export AWS_REGION="$AWS_REGION"
|
|
4559
|
+
export AWS_DEFAULT_REGION="$AWS_DEFAULT_REGION"
|
|
4560
|
+
export AWS_STS_REGIONAL_ENDPOINTS="$AWS_STS_REGIONAL_ENDPOINTS"
|
|
4561
|
+
export GPU_DEV_GITHUB_USER="$GPU_DEV_GITHUB_USER"
|
|
4562
|
+
|
|
4542
4563
|
# Function to check for GPU reservation expiry warnings and startup script status
|
|
4543
4564
|
check_warnings() {{
|
|
4544
4565
|
# Check for startup script still running
|
|
@@ -4577,6 +4598,16 @@ EOF_ZSHRC_EXT
|
|
|
4577
4598
|
chown 1081:1081 /home/dev/.bashrc_ext /home/dev/.zshrc_ext
|
|
4578
4599
|
echo "[STARTUP] ✓ Shell extension files written"
|
|
4579
4600
|
|
|
4601
|
+
# Background-refresh gpu-dev so older images / persistent disks pick up the
|
|
4602
|
+
# latest CLI without forcing the user to pip install it themselves. The
|
|
4603
|
+
# baseline gpu-dev is already in the image; this just upgrades.
|
|
4604
|
+
(
|
|
4605
|
+
pip install --no-cache-dir --break-system-packages --upgrade gpu-dev \
|
|
4606
|
+
> /tmp/gpu-dev-upgrade.log 2>&1 \
|
|
4607
|
+
&& echo "[STARTUP] gpu-dev upgraded to $(gpu-dev --version 2>&1 | tail -1)" \
|
|
4608
|
+
|| echo "[STARTUP] gpu-dev upgrade failed (non-fatal); see /tmp/gpu-dev-upgrade.log"
|
|
4609
|
+
) &
|
|
4610
|
+
|
|
4580
4611
|
# Ensure existing rc files source the extensions (for persistent disks with old configs)
|
|
4581
4612
|
for rcfile in /home/dev/.bashrc /home/dev/.zshrc; do
|
|
4582
4613
|
if [ -f "$rcfile" ]; then
|
|
@@ -5301,6 +5332,12 @@ EOF
|
|
|
5301
5332
|
),
|
|
5302
5333
|
client.V1EnvVar(
|
|
5303
5334
|
name="NVIDIA_DRIVER_CAPABILITIES", value="compute,utility"
|
|
5335
|
+
),
|
|
5336
|
+
client.V1EnvVar(
|
|
5337
|
+
name="AWS_ROLE_SESSION_NAME", value=(user_id or "gpu-dev-pod")[:64]
|
|
5338
|
+
),
|
|
5339
|
+
client.V1EnvVar(
|
|
5340
|
+
name="GPU_DEV_GITHUB_USER", value=github_user or ""
|
|
5304
5341
|
)
|
|
5305
5342
|
] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
|
|
5306
5343
|
resources=client.V1ResourceRequirements(
|
|
@@ -5483,6 +5520,15 @@ EOF
|
|
|
5483
5520
|
] if not gpu_type.startswith("cpu-") else [],
|
|
5484
5521
|
# Faster pod deletion (default is 30s)
|
|
5485
5522
|
termination_grace_period_seconds=10,
|
|
5523
|
+
# IRSA: bind the pod to the gpu-dev-pod-sa service account so boto3 inside
|
|
5524
|
+
# the pod gets temporary creds via STS AssumeRoleWithWebIdentity. Combined
|
|
5525
|
+
# with the AWS_ROLE_SESSION_NAME env var set on the container above, this lets users run
|
|
5526
|
+
# `gpu-dev submit` from inside their dev pod with no manual aws sso login.
|
|
5527
|
+
service_account_name="gpu-dev-pod-sa",
|
|
5528
|
+
# fs_group=1081 makes the IRSA-projected token (default 0600 root:root)
|
|
5529
|
+
# readable by the dev user. Without it boto3-as-dev falls through to IMDS
|
|
5530
|
+
# and gets the node's IAM role, which doesn't have DDB/SQS permissions.
|
|
5531
|
+
security_context=client.V1PodSecurityContext(fs_group=1081),
|
|
5486
5532
|
# EFA requires host network namespace for RDMA access to efa0 interface
|
|
5487
5533
|
**({
|
|
5488
5534
|
"host_network": True,
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.24"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.16"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.23}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|