gpu-dev 0.5.21__tar.gz → 0.5.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/PKG-INFO +1 -1
- gpu_dev-0.5.22/README.md +143 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +2 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +3 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/pyproject.toml +1 -1
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/Dockerfile +4 -1
- gpu_dev-0.5.22/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +88 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_processor/index.py +18 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/.gitignore +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/CLAUDE.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/PROGRESS.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/TODO.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/admin/README.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/post.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/setup.cfg +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.21 → gpu_dev-0.5.22}/tests/submit/success/run.sh +0 -0
gpu_dev-0.5.22/README.md
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# osdc — Open Source Developer Cloud
|
|
2
|
+
|
|
3
|
+
A self-hosted developer platform for GPU work. Devs ask for `1 / 2 / 4 / 8`
|
|
4
|
+
GPUs of a given type, the platform parks them on a Kubernetes pod with SSH
|
|
5
|
+
access, and tears it down when the reservation expires.
|
|
6
|
+
|
|
7
|
+
Built for PyTorch contributors — auth is via the GitHub public keys of users
|
|
8
|
+
with commit access — but the design is generic enough to plug into other
|
|
9
|
+
groups.
|
|
10
|
+
|
|
11
|
+
## What you get
|
|
12
|
+
|
|
13
|
+
- **Python CLI** (`gpu-dev`) with `reserve`, `list`, `extend`, `cancel`, and
|
|
14
|
+
`config` commands. Real-time polling until your pod is ready.
|
|
15
|
+
- **GPU types**: T4, L4, A100, H100, B200. Pick the count (1, 2, 4, 8) and the
|
|
16
|
+
duration in hours (fractional is fine, e.g. `--hours 0.25`).
|
|
17
|
+
- **SSH** straight into the pod via NodePort, with **your own GitHub public
|
|
18
|
+
keys** injected — no separate credentials to manage.
|
|
19
|
+
- **Persistent disk** that survives between reservations (opt-in), backed by
|
|
20
|
+
EBS snapshots. Or run with `--no-persist` for a clean `EmptyDir` workspace.
|
|
21
|
+
- **20 TB shared EFS** mounted at `/shared` with per-user folders.
|
|
22
|
+
- **NVIDIA profiling** ready out of the box (`ncu` / `nsys` work without
|
|
23
|
+
manual driver tweaks), with one node per GPU type reserved as
|
|
24
|
+
profiling-dedicated.
|
|
25
|
+
- **Grafana** dashboard at `<node-ip>:30080` with NVIDIA DCGM exporter
|
|
26
|
+
metrics — utilization, memory, temp, power.
|
|
27
|
+
- **Multi-node NCCL** working over EFA with `OFI_NCCL_PROTOCOL=SENDRECV`.
|
|
28
|
+
Tree algo gets ~21 GB/s bus bandwidth across 2× p5.48xlarge (16 H100).
|
|
29
|
+
|
|
30
|
+
## How it fits together
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
┌────────┐ reserve ┌────────┐ enqueue ┌────────────┐
|
|
34
|
+
│ CLI │ ───────────► │ API │ ────────► │ SQS │
|
|
35
|
+
└────────┘ └────────┘ └─────┬──────┘
|
|
36
|
+
▲ poll │
|
|
37
|
+
│ ▼
|
|
38
|
+
│ ┌──────────────────────────────────────┐
|
|
39
|
+
│ │ Lambda reservation processor │
|
|
40
|
+
│ │ - pick a node with free GPUs │
|
|
41
|
+
│ │ - attach EBS, mount /shared (EFS) │
|
|
42
|
+
│ │ - create K8s pod, inject GH keys │
|
|
43
|
+
│ └────────────────┬─────────────────────┘
|
|
44
|
+
│ │
|
|
45
|
+
│ ▼
|
|
46
|
+
│ ┌──────────────────┐
|
|
47
|
+
│ │ EKS (k8s) │
|
|
48
|
+
│ SSH (NodePort) │ GPU node groups │
|
|
49
|
+
└─────────────────────┤ T4 / L4 / H100 │
|
|
50
|
+
│ B200 / ... │
|
|
51
|
+
└──────────────────┘
|
|
52
|
+
|
|
53
|
+
DynamoDB holds reservation state & history; CloudWatch logs the lambdas.
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Repository layout
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
.
|
|
60
|
+
├── cli-tools/ # `gpu-dev` Python CLI (pyproject.toml)
|
|
61
|
+
├── terraform-gpu-devservers/
|
|
62
|
+
│ # OpenTofu modules for EKS, node groups,
|
|
63
|
+
│ # SQS, Lambda, DynamoDB, EFS, monitoring
|
|
64
|
+
├── admin/ # operator scripts
|
|
65
|
+
├── docs/ # user guide and architecture notes
|
|
66
|
+
└── tests/
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Getting started — as a user
|
|
70
|
+
|
|
71
|
+
You need: GitHub access to the configured org (PyTorch by default), and your
|
|
72
|
+
public keys uploaded to GitHub.
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# 1. Install the CLI
|
|
76
|
+
pip install -e ./cli-tools/gpu-dev-cli
|
|
77
|
+
|
|
78
|
+
# 2. Point it at your deployment
|
|
79
|
+
gpu-dev config # walks you through API URL + GitHub username
|
|
80
|
+
|
|
81
|
+
# 3. Reserve a GPU
|
|
82
|
+
gpu-dev reserve -g 1 -t h100 -h 2 # 1× H100 for 2 hours
|
|
83
|
+
gpu-dev reserve -g 8 -t b200 -h 24 # 8× B200 for a day
|
|
84
|
+
gpu-dev reserve -g 1 -t t4 -h 0.25 # 1× T4 for 15 minutes
|
|
85
|
+
|
|
86
|
+
# 4. Watch it come up; SSH instructions print when ready
|
|
87
|
+
gpu-dev list
|
|
88
|
+
|
|
89
|
+
# 5. Extend if you need more time (max total 48 h)
|
|
90
|
+
gpu-dev extend <reservation-id> --hours 12
|
|
91
|
+
|
|
92
|
+
# 6. Done? Free it up.
|
|
93
|
+
gpu-dev cancel <reservation-id>
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Each reservation drops an SSH config file at
|
|
97
|
+
`~/.devgpu/<reservation_id>-sshconfig`, so connecting is just:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
ssh -F ~/.devgpu/<reservation_id>-sshconfig gpu-dev
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Getting started — as an operator
|
|
104
|
+
|
|
105
|
+
You need: an AWS account with EC2 GPU capacity (reserved or on-demand), an
|
|
106
|
+
OpenTofu workstation, and credentials for whatever IAM role the modules
|
|
107
|
+
assume.
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
cd terraform-gpu-devservers
|
|
111
|
+
tf init # `tf` is aliased to `opentofu` in this repo
|
|
112
|
+
tf plan # read-only — agents are restricted to this
|
|
113
|
+
tf apply # only on a real workstation, not via the agent
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Important variables to set in your `*.tfvars`:
|
|
117
|
+
|
|
118
|
+
- `aws_region` (defaults to `us-east-2`)
|
|
119
|
+
- node group sizing per GPU type (T4 / L4 / H100 / B200)
|
|
120
|
+
- `grafana_admin_password`
|
|
121
|
+
- the GitHub org/team that's allowed to reserve
|
|
122
|
+
|
|
123
|
+
Once nodes are up, label one per GPU type as profiling-dedicated so DCGM
|
|
124
|
+
doesn't fight Nsight for the device:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
kubectl label node <h100-node> gpu.monitoring/profiling-dedicated=true
|
|
128
|
+
kubectl label node <b200-node> gpu.monitoring/profiling-dedicated=true
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Grafana lands at `http://<node-ip>:30080` (admin / your configured password).
|
|
132
|
+
Pre-loaded dashboards: NVIDIA DCGM (community ID 12239) and a custom GPU
|
|
133
|
+
overview.
|
|
134
|
+
|
|
135
|
+
## Status
|
|
136
|
+
|
|
137
|
+
Working end-to-end on T4 / L4 / H100. B200 supported with on-demand capacity.
|
|
138
|
+
Active development — see [`PROGRESS.md`](PROGRESS.md) and [`TODO.md`](TODO.md)
|
|
139
|
+
for what's in flight and what's queued.
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
See [`LICENSE`](LICENSE) once added. For now: ask before reusing.
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
CLAUDE.md
|
|
3
3
|
PROGRESS.md
|
|
4
4
|
PR_DESCRIPTION.md
|
|
5
|
+
README.md
|
|
5
6
|
TODO.md
|
|
6
7
|
post.md
|
|
7
8
|
pyproject.toml
|
|
@@ -44,6 +45,7 @@ terraform-gpu-devservers/efs.tf
|
|
|
44
45
|
terraform-gpu-devservers/eks.tf
|
|
45
46
|
terraform-gpu-devservers/expiry.tf
|
|
46
47
|
terraform-gpu-devservers/git-cache.tf
|
|
48
|
+
terraform-gpu-devservers/gpu-dev-pod-irsa.tf
|
|
47
49
|
terraform-gpu-devservers/kubernetes.tf
|
|
48
50
|
terraform-gpu-devservers/lambda.tf
|
|
49
51
|
terraform-gpu-devservers/main.tf
|
|
@@ -89,6 +89,7 @@ def select_gpu_type_interactive(
|
|
|
89
89
|
table = Table()
|
|
90
90
|
table.add_column("GPU Type", style="cyan")
|
|
91
91
|
table.add_column("Avail", style="green")
|
|
92
|
+
table.add_column("Max\nReservable", style="bright_green")
|
|
92
93
|
table.add_column("Total", style="blue")
|
|
93
94
|
table.add_column("Queue\nLength", style="yellow")
|
|
94
95
|
table.add_column("Est. Wait Time", style="magenta")
|
|
@@ -96,6 +97,7 @@ def select_gpu_type_interactive(
|
|
|
96
97
|
choices = []
|
|
97
98
|
for gpu_type, info in visible_info.items():
|
|
98
99
|
available = info.get("available", 0)
|
|
100
|
+
max_reservable = info.get("max_reservable", 0)
|
|
99
101
|
total = info.get("total", 0)
|
|
100
102
|
queue_length = info.get("queue_length", 0)
|
|
101
103
|
est_wait = info.get("estimated_wait_minutes", 0)
|
|
@@ -134,6 +136,7 @@ def select_gpu_type_interactive(
|
|
|
134
136
|
table.add_row(
|
|
135
137
|
gpu_type.upper(),
|
|
136
138
|
available_display,
|
|
139
|
+
"-" if is_maintenance else str(max_reservable),
|
|
137
140
|
str(total),
|
|
138
141
|
str(queue_length) if not is_maintenance else "-",
|
|
139
142
|
wait_display,
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.21"
|
|
7
|
+
version = "0.5.22"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -103,6 +103,8 @@ ENV NCCL_ASYNC_ERROR_HANDLING=1
|
|
|
103
103
|
ENV SUPPORTS_EFA=true
|
|
104
104
|
|
|
105
105
|
# Install Python packages (Jupyter and common ML packages)
|
|
106
|
+
# gpu-dev itself is bundled so users can run `gpu-dev submit` from inside their pod
|
|
107
|
+
# (combined with IRSA on the pod's service account, no manual aws sso login needed).
|
|
106
108
|
RUN pip install --no-cache-dir --break-system-packages \
|
|
107
109
|
jupyterlab \
|
|
108
110
|
ipywidgets \
|
|
@@ -112,7 +114,8 @@ RUN pip install --no-cache-dir --break-system-packages \
|
|
|
112
114
|
numpy \
|
|
113
115
|
scikit-learn \
|
|
114
116
|
plotly \
|
|
115
|
-
tensorboard
|
|
117
|
+
tensorboard \
|
|
118
|
+
gpu-dev
|
|
116
119
|
|
|
117
120
|
# Create dev user with UID 1081 to avoid conflicts with common base image users (e.g., ubuntu=1000)
|
|
118
121
|
RUN useradd -u 1081 -m -s /usr/bin/zsh dev && \
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# IRSA wiring for user-facing gpu-dev pods.
|
|
2
|
+
#
|
|
3
|
+
# Goal: when a user SSHs into their CPU dev pod (or any gpu-dev pod) and runs
|
|
4
|
+
# `gpu-dev submit ...`, boto3 picks up temporary AWS credentials via the
|
|
5
|
+
# IAM-roles-for-service-accounts mechanism — no manual `aws sso login` needed.
|
|
6
|
+
#
|
|
7
|
+
# Identity preservation: Lambda sets AWS_ROLE_SESSION_NAME=<user identity>
|
|
8
|
+
# on the pod env, so STS GetCallerIdentity returns
|
|
9
|
+
# arn:aws:sts::<acct>:assumed-role/<role>/<user>
|
|
10
|
+
# and the existing `authenticate_user` ARN-tail parsing keeps working unchanged.
|
|
11
|
+
|
|
12
|
+
# Policy mirrors cli-tools/gpu-dev-cli/minimal-iam-policy.json — same scope a
|
|
13
|
+
# user gets when they `aws sso login` from their laptop.
|
|
14
|
+
resource "aws_iam_role" "gpu_dev_pod_role" {
|
|
15
|
+
name = "gpu-dev-pod-role-${local.current_config.environment}"
|
|
16
|
+
|
|
17
|
+
assume_role_policy = jsonencode({
|
|
18
|
+
Version = "2012-10-17"
|
|
19
|
+
Statement = [
|
|
20
|
+
{
|
|
21
|
+
Effect = "Allow"
|
|
22
|
+
Principal = {
|
|
23
|
+
Federated = aws_iam_openid_connect_provider.eks.arn
|
|
24
|
+
}
|
|
25
|
+
Action = "sts:AssumeRoleWithWebIdentity"
|
|
26
|
+
Condition = {
|
|
27
|
+
StringEquals = {
|
|
28
|
+
"${replace(aws_iam_openid_connect_provider.eks.url, "https://", "")}:sub" = "system:serviceaccount:gpu-dev:gpu-dev-pod-sa"
|
|
29
|
+
"${replace(aws_iam_openid_connect_provider.eks.url, "https://", "")}:aud" = "sts.amazonaws.com"
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
]
|
|
34
|
+
})
|
|
35
|
+
|
|
36
|
+
tags = {
|
|
37
|
+
Name = "GPU Dev Pod IRSA Role"
|
|
38
|
+
Environment = local.current_config.environment
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
resource "aws_iam_role_policy" "gpu_dev_pod_policy" {
|
|
43
|
+
name = "gpu-dev-pod-policy"
|
|
44
|
+
role = aws_iam_role.gpu_dev_pod_role.id
|
|
45
|
+
|
|
46
|
+
policy = jsonencode({
|
|
47
|
+
Version = "2012-10-17"
|
|
48
|
+
Statement = [
|
|
49
|
+
{
|
|
50
|
+
Effect = "Allow"
|
|
51
|
+
Action = [
|
|
52
|
+
"sqs:SendMessage",
|
|
53
|
+
"sqs:GetQueueUrl",
|
|
54
|
+
"sqs:GetQueueAttributes"
|
|
55
|
+
]
|
|
56
|
+
Resource = "arn:aws:sqs:*:*:pytorch-gpu-dev-reservation-queue"
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
Effect = "Allow"
|
|
60
|
+
Action = [
|
|
61
|
+
"dynamodb:GetItem",
|
|
62
|
+
"dynamodb:Query",
|
|
63
|
+
"dynamodb:Scan"
|
|
64
|
+
]
|
|
65
|
+
Resource = [
|
|
66
|
+
"arn:aws:dynamodb:*:*:table/pytorch-gpu-dev-reservations",
|
|
67
|
+
"arn:aws:dynamodb:*:*:table/pytorch-gpu-dev-reservations/index/*",
|
|
68
|
+
"arn:aws:dynamodb:*:*:table/pytorch-gpu-dev-gpu-availability"
|
|
69
|
+
]
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
Effect = "Allow"
|
|
73
|
+
Action = "sts:GetCallerIdentity"
|
|
74
|
+
Resource = "*"
|
|
75
|
+
}
|
|
76
|
+
]
|
|
77
|
+
})
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
resource "kubernetes_service_account" "gpu_dev_pod" {
|
|
81
|
+
metadata {
|
|
82
|
+
name = "gpu-dev-pod-sa"
|
|
83
|
+
namespace = kubernetes_namespace.gpu_dev.metadata[0].name
|
|
84
|
+
annotations = {
|
|
85
|
+
"eks.amazonaws.com/role-arn" = aws_iam_role.gpu_dev_pod_role.arn
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -4577,6 +4577,16 @@ EOF_ZSHRC_EXT
|
|
|
4577
4577
|
chown 1081:1081 /home/dev/.bashrc_ext /home/dev/.zshrc_ext
|
|
4578
4578
|
echo "[STARTUP] ✓ Shell extension files written"
|
|
4579
4579
|
|
|
4580
|
+
# Background-refresh gpu-dev so older images / persistent disks pick up the
|
|
4581
|
+
# latest CLI without forcing the user to pip install it themselves. The
|
|
4582
|
+
# baseline gpu-dev is already in the image; this just upgrades.
|
|
4583
|
+
(
|
|
4584
|
+
pip install --no-cache-dir --break-system-packages --upgrade gpu-dev \
|
|
4585
|
+
> /tmp/gpu-dev-upgrade.log 2>&1 \
|
|
4586
|
+
&& echo "[STARTUP] gpu-dev upgraded to $(gpu-dev --version 2>&1 | tail -1)" \
|
|
4587
|
+
|| echo "[STARTUP] gpu-dev upgrade failed (non-fatal); see /tmp/gpu-dev-upgrade.log"
|
|
4588
|
+
) &
|
|
4589
|
+
|
|
4580
4590
|
# Ensure existing rc files source the extensions (for persistent disks with old configs)
|
|
4581
4591
|
for rcfile in /home/dev/.bashrc /home/dev/.zshrc; do
|
|
4582
4592
|
if [ -f "$rcfile" ]; then
|
|
@@ -5301,6 +5311,9 @@ EOF
|
|
|
5301
5311
|
),
|
|
5302
5312
|
client.V1EnvVar(
|
|
5303
5313
|
name="NVIDIA_DRIVER_CAPABILITIES", value="compute,utility"
|
|
5314
|
+
),
|
|
5315
|
+
client.V1EnvVar(
|
|
5316
|
+
name="AWS_ROLE_SESSION_NAME", value=(user_id or "gpu-dev-pod")[:64]
|
|
5304
5317
|
)
|
|
5305
5318
|
] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
|
|
5306
5319
|
resources=client.V1ResourceRequirements(
|
|
@@ -5483,6 +5496,11 @@ EOF
|
|
|
5483
5496
|
] if not gpu_type.startswith("cpu-") else [],
|
|
5484
5497
|
# Faster pod deletion (default is 30s)
|
|
5485
5498
|
termination_grace_period_seconds=10,
|
|
5499
|
+
# IRSA: bind the pod to the gpu-dev-pod-sa service account so boto3 inside
|
|
5500
|
+
# the pod gets temporary creds via STS AssumeRoleWithWebIdentity. Combined
|
|
5501
|
+
# with the AWS_ROLE_SESSION_NAME env var set on the container above, this lets users run
|
|
5502
|
+
# `gpu-dev submit` from inside their dev pod with no manual aws sso login.
|
|
5503
|
+
service_account_name="gpu-dev-pod-sa",
|
|
5486
5504
|
# EFA requires host network namespace for RDMA access to efa0 interface
|
|
5487
5505
|
**({
|
|
5488
5506
|
"host_network": True,
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.23"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.16"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.21 → gpu_dev-0.5.22}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|