gpu-dev 0.5.31__tar.gz → 0.5.32__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/CLAUDE.md +52 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/PKG-INFO +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -4
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +65 -30
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +40 -23
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +14 -2
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/pyproject.toml +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ami-baker.tf +22 -3
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/availability.tf +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/Dockerfile +9 -9
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ecr.tf +73 -4
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/eks.tf +45 -5
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/expiry.tf +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/kubernetes.tf +13 -13
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/availability_updater/index.py +7 -5
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_processor/index.py +80 -21
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/main.tf +26 -2
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy-service.tf +8 -7
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/al2023-user-data.sh +15 -6
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +4 -3
- gpu_dev-0.5.31/PROGRESS.md +0 -288
- gpu_dev-0.5.31/PR_DESCRIPTION.md +0 -168
- gpu_dev-0.5.31/TODO.md +0 -64
- gpu_dev-0.5.31/post.md +0 -233
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/.gitignore +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/admin/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/setup.cfg +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.5.32}/tests/submit/success/run.sh +0 -0
|
@@ -183,6 +183,55 @@ kubectl port-forward -n monitoring svc/kube-prometheus-stack-prometheus 9090:909
|
|
|
183
183
|
kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
|
|
184
184
|
```
|
|
185
185
|
|
|
186
|
+
## Multi-Region Single-State Refactor (Research Notes, May 2026)
|
|
187
|
+
|
|
188
|
+
**Goal:** One `tf apply` manages all regions. No more `tf-all`, no double Docker builds, no double AMI bakes.
|
|
189
|
+
|
|
190
|
+
**Approach:** Module-per-region pattern.
|
|
191
|
+
```hcl
|
|
192
|
+
# root main.tf
|
|
193
|
+
module "us_east_2" {
|
|
194
|
+
source = "./modules/region"
|
|
195
|
+
region = "us-east-2"
|
|
196
|
+
gpu_types = { h100 = {...}, b200 = {...}, ... }
|
|
197
|
+
spot_types = []
|
|
198
|
+
providers = { aws = aws.us_east_2 }
|
|
199
|
+
}
|
|
200
|
+
module "us_east_1" {
|
|
201
|
+
source = "./modules/region"
|
|
202
|
+
region = "us-east-1"
|
|
203
|
+
gpu_types = { b300 = {...}, t4 = {...}, ... }
|
|
204
|
+
spot_types = ["b300", "b200", "h100", ...]
|
|
205
|
+
providers = { aws = aws.us_east_1 }
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
**What goes in the module:** VPC, subnets, EKS cluster, ASGs, launch templates, Lambda functions, DDB tables, EFS, monitoring, DNS. Basically everything in the current root except provider config and shared resources.
|
|
210
|
+
|
|
211
|
+
**What stays at root:** Provider blocks with aliases, ECR replication config, AMI copy (`aws_ami_copy` from primary to secondary regions), global IAM roles if any, CLI config.
|
|
212
|
+
|
|
213
|
+
**AMI sharing:** Build the baked AMI in us-east-2 (primary), then `aws_ami_copy` it to the other regions. One build, replicated. The `ami_baker` stays in root and outputs the AMI ID; each module receives it as a variable.
|
|
214
|
+
|
|
215
|
+
**Docker sharing:** ECR replication already set up. Docker builds once in primary region, auto-replicates.
|
|
216
|
+
|
|
217
|
+
**Migration plan (since nobody uses east1 yet):**
|
|
218
|
+
1. `tofu workspace select prod-east1 && tofu destroy` — clean slate
|
|
219
|
+
2. Move all resources into `modules/region/`
|
|
220
|
+
3. Create provider aliases in root
|
|
221
|
+
4. Import prod (us-east-2) resources into new module state: `tofu import module.us_east_2.aws_vpc.gpu_dev_vpc vpc-xxx`
|
|
222
|
+
5. Add us-east-1 module — fresh create, no import needed
|
|
223
|
+
6. Delete workspace: `tofu workspace delete prod-east1`
|
|
224
|
+
|
|
225
|
+
**Risks:**
|
|
226
|
+
- Import step for prod is tedious (~50+ resources) but mechanical
|
|
227
|
+
- Lambda zip paths need to be relative to module, not root
|
|
228
|
+
- EKS auth (aws-auth ConfigMap) is per-cluster — each module manages its own
|
|
229
|
+
- CLI needs to know which region to query — already handled by config
|
|
230
|
+
|
|
231
|
+
**Estimated effort:** 1 dedicated session (~4-6 hours). Most of the time goes to the module extraction and the prod import.
|
|
232
|
+
|
|
233
|
+
**Prerequisite for:** Adding us-west-1, us-west-2, or any future region (becomes one module block each).
|
|
234
|
+
|
|
186
235
|
## Recent Fixes (Oct 27, 2025)
|
|
187
236
|
|
|
188
237
|
**NVIDIA Profiling Bootstrap Configuration (Oct 27, 2025):**
|
|
@@ -232,6 +281,9 @@ kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
|
|
|
232
281
|
|
|
233
282
|
### 📋 Remaining Tasks
|
|
234
283
|
|
|
284
|
+
- **Merge multi-region into single tf state** - HIGH PRIORITY. Kill prod-east1 workspace, refactor into module-per-region in one state. See the "Multi-Region Single-State Refactor" research notes above. Enables: one `tf apply`, shared AMI (aws_ami_copy), shared Docker (ECR replication already set up), no double builds. Prerequisite for adding west regions.
|
|
285
|
+
- **Add us-west-1 and us-west-2 spot regions** - BLOCKED on single-state refactor. After refactor, adding a region = adding one module block.
|
|
286
|
+
- **Spot UX improvements** - Queue position should be #1 for each type (not cross-type FIFO). Status should show "queued (waiting for capacity)", not just "queued". The interactive picker should show spot GPU counts from east1, not prod.
|
|
235
287
|
- **FQDN for devservers** - Set up proper domain names for development server access
|
|
236
288
|
- **Automated SSH config per reservation** - ✅ DONE - Each reservation now gets `~/.devgpu/<reservation_id>-sshconfig` file, use with `ssh -F ~/.devgpu/<reservation_id>-sshconfig <pod_name>`
|
|
237
289
|
- **Custom Docker image scaffold** - Create Dockerfile with pre-installed packages (Jupyter, etc.)
|
|
@@ -526,7 +526,7 @@ def main(ctx: click.Context) -> None:
|
|
|
526
526
|
"--gpu-type",
|
|
527
527
|
"-t",
|
|
528
528
|
type=click.Choice(
|
|
529
|
-
["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
529
|
+
["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"], case_sensitive=False
|
|
530
530
|
),
|
|
531
531
|
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
|
|
532
532
|
)
|
|
@@ -698,6 +698,7 @@ def reserve(
|
|
|
698
698
|
"b300": {"max_gpus": 8, "instance_type": "p6-b300.48xlarge"},
|
|
699
699
|
"cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
|
|
700
700
|
"cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
|
|
701
|
+
"cpu-spot": {"max_gpus": 0, "instance_type": "c7i.2xlarge"},
|
|
701
702
|
}
|
|
702
703
|
|
|
703
704
|
# Early validation of GPU type to extract max_gpus (needed for disk selection)
|
|
@@ -1418,7 +1419,7 @@ def reserve(
|
|
|
1418
1419
|
|
|
1419
1420
|
_SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1420
1421
|
"h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
|
|
1421
|
-
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
|
|
1422
|
+
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"]
|
|
1422
1423
|
|
|
1423
1424
|
|
|
1424
1425
|
@main.command(context_settings={"ignore_unknown_options": True})
|
|
@@ -1837,7 +1838,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1837
1838
|
ended = item.get("reservation_ended") or item.get("expired_at") or item.get("created_at", "")
|
|
1838
1839
|
if ended and ended < one_hour_ago:
|
|
1839
1840
|
continue
|
|
1840
|
-
item["_region"] = "
|
|
1841
|
+
item["_region"] = "east1"
|
|
1841
1842
|
results.append(item)
|
|
1842
1843
|
return results
|
|
1843
1844
|
except Exception:
|
|
@@ -1847,11 +1848,45 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1847
1848
|
active_future = executor.submit(fetch_active)
|
|
1848
1849
|
failures_future = executor.submit(fetch_recent_failures)
|
|
1849
1850
|
east1_future = executor.submit(fetch_east1)
|
|
1850
|
-
|
|
1851
|
+
prod_results = active_future.result() + failures_future.result()
|
|
1852
|
+
for r in prod_results:
|
|
1853
|
+
if "_region" not in r:
|
|
1854
|
+
r["_region"] = "prod"
|
|
1855
|
+
east1_results = east1_future.result()
|
|
1856
|
+
for r in east1_results:
|
|
1857
|
+
if "_region" not in r:
|
|
1858
|
+
r["_region"] = "east1"
|
|
1859
|
+
reservations = prod_results + east1_results
|
|
1851
1860
|
else:
|
|
1852
|
-
|
|
1861
|
+
prod_res = reservation_mgr.list_reservations(
|
|
1853
1862
|
user_filter=user_filter, statuses_to_include=statuses_to_include
|
|
1854
1863
|
)
|
|
1864
|
+
for r in prod_res:
|
|
1865
|
+
if "_region" not in r:
|
|
1866
|
+
r["_region"] = "prod"
|
|
1867
|
+
east1_res = fetch_east1() if not status else []
|
|
1868
|
+
if not east1_res:
|
|
1869
|
+
try:
|
|
1870
|
+
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
1871
|
+
if east1_env and config.user_config.get("environment") == "prod":
|
|
1872
|
+
import boto3 as _b3
|
|
1873
|
+
east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
|
|
1874
|
+
east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
|
|
1875
|
+
for s in (statuses_to_include or ["active", "preparing", "queued", "pending"]):
|
|
1876
|
+
resp = east1_table.query(
|
|
1877
|
+
IndexName="StatusIndex",
|
|
1878
|
+
KeyConditionExpression="#s = :status",
|
|
1879
|
+
ExpressionAttributeNames={"#s": "status"},
|
|
1880
|
+
ExpressionAttributeValues={":status": s},
|
|
1881
|
+
)
|
|
1882
|
+
for item in resp.get("Items", []):
|
|
1883
|
+
if user_filter and item.get("user_id") != user_filter:
|
|
1884
|
+
continue
|
|
1885
|
+
item["_region"] = "east1"
|
|
1886
|
+
east1_res.append(item)
|
|
1887
|
+
except Exception:
|
|
1888
|
+
pass
|
|
1889
|
+
reservations = prod_res + east1_res
|
|
1855
1890
|
except RuntimeError as e:
|
|
1856
1891
|
rprint(f"[red]❌ {str(e)}[/red]")
|
|
1857
1892
|
return False
|
|
@@ -1883,7 +1918,8 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1883
1918
|
|
|
1884
1919
|
# Create table with enhanced columns for queue info
|
|
1885
1920
|
# Check if we have cross-region reservations
|
|
1886
|
-
|
|
1921
|
+
_regions = frozenset(r.get("_region", "") for r in reservations if r.get("_region"))
|
|
1922
|
+
_has_multi_region = len(_regions) > 1 or "east1" in _regions
|
|
1887
1923
|
|
|
1888
1924
|
table = Table(title="GPU Reservations")
|
|
1889
1925
|
table.add_column("ID", style="cyan", no_wrap=True)
|
|
@@ -1894,7 +1930,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1894
1930
|
table.add_column("Queue Info", style="cyan")
|
|
1895
1931
|
table.add_column("Created", style="blue")
|
|
1896
1932
|
table.add_column("Expires/ETA", style="red")
|
|
1897
|
-
if
|
|
1933
|
+
if _has_multi_region:
|
|
1898
1934
|
table.add_column("Region", style="dim")
|
|
1899
1935
|
if details:
|
|
1900
1936
|
table.add_column("CLI Ver", style="dim", no_wrap=True)
|
|
@@ -1935,13 +1971,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1935
1971
|
# Use the new helper that shows time + remaining
|
|
1936
1972
|
expires_formatted = _format_expires_with_remaining(expires_at)
|
|
1937
1973
|
elif res_status in ["queued", "pending"]:
|
|
1938
|
-
# Show estimated wait time if available
|
|
1939
1974
|
estimated_wait = reservation.get(
|
|
1940
1975
|
"estimated_wait_minutes", "?")
|
|
1941
|
-
if estimated_wait
|
|
1976
|
+
if estimated_wait and estimated_wait not in ("?", "None", None):
|
|
1942
1977
|
expires_formatted = f"~{estimated_wait}min"
|
|
1943
1978
|
else:
|
|
1944
|
-
expires_formatted = "
|
|
1979
|
+
expires_formatted = "Waiting..."
|
|
1945
1980
|
elif res_status in ("expired", "failed", "cancelled"):
|
|
1946
1981
|
reason = reservation.get("failure_reason", "")
|
|
1947
1982
|
ended = reservation.get("reservation_ended") or reservation.get("expired_at", "")
|
|
@@ -1968,15 +2003,11 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1968
2003
|
# Format queue info for queued reservations
|
|
1969
2004
|
queue_info = ""
|
|
1970
2005
|
if res_status in ["queued", "pending"]:
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
"
|
|
1974
|
-
if queue_position != "?" and queue_position is not None:
|
|
1975
|
-
queue_info = f"#{queue_position}"
|
|
1976
|
-
if estimated_wait != "?" and estimated_wait is not None:
|
|
1977
|
-
queue_info += f" (~{estimated_wait}min)"
|
|
2006
|
+
detail = reservation.get("current_detailed_status") or reservation.get("detailed_status") or ""
|
|
2007
|
+
if "capacity" in detail.lower() or "spot" in detail.lower():
|
|
2008
|
+
queue_info = "Waiting for spot"
|
|
1978
2009
|
else:
|
|
1979
|
-
queue_info = "
|
|
2010
|
+
queue_info = "Spot pending"
|
|
1980
2011
|
elif res_status == "active":
|
|
1981
2012
|
# Show pod IP for multinode, SSH hint for single-node
|
|
1982
2013
|
pod_ip = reservation.get("pod_ip", "")
|
|
@@ -2099,9 +2130,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
2099
2130
|
row_data.append(
|
|
2100
2131
|
f"[dim]{lambda_version_display}[/dim]" if dim_row else lambda_version_display)
|
|
2101
2132
|
|
|
2102
|
-
if
|
|
2103
|
-
region = reservation.get("_region", "
|
|
2104
|
-
|
|
2133
|
+
if _has_multi_region:
|
|
2134
|
+
region = reservation.get("_region", "prod")
|
|
2135
|
+
if region in ("us-east-1", "east1"):
|
|
2136
|
+
row_data.append("[yellow]east1[/yellow]")
|
|
2137
|
+
else:
|
|
2138
|
+
row_data.append("prod")
|
|
2105
2139
|
|
|
2106
2140
|
table.add_row(*row_data)
|
|
2107
2141
|
|
|
@@ -2279,8 +2313,11 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
2279
2313
|
|
|
2280
2314
|
queue_info = ""
|
|
2281
2315
|
if res_status in ["queued", "pending"]:
|
|
2282
|
-
|
|
2283
|
-
|
|
2316
|
+
detail = reservation.get("current_detailed_status") or reservation.get("detailed_status") or ""
|
|
2317
|
+
if "capacity" in detail.lower() or "spot" in detail.lower():
|
|
2318
|
+
queue_info = "Waiting for spot"
|
|
2319
|
+
else:
|
|
2320
|
+
queue_info = "Spot pending"
|
|
2284
2321
|
elif res_status == "active":
|
|
2285
2322
|
queue_info = "Ready"
|
|
2286
2323
|
|
|
@@ -2313,10 +2350,10 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
2313
2350
|
expires_formatted = _format_expires_with_remaining(expires_at)
|
|
2314
2351
|
elif res_status in ["queued", "pending"]:
|
|
2315
2352
|
estimated_wait = reservation.get("estimated_wait_minutes", "?")
|
|
2316
|
-
if estimated_wait
|
|
2353
|
+
if estimated_wait and estimated_wait not in ("?", "None", None):
|
|
2317
2354
|
expires_formatted = f"~{estimated_wait}min"
|
|
2318
2355
|
else:
|
|
2319
|
-
expires_formatted = "
|
|
2356
|
+
expires_formatted = "Waiting..."
|
|
2320
2357
|
else:
|
|
2321
2358
|
expires_formatted = "N/A"
|
|
2322
2359
|
|
|
@@ -2971,7 +3008,7 @@ def _show_availability() -> None:
|
|
|
2971
3008
|
spot_table.add_column("Avail\nNow", style="green")
|
|
2972
3009
|
spot_table.add_column("Per\nNode", style="bright_green")
|
|
2973
3010
|
spot_table.add_column("Status", style="magenta")
|
|
2974
|
-
spot_table.add_column("
|
|
3011
|
+
spot_table.add_column("Spot Discount", style="dim")
|
|
2975
3012
|
_on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
|
|
2976
3013
|
for gt, info in sorted(spot_region_info.items()):
|
|
2977
3014
|
avail = info.get("available", 0)
|
|
@@ -2981,14 +3018,12 @@ def _show_availability() -> None:
|
|
|
2981
3018
|
si = info.get("spot_info", {}) or {}
|
|
2982
3019
|
sp = si.get("spot_price", "") if isinstance(si, dict) else ""
|
|
2983
3020
|
if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
|
|
2984
|
-
avail_signal = "[
|
|
3021
|
+
avail_signal = "[green]Available[/green]" if avail > 0 else "[dim]No price data[/dim]"
|
|
2985
3022
|
else:
|
|
2986
3023
|
try:
|
|
2987
3024
|
ratio = float(sp) / _on_demand.get(gt, 50)
|
|
2988
3025
|
pct = int((1 - ratio) * 100)
|
|
2989
|
-
|
|
2990
|
-
elif ratio < 0.7: avail_signal = f"[yellow]Medium ({pct}% off)[/yellow]"
|
|
2991
|
-
else: avail_signal = f"[red]Low ({pct}% off)[/red]"
|
|
3026
|
+
avail_signal = f"[green]{pct}% off on-demand[/green]" if pct > 0 else "[dim]At on-demand price[/dim]"
|
|
2992
3027
|
except (ValueError, TypeError):
|
|
2993
3028
|
avail_signal = "[yellow]Unknown[/yellow]"
|
|
2994
3029
|
spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
|
|
@@ -26,7 +26,7 @@ class Config:
|
|
|
26
26
|
"region": "us-east-1",
|
|
27
27
|
"workspace": "prod-east1",
|
|
28
28
|
"description": "Spot-only us-east-1 environment (T4/L4/CPU)",
|
|
29
|
-
"spot_types": ["b300", "b200", "h200", "h100", "a100"],
|
|
29
|
+
"spot_types": ["b300", "b200", "h200", "h100", "a100", "t4", "l4", "rtxpro6000"],
|
|
30
30
|
},
|
|
31
31
|
}
|
|
32
32
|
DEFAULT_ENVIRONMENT = "prod"
|
|
@@ -52,11 +52,19 @@ def check_interactive_support() -> bool:
|
|
|
52
52
|
|
|
53
53
|
def select_gpu_type_interactive(
|
|
54
54
|
availability_info: Dict[str, Dict[str, Any]],
|
|
55
|
+
_refresh: bool = False,
|
|
55
56
|
) -> Optional[str]:
|
|
56
57
|
"""Interactive GPU type selection with availability table"""
|
|
57
58
|
if not check_interactive_support():
|
|
58
59
|
return None
|
|
59
60
|
|
|
61
|
+
if _refresh:
|
|
62
|
+
from .reservations import ReservationManager
|
|
63
|
+
from .config import load_config
|
|
64
|
+
_cfg = load_config()
|
|
65
|
+
_mgr = ReservationManager(_cfg)
|
|
66
|
+
availability_info = _mgr.get_gpu_availability_by_type() or availability_info
|
|
67
|
+
|
|
60
68
|
# Hide MIG slice SKUs from the top-level selector — reached via the h100 submenu.
|
|
61
69
|
# Direct `--gpu-type h100-mig-1g` still works for non-interactive scripts.
|
|
62
70
|
visible_info = {
|
|
@@ -194,7 +202,7 @@ def select_gpu_type_interactive(
|
|
|
194
202
|
st.add_column("Avail\nNow", style="green")
|
|
195
203
|
st.add_column("Per\nNode", style="bright_green")
|
|
196
204
|
st.add_column("Status", style="magenta")
|
|
197
|
-
st.add_column("
|
|
205
|
+
st.add_column("Spot Discount", style="dim")
|
|
198
206
|
_on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
|
|
199
207
|
for gt, info in spot_gpus.items():
|
|
200
208
|
avail = info.get("available", 0)
|
|
@@ -205,7 +213,7 @@ def select_gpu_type_interactive(
|
|
|
205
213
|
# Availability signal from spot price vs on-demand
|
|
206
214
|
sp = si.get("spot_price", "") if isinstance(si, dict) else ""
|
|
207
215
|
if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
|
|
208
|
-
avail_signal = "[
|
|
216
|
+
avail_signal = "[green]Available[/green]" if avail > 0 else "[dim]No price data[/dim]"
|
|
209
217
|
else:
|
|
210
218
|
try:
|
|
211
219
|
ratio = float(sp) / _on_demand.get(gt, 50)
|
|
@@ -266,37 +274,46 @@ def select_gpu_type_interactive(
|
|
|
266
274
|
si_data = info.get("spot_info", {}) or {}
|
|
267
275
|
sp = si_data.get("spot_price", "") if isinstance(si_data, dict) else ""
|
|
268
276
|
# Derive availability signal
|
|
277
|
+
avail_now = int(info.get("available", 0))
|
|
269
278
|
if not sp or "No spot data" in str(si_data.get("spot_signal", "")):
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
279
|
+
if avail_now > 0:
|
|
280
|
+
signal = f"🟢 {avail_now} available now"
|
|
281
|
+
else:
|
|
282
|
+
continue
|
|
283
|
+
else:
|
|
284
|
+
try:
|
|
285
|
+
ratio = float(sp) / _on_demand.get(gt, 50)
|
|
286
|
+
pct = int((1 - ratio) * 100)
|
|
287
|
+
if ratio < 0.4: signal = f"🟢 High avail ({pct}% off)"
|
|
288
|
+
elif ratio < 0.7: signal = f"🟡 Medium ({pct}% off)"
|
|
289
|
+
else: signal = f"🔴 Low ({pct}% off)"
|
|
290
|
+
except (ValueError, TypeError):
|
|
291
|
+
signal = "availability unknown"
|
|
280
292
|
if avail > 0:
|
|
281
293
|
label = f"✅ {gt.upper()} * ({avail} free, {pn}/node, {signal})"
|
|
282
294
|
else:
|
|
283
295
|
label = f"⚡ {gt.upper()} * ({pn} GPUs/node, {signal})"
|
|
284
296
|
choices.append(questionary.Choice(title=label, value=f"spot:{gt}"))
|
|
285
297
|
|
|
286
|
-
|
|
298
|
+
choices.append(questionary.Separator("───"))
|
|
299
|
+
choices.append(questionary.Choice(title="🔄 Refresh availability", value="_refresh"))
|
|
287
300
|
|
|
288
|
-
|
|
301
|
+
console.print()
|
|
289
302
|
|
|
290
|
-
# Interactive selection
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
303
|
+
# Interactive selection — loop on refresh
|
|
304
|
+
while True:
|
|
305
|
+
try:
|
|
306
|
+
answer = questionary.select(
|
|
307
|
+
"Select GPU type:", choices=choices, style=custom_style
|
|
308
|
+
).ask()
|
|
295
309
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
310
|
+
if answer == "_refresh":
|
|
311
|
+
console.print("[dim]Refreshing...[/dim]")
|
|
312
|
+
return select_gpu_type_interactive(availability_info, _refresh=True)
|
|
313
|
+
return answer
|
|
314
|
+
except (KeyboardInterrupt, EOFError):
|
|
315
|
+
console.print("\n[yellow]Selection cancelled.[/yellow]")
|
|
316
|
+
return None
|
|
300
317
|
|
|
301
318
|
|
|
302
319
|
def _format_eta_seconds(delta_seconds: int) -> str:
|
|
@@ -826,8 +826,20 @@ class ReservationManager:
|
|
|
826
826
|
]
|
|
827
827
|
|
|
828
828
|
if len(matching_reservations) == 0:
|
|
829
|
-
|
|
830
|
-
|
|
829
|
+
# Not found by user_id — try direct lookup (for added users viewing other's reservations)
|
|
830
|
+
try:
|
|
831
|
+
from boto3.dynamodb.conditions import Key
|
|
832
|
+
scan_resp = self.reservations_table.scan(
|
|
833
|
+
FilterExpression="begins_with(reservation_id, :rid)",
|
|
834
|
+
ExpressionAttributeValues={":rid": reservation_id},
|
|
835
|
+
Limit=10,
|
|
836
|
+
)
|
|
837
|
+
matching_reservations = scan_resp.get("Items", [])
|
|
838
|
+
except Exception:
|
|
839
|
+
pass
|
|
840
|
+
if not matching_reservations:
|
|
841
|
+
return None
|
|
842
|
+
if len(matching_reservations) > 1:
|
|
831
843
|
return None # Ambiguous - need longer prefix
|
|
832
844
|
|
|
833
845
|
reservation = matching_reservations[0]
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.31"
|
|
7
|
+
version = "0.5.32"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -11,6 +11,7 @@ locals {
|
|
|
11
11
|
ami_baker_trigger = sha256(join("\n", [
|
|
12
12
|
data.aws_ami.eks_gpu_ami_x86_64.id,
|
|
13
13
|
filesha256("${path.module}/templates/al2023-user-data.sh"),
|
|
14
|
+
filesha256("${path.module}/templates/ami-baker-user-data.sh"),
|
|
14
15
|
local.latest_image_uri,
|
|
15
16
|
]))
|
|
16
17
|
ami_baker_name = "gpu-dev-baked-${substr(local.ami_baker_trigger, 0, 8)}"
|
|
@@ -19,11 +20,11 @@ locals {
|
|
|
19
20
|
image_uri = local.latest_image_uri
|
|
20
21
|
}))
|
|
21
22
|
|
|
22
|
-
# Use baked AMI when available, fall back to standard.
|
|
23
|
-
gpu_ami_id = length(data.aws_ami_ids.gpu_baked.ids) > 0 ? data.aws_ami_ids.gpu_baked.ids[0] : data.aws_ami.eks_gpu_ami_x86_64.id
|
|
23
|
+
# Use baked AMI when available (checked AFTER baker runs), fall back to standard.
|
|
24
|
+
gpu_ami_id = length(data.aws_ami_ids.gpu_baked_resolved.ids) > 0 ? data.aws_ami_ids.gpu_baked_resolved.ids[0] : data.aws_ami.eks_gpu_ami_x86_64.id
|
|
24
25
|
}
|
|
25
26
|
|
|
26
|
-
#
|
|
27
|
+
# Pre-build check: does the baked AMI already exist? Controls whether baker runs.
|
|
27
28
|
data "aws_ami_ids" "gpu_baked" {
|
|
28
29
|
owners = ["self"]
|
|
29
30
|
|
|
@@ -39,6 +40,24 @@ data "aws_ami_ids" "gpu_baked" {
|
|
|
39
40
|
sort_ascending = false
|
|
40
41
|
}
|
|
41
42
|
|
|
43
|
+
# Post-build lookup: re-reads AFTER the baker finishes, so a freshly built AMI
|
|
44
|
+
# is picked up in the same apply (no second apply needed).
|
|
45
|
+
data "aws_ami_ids" "gpu_baked_resolved" {
|
|
46
|
+
depends_on = [null_resource.ami_baker]
|
|
47
|
+
owners = ["self"]
|
|
48
|
+
|
|
49
|
+
filter {
|
|
50
|
+
name = "name"
|
|
51
|
+
values = [local.ami_baker_name]
|
|
52
|
+
}
|
|
53
|
+
filter {
|
|
54
|
+
name = "state"
|
|
55
|
+
values = ["available"]
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
sort_ascending = false
|
|
59
|
+
}
|
|
60
|
+
|
|
42
61
|
# Build the baked AMI when inputs change
|
|
43
62
|
resource "null_resource" "ami_baker" {
|
|
44
63
|
# Only run when the target AMI doesn't exist yet
|
|
@@ -48,7 +48,7 @@ resource "aws_lambda_function" "availability_updater" {
|
|
|
48
48
|
EKS_CLUSTER_NAME = aws_eks_cluster.gpu_dev_cluster.name
|
|
49
49
|
REGION = local.current_config.aws_region
|
|
50
50
|
SPOT_GPU_TYPES = lookup({
|
|
51
|
-
"prod-east1" = "b300,b200,h200,h100,a100"
|
|
51
|
+
"prod-east1" = "b300,b200,h200,h100,a100,t4,l4,rtxpro6000,cpu-spot"
|
|
52
52
|
}, terraform.workspace, "")
|
|
53
53
|
ASG_NAME_PREFIX = "${var.prefix}-gpu-nodes"
|
|
54
54
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Custom PyTorch GPU Development Server Image
|
|
2
|
-
# Based on pytorch/pytorch:2.
|
|
3
|
-
FROM pytorch/pytorch:2.
|
|
2
|
+
# Based on pytorch/pytorch:2.12.0-cuda13.2-cudnn9-devel
|
|
3
|
+
FROM pytorch/pytorch:2.12.0-cuda13.2-cudnn9-devel
|
|
4
4
|
|
|
5
5
|
# Set environment variables for non-interactive installation
|
|
6
6
|
ENV DEBIAN_FRONTEND=noninteractive
|
|
@@ -42,22 +42,22 @@ RUN for attempt in 1 2 3; do \
|
|
|
42
42
|
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
|
|
43
43
|
apt-get install -y nodejs
|
|
44
44
|
|
|
45
|
-
# Install CUDA
|
|
45
|
+
# Install older CUDA toolkits alongside base CUDA 13.2
|
|
46
46
|
# Base image already has NVIDIA repo configured, no need for cuda-keyring
|
|
47
47
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
48
|
+
cuda-toolkit-12-8 \
|
|
48
49
|
cuda-toolkit-12-9 \
|
|
49
50
|
cuda-toolkit-13-0 \
|
|
50
51
|
cuda-toolkit-13-1 \
|
|
51
|
-
cuda-toolkit-13-2 \
|
|
52
52
|
&& apt-get clean \
|
|
53
53
|
&& rm -rf /var/lib/apt/lists/*
|
|
54
54
|
|
|
55
|
-
# CUDA
|
|
55
|
+
# CUDA 13.2 is the default (PyTorch 2.12 compiled against it)
|
|
56
56
|
# All versions available at /usr/local/cuda-{12.8,12.9,13.0,13.1,13.2}/
|
|
57
|
-
# Switch with: export CUDA_HOME=/usr/local/cuda-
|
|
58
|
-
ENV CUDA_HOME=/usr/local/cuda-
|
|
59
|
-
ENV PATH=/usr/local/cuda-
|
|
60
|
-
ENV LD_LIBRARY_PATH=/usr/local/cuda-
|
|
57
|
+
# Switch with: export CUDA_HOME=/usr/local/cuda-12.8
|
|
58
|
+
ENV CUDA_HOME=/usr/local/cuda-13.2
|
|
59
|
+
ENV PATH=/usr/local/cuda-13.2/bin:${PATH}
|
|
60
|
+
ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/lib64:${LD_LIBRARY_PATH}
|
|
61
61
|
|
|
62
62
|
# Install EFA stack (prebuilt libfabric + OpenMPI + aws-ofi-nccl with GPU/RDMA support)
|
|
63
63
|
# Uses AWS EFA installer which bundles tested, compatible versions of all components
|