gpu-dev 0.5.31__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/CLAUDE.md +52 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/PKG-INFO +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -4
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +106 -44
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +7 -6
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +15 -2
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +40 -23
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +19 -5
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/pyproject.toml +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ami-baker.tf +22 -3
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/availability.tf +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/Dockerfile +9 -9
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ecr.tf +73 -4
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/eks.tf +45 -5
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/expiry.tf +1 -1
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/kubernetes.tf +13 -13
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/availability_updater/index.py +7 -5
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py +258 -170
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda.tf +29 -5
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/main.tf +31 -5
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy-service.tf +8 -7
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/al2023-user-data.sh +102 -10
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +7 -3
- gpu_dev-0.5.31/PROGRESS.md +0 -288
- gpu_dev-0.5.31/PR_DESCRIPTION.md +0 -168
- gpu_dev-0.5.31/TODO.md +0 -64
- gpu_dev-0.5.31/post.md +0 -233
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/.gitignore +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/admin/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/setup.cfg +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.31 → gpu_dev-0.6.0}/tests/submit/success/run.sh +0 -0
|
@@ -183,6 +183,55 @@ kubectl port-forward -n monitoring svc/kube-prometheus-stack-prometheus 9090:909
|
|
|
183
183
|
kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
|
|
184
184
|
```
|
|
185
185
|
|
|
186
|
+
## Multi-Region Single-State Refactor (Research Notes, May 2026)
|
|
187
|
+
|
|
188
|
+
**Goal:** One `tf apply` manages all regions. No more `tf-all`, no double Docker builds, no double AMI bakes.
|
|
189
|
+
|
|
190
|
+
**Approach:** Module-per-region pattern.
|
|
191
|
+
```hcl
|
|
192
|
+
# root main.tf
|
|
193
|
+
module "us_east_2" {
|
|
194
|
+
source = "./modules/region"
|
|
195
|
+
region = "us-east-2"
|
|
196
|
+
gpu_types = { h100 = {...}, b200 = {...}, ... }
|
|
197
|
+
spot_types = []
|
|
198
|
+
providers = { aws = aws.us_east_2 }
|
|
199
|
+
}
|
|
200
|
+
module "us_east_1" {
|
|
201
|
+
source = "./modules/region"
|
|
202
|
+
region = "us-east-1"
|
|
203
|
+
gpu_types = { b300 = {...}, t4 = {...}, ... }
|
|
204
|
+
spot_types = ["b300", "b200", "h100", ...]
|
|
205
|
+
providers = { aws = aws.us_east_1 }
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
**What goes in the module:** VPC, subnets, EKS cluster, ASGs, launch templates, Lambda functions, DDB tables, EFS, monitoring, DNS. Basically everything in the current root except provider config and shared resources.
|
|
210
|
+
|
|
211
|
+
**What stays at root:** Provider blocks with aliases, ECR replication config, AMI copy (`aws_ami_copy` from primary to secondary regions), global IAM roles if any, CLI config.
|
|
212
|
+
|
|
213
|
+
**AMI sharing:** Build the baked AMI once in us-east-2 (primary), then `aws_ami_copy` it to the other regions. One build, replicated. The `ami_baker` stays in root and outputs the AMI ID; each region module receives that ID as a variable.
|
|
214
|
+
|
|
215
|
+
**Docker sharing:** ECR replication already set up. Docker builds once in primary region, auto-replicates.
|
|
216
|
+
|
|
217
|
+
**Migration plan (since nobody uses east1 yet):**
|
|
218
|
+
1. `tofu workspace select prod-east1 && tofu destroy` — clean slate
|
|
219
|
+
2. Move all resources into `modules/region/`
|
|
220
|
+
3. Create provider aliases in root
|
|
221
|
+
4. Import prod (us-east-2) resources into new module state: `tofu import module.us_east_2.aws_vpc.gpu_dev_vpc vpc-xxx`
|
|
222
|
+
5. Add us-east-1 module — fresh create, no import needed
|
|
223
|
+
6. Delete workspace: `tofu workspace delete prod-east1`
|
|
224
|
+
|
|
225
|
+
**Risks:**
|
|
226
|
+
- Import step for prod is tedious (~50+ resources) but mechanical
|
|
227
|
+
- Lambda zip paths need to be relative to module, not root
|
|
228
|
+
- EKS auth (aws-auth ConfigMap) is per-cluster — each module manages its own
|
|
229
|
+
- CLI needs to know which region to query — already handled by config
|
|
230
|
+
|
|
231
|
+
**Estimated effort:** 1 dedicated session (~4-6 hours). Most of that time goes to the module extraction and the prod import.
|
|
232
|
+
|
|
233
|
+
**Prerequisite for:** Adding us-west-1, us-west-2, or any future region (becomes one module block each).
|
|
234
|
+
|
|
186
235
|
## Recent Fixes (Oct 27, 2025)
|
|
187
236
|
|
|
188
237
|
**NVIDIA Profiling Bootstrap Configuration (Oct 27, 2025):**
|
|
@@ -232,6 +281,9 @@ kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
|
|
|
232
281
|
|
|
233
282
|
### 📋 Remaining Tasks
|
|
234
283
|
|
|
284
|
+
- **Merge multi-region into single tf state** - HIGH PRIORITY. Kill prod-east1 workspace, refactor into module-per-region in one state. See the "Multi-Region Single-State Refactor" research notes above. Enables: one `tf apply`, shared AMI (aws_ami_copy), shared Docker (ECR replication already set up), no double builds. Prerequisite for adding west regions.
|
|
285
|
+
- **Add us-west-1 and us-west-2 spot regions** - BLOCKED on single-state refactor. After refactor, adding a region = adding one module block.
|
|
286
|
+
- **Spot UX improvements** - Queue position should be counted per GPU type, so the first request for each type shows #1 (not one cross-type FIFO). Status should show "queued (waiting for capacity)", not just "queued". Interactive picker should show spot GPU counts from east1, not prod.
|
|
235
287
|
- **FQDN for devservers** - Set up proper domain names for development server access
|
|
236
288
|
- **Automated SSH config per reservation** - ✅ DONE - Each reservation now gets `~/.devgpu/<reservation_id>-sshconfig` file, use with `ssh -F ~/.devgpu/<reservation_id>-sshconfig <pod_name>`
|
|
237
289
|
- **Custom Docker image scaffold** - Create Dockerfile with pre-installed packages (Jupyter, etc.)
|
|
@@ -526,7 +526,7 @@ def main(ctx: click.Context) -> None:
|
|
|
526
526
|
"--gpu-type",
|
|
527
527
|
"-t",
|
|
528
528
|
type=click.Choice(
|
|
529
|
-
["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
529
|
+
["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"], case_sensitive=False
|
|
530
530
|
),
|
|
531
531
|
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
|
|
532
532
|
)
|
|
@@ -698,6 +698,7 @@ def reserve(
|
|
|
698
698
|
"b300": {"max_gpus": 8, "instance_type": "p6-b300.48xlarge"},
|
|
699
699
|
"cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
|
|
700
700
|
"cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
|
|
701
|
+
"cpu-spot": {"max_gpus": 0, "instance_type": "c7i.2xlarge"},
|
|
701
702
|
}
|
|
702
703
|
|
|
703
704
|
# Early validation of GPU type to extract max_gpus (needed for disk selection)
|
|
@@ -896,6 +897,13 @@ def reserve(
|
|
|
896
897
|
|
|
897
898
|
else:
|
|
898
899
|
# Non-interactive mode - use defaults and validate
|
|
900
|
+
# Route --spot to east1 when on prod (env vars override config region)
|
|
901
|
+
if spot and load_config().user_config.get("environment") == "prod":
|
|
902
|
+
east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
903
|
+
if east1_cfg:
|
|
904
|
+
import os as _os
|
|
905
|
+
_os.environ["AWS_REGION"] = east1_cfg["region"]
|
|
906
|
+
|
|
899
907
|
if gpu_type is None:
|
|
900
908
|
gpu_type = "a100"
|
|
901
909
|
if hours is None:
|
|
@@ -1418,7 +1426,7 @@ def reserve(
|
|
|
1418
1426
|
|
|
1419
1427
|
_SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1420
1428
|
"h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
|
|
1421
|
-
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
|
|
1429
|
+
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"]
|
|
1422
1430
|
|
|
1423
1431
|
|
|
1424
1432
|
@main.command(context_settings={"ignore_unknown_options": True})
|
|
@@ -1837,7 +1845,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1837
1845
|
ended = item.get("reservation_ended") or item.get("expired_at") or item.get("created_at", "")
|
|
1838
1846
|
if ended and ended < one_hour_ago:
|
|
1839
1847
|
continue
|
|
1840
|
-
item["_region"] = "
|
|
1848
|
+
item["_region"] = "east1"
|
|
1841
1849
|
results.append(item)
|
|
1842
1850
|
return results
|
|
1843
1851
|
except Exception:
|
|
@@ -1847,11 +1855,45 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1847
1855
|
active_future = executor.submit(fetch_active)
|
|
1848
1856
|
failures_future = executor.submit(fetch_recent_failures)
|
|
1849
1857
|
east1_future = executor.submit(fetch_east1)
|
|
1850
|
-
|
|
1858
|
+
prod_results = active_future.result() + failures_future.result()
|
|
1859
|
+
for r in prod_results:
|
|
1860
|
+
if "_region" not in r:
|
|
1861
|
+
r["_region"] = "prod"
|
|
1862
|
+
east1_results = east1_future.result()
|
|
1863
|
+
for r in east1_results:
|
|
1864
|
+
if "_region" not in r:
|
|
1865
|
+
r["_region"] = "east1"
|
|
1866
|
+
reservations = prod_results + east1_results
|
|
1851
1867
|
else:
|
|
1852
|
-
|
|
1868
|
+
prod_res = reservation_mgr.list_reservations(
|
|
1853
1869
|
user_filter=user_filter, statuses_to_include=statuses_to_include
|
|
1854
1870
|
)
|
|
1871
|
+
for r in prod_res:
|
|
1872
|
+
if "_region" not in r:
|
|
1873
|
+
r["_region"] = "prod"
|
|
1874
|
+
east1_res = fetch_east1() if not status else []
|
|
1875
|
+
if not east1_res:
|
|
1876
|
+
try:
|
|
1877
|
+
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
1878
|
+
if east1_env and config.user_config.get("environment") == "prod":
|
|
1879
|
+
import boto3 as _b3
|
|
1880
|
+
east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
|
|
1881
|
+
east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
|
|
1882
|
+
for s in (statuses_to_include or ["active", "preparing", "queued", "pending"]):
|
|
1883
|
+
resp = east1_table.query(
|
|
1884
|
+
IndexName="StatusIndex",
|
|
1885
|
+
KeyConditionExpression="#s = :status",
|
|
1886
|
+
ExpressionAttributeNames={"#s": "status"},
|
|
1887
|
+
ExpressionAttributeValues={":status": s},
|
|
1888
|
+
)
|
|
1889
|
+
for item in resp.get("Items", []):
|
|
1890
|
+
if user_filter and item.get("user_id") != user_filter:
|
|
1891
|
+
continue
|
|
1892
|
+
item["_region"] = "east1"
|
|
1893
|
+
east1_res.append(item)
|
|
1894
|
+
except Exception:
|
|
1895
|
+
pass
|
|
1896
|
+
reservations = prod_res + east1_res
|
|
1855
1897
|
except RuntimeError as e:
|
|
1856
1898
|
rprint(f"[red]❌ {str(e)}[/red]")
|
|
1857
1899
|
return False
|
|
@@ -1883,7 +1925,8 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1883
1925
|
|
|
1884
1926
|
# Create table with enhanced columns for queue info
|
|
1885
1927
|
# Check if we have cross-region reservations
|
|
1886
|
-
|
|
1928
|
+
_regions = frozenset(r.get("_region", "") for r in reservations if r.get("_region"))
|
|
1929
|
+
_has_multi_region = len(_regions) > 1 or "east1" in _regions
|
|
1887
1930
|
|
|
1888
1931
|
table = Table(title="GPU Reservations")
|
|
1889
1932
|
table.add_column("ID", style="cyan", no_wrap=True)
|
|
@@ -1894,7 +1937,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1894
1937
|
table.add_column("Queue Info", style="cyan")
|
|
1895
1938
|
table.add_column("Created", style="blue")
|
|
1896
1939
|
table.add_column("Expires/ETA", style="red")
|
|
1897
|
-
if
|
|
1940
|
+
if _has_multi_region:
|
|
1898
1941
|
table.add_column("Region", style="dim")
|
|
1899
1942
|
if details:
|
|
1900
1943
|
table.add_column("CLI Ver", style="dim", no_wrap=True)
|
|
@@ -1935,13 +1978,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1935
1978
|
# Use the new helper that shows time + remaining
|
|
1936
1979
|
expires_formatted = _format_expires_with_remaining(expires_at)
|
|
1937
1980
|
elif res_status in ["queued", "pending"]:
|
|
1938
|
-
# Show estimated wait time if available
|
|
1939
1981
|
estimated_wait = reservation.get(
|
|
1940
1982
|
"estimated_wait_minutes", "?")
|
|
1941
|
-
if estimated_wait
|
|
1983
|
+
if estimated_wait and estimated_wait not in ("?", "None", None):
|
|
1942
1984
|
expires_formatted = f"~{estimated_wait}min"
|
|
1943
1985
|
else:
|
|
1944
|
-
expires_formatted = "
|
|
1986
|
+
expires_formatted = "Waiting..."
|
|
1945
1987
|
elif res_status in ("expired", "failed", "cancelled"):
|
|
1946
1988
|
reason = reservation.get("failure_reason", "")
|
|
1947
1989
|
ended = reservation.get("reservation_ended") or reservation.get("expired_at", "")
|
|
@@ -1968,15 +2010,11 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1968
2010
|
# Format queue info for queued reservations
|
|
1969
2011
|
queue_info = ""
|
|
1970
2012
|
if res_status in ["queued", "pending"]:
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
"
|
|
1974
|
-
if queue_position != "?" and queue_position is not None:
|
|
1975
|
-
queue_info = f"#{queue_position}"
|
|
1976
|
-
if estimated_wait != "?" and estimated_wait is not None:
|
|
1977
|
-
queue_info += f" (~{estimated_wait}min)"
|
|
2013
|
+
detail = reservation.get("current_detailed_status") or reservation.get("detailed_status") or ""
|
|
2014
|
+
if "capacity" in detail.lower() or "spot" in detail.lower():
|
|
2015
|
+
queue_info = "Waiting for spot"
|
|
1978
2016
|
else:
|
|
1979
|
-
queue_info = "
|
|
2017
|
+
queue_info = "Spot pending"
|
|
1980
2018
|
elif res_status == "active":
|
|
1981
2019
|
# Show pod IP for multinode, SSH hint for single-node
|
|
1982
2020
|
pod_ip = reservation.get("pod_ip", "")
|
|
@@ -2099,9 +2137,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
2099
2137
|
row_data.append(
|
|
2100
2138
|
f"[dim]{lambda_version_display}[/dim]" if dim_row else lambda_version_display)
|
|
2101
2139
|
|
|
2102
|
-
if
|
|
2103
|
-
region = reservation.get("_region", "
|
|
2104
|
-
|
|
2140
|
+
if _has_multi_region:
|
|
2141
|
+
region = reservation.get("_region", "prod")
|
|
2142
|
+
if region in ("us-east-1", "east1"):
|
|
2143
|
+
row_data.append("[yellow]east1[/yellow]")
|
|
2144
|
+
else:
|
|
2145
|
+
row_data.append("prod")
|
|
2105
2146
|
|
|
2106
2147
|
table.add_row(*row_data)
|
|
2107
2148
|
|
|
@@ -2279,8 +2320,11 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
2279
2320
|
|
|
2280
2321
|
queue_info = ""
|
|
2281
2322
|
if res_status in ["queued", "pending"]:
|
|
2282
|
-
|
|
2283
|
-
|
|
2323
|
+
detail = reservation.get("current_detailed_status") or reservation.get("detailed_status") or ""
|
|
2324
|
+
if "capacity" in detail.lower() or "spot" in detail.lower():
|
|
2325
|
+
queue_info = "Waiting for spot"
|
|
2326
|
+
else:
|
|
2327
|
+
queue_info = "Spot pending"
|
|
2284
2328
|
elif res_status == "active":
|
|
2285
2329
|
queue_info = "Ready"
|
|
2286
2330
|
|
|
@@ -2313,10 +2357,10 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
2313
2357
|
expires_formatted = _format_expires_with_remaining(expires_at)
|
|
2314
2358
|
elif res_status in ["queued", "pending"]:
|
|
2315
2359
|
estimated_wait = reservation.get("estimated_wait_minutes", "?")
|
|
2316
|
-
if estimated_wait
|
|
2360
|
+
if estimated_wait and estimated_wait not in ("?", "None", None):
|
|
2317
2361
|
expires_formatted = f"~{estimated_wait}min"
|
|
2318
2362
|
else:
|
|
2319
|
-
expires_formatted = "
|
|
2363
|
+
expires_formatted = "Waiting..."
|
|
2320
2364
|
else:
|
|
2321
2365
|
expires_formatted = "N/A"
|
|
2322
2366
|
|
|
@@ -2531,10 +2575,21 @@ def cancel(
|
|
|
2531
2575
|
with Live(
|
|
2532
2576
|
Spinner("dots", text="📡 Cancelling reservations..."), console=console
|
|
2533
2577
|
) as live:
|
|
2578
|
+
# Build east1 reservation manager for cross-region cancellations
|
|
2579
|
+
east1_mgr = None
|
|
2580
|
+
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
2581
|
+
if east1_env:
|
|
2582
|
+
import os as _os
|
|
2583
|
+
_east1_config = Config()
|
|
2584
|
+
_east1_config.aws_region = east1_env["region"]
|
|
2585
|
+
east1_mgr = ReservationManager(_east1_config)
|
|
2586
|
+
|
|
2534
2587
|
for reservation in reservations:
|
|
2535
2588
|
res_id = reservation.get("reservation_id", "")
|
|
2536
2589
|
if res_id:
|
|
2537
|
-
|
|
2590
|
+
# Use east1 manager for east1 reservations
|
|
2591
|
+
mgr = east1_mgr if reservation.get("_region") in ("east1", "us-east-1") and east1_mgr else reservation_mgr
|
|
2592
|
+
success = mgr.cancel_reservation(
|
|
2538
2593
|
res_id, user_info["user_id"]
|
|
2539
2594
|
)
|
|
2540
2595
|
if success:
|
|
@@ -2971,7 +3026,7 @@ def _show_availability() -> None:
|
|
|
2971
3026
|
spot_table.add_column("Avail\nNow", style="green")
|
|
2972
3027
|
spot_table.add_column("Per\nNode", style="bright_green")
|
|
2973
3028
|
spot_table.add_column("Status", style="magenta")
|
|
2974
|
-
spot_table.add_column("
|
|
3029
|
+
spot_table.add_column("Spot Discount", style="dim")
|
|
2975
3030
|
_on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
|
|
2976
3031
|
for gt, info in sorted(spot_region_info.items()):
|
|
2977
3032
|
avail = info.get("available", 0)
|
|
@@ -2981,14 +3036,12 @@ def _show_availability() -> None:
|
|
|
2981
3036
|
si = info.get("spot_info", {}) or {}
|
|
2982
3037
|
sp = si.get("spot_price", "") if isinstance(si, dict) else ""
|
|
2983
3038
|
if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
|
|
2984
|
-
avail_signal = "[
|
|
3039
|
+
avail_signal = "[green]Available[/green]" if avail > 0 else "[dim]No price data[/dim]"
|
|
2985
3040
|
else:
|
|
2986
3041
|
try:
|
|
2987
3042
|
ratio = float(sp) / _on_demand.get(gt, 50)
|
|
2988
3043
|
pct = int((1 - ratio) * 100)
|
|
2989
|
-
|
|
2990
|
-
elif ratio < 0.7: avail_signal = f"[yellow]Medium ({pct}% off)[/yellow]"
|
|
2991
|
-
else: avail_signal = f"[red]Low ({pct}% off)[/red]"
|
|
3044
|
+
avail_signal = f"[green]{pct}% off on-demand[/green]" if pct > 0 else "[dim]At on-demand price[/dim]"
|
|
2992
3045
|
except (ValueError, TypeError):
|
|
2993
3046
|
avail_signal = "[yellow]Unknown[/yellow]"
|
|
2994
3047
|
spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
|
|
@@ -3266,21 +3319,30 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3266
3319
|
|
|
3267
3320
|
live.start()
|
|
3268
3321
|
|
|
3269
|
-
#
|
|
3270
|
-
_sel = next((r for r in (locals().get("reservations") or []) if r.get("reservation_id", "").startswith(reservation_id)), None)
|
|
3271
|
-
if _sel and _sel.get("_region") == "us-east-1":
|
|
3272
|
-
import os as _os
|
|
3273
|
-
east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
3274
|
-
_os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
|
|
3275
|
-
_east1_config = Config()
|
|
3276
|
-
_east1_config.aws_region = east1_cfg["region"]
|
|
3277
|
-
reservation_mgr = ReservationManager(_east1_config)
|
|
3278
|
-
|
|
3279
|
-
# Get connection info
|
|
3322
|
+
# Try current region first, then cross-region if not found
|
|
3280
3323
|
connection_info = reservation_mgr.get_connection_info(
|
|
3281
3324
|
reservation_id, user_info["user_id"]
|
|
3282
3325
|
)
|
|
3283
3326
|
|
|
3327
|
+
# If not found, try the other region
|
|
3328
|
+
if not connection_info:
|
|
3329
|
+
import os as _os
|
|
3330
|
+
current_env = config.user_config.get("environment", "prod")
|
|
3331
|
+
other_envs = {"prod": "prod-east1", "prod-east1": "prod"}
|
|
3332
|
+
other_env_name = other_envs.get(current_env)
|
|
3333
|
+
if other_env_name:
|
|
3334
|
+
other_env = Config.ENVIRONMENTS.get(other_env_name, {})
|
|
3335
|
+
if other_env:
|
|
3336
|
+
_os.environ["AWS_DEFAULT_REGION"] = other_env["region"]
|
|
3337
|
+
_other_config = Config()
|
|
3338
|
+
_other_config.aws_region = other_env["region"]
|
|
3339
|
+
other_mgr = ReservationManager(_other_config)
|
|
3340
|
+
connection_info = other_mgr.get_connection_info(
|
|
3341
|
+
reservation_id, user_info["user_id"]
|
|
3342
|
+
)
|
|
3343
|
+
if connection_info:
|
|
3344
|
+
reservation_mgr = other_mgr
|
|
3345
|
+
|
|
3284
3346
|
live.stop()
|
|
3285
3347
|
|
|
3286
3348
|
if not connection_info:
|
|
@@ -3829,7 +3891,7 @@ def set(key: str, value: str) -> None:
|
|
|
3829
3891
|
|
|
3830
3892
|
|
|
3831
3893
|
@config.command()
|
|
3832
|
-
@click.argument("env_name", type=click.Choice(["test", "prod"
|
|
3894
|
+
@click.argument("env_name", type=click.Choice(["test", "prod"]))
|
|
3833
3895
|
def environment(env_name: str) -> None:
|
|
3834
3896
|
"""Set the environment
|
|
3835
3897
|
|
|
@@ -3841,7 +3903,7 @@ def environment(env_name: str) -> None:
|
|
|
3841
3903
|
\b
|
|
3842
3904
|
Examples:
|
|
3843
3905
|
gpu-dev config environment prod # Production (us-east-2)
|
|
3844
|
-
gpu-dev config environment prod
|
|
3906
|
+
gpu-dev config environment prod # Production (spot accessible via interactive picker)
|
|
3845
3907
|
gpu-dev config environment test # Test (us-west-1)
|
|
3846
3908
|
|
|
3847
3909
|
Environment configurations:
|
|
@@ -26,7 +26,7 @@ class Config:
|
|
|
26
26
|
"region": "us-east-1",
|
|
27
27
|
"workspace": "prod-east1",
|
|
28
28
|
"description": "Spot-only us-east-1 environment (T4/L4/CPU)",
|
|
29
|
-
"spot_types": ["b300", "b200", "h200", "h100", "a100"],
|
|
29
|
+
"spot_types": ["b300", "b200", "h200", "h100", "a100", "t4", "l4", "rtxpro6000"],
|
|
30
30
|
},
|
|
31
31
|
}
|
|
32
32
|
DEFAULT_ENVIRONMENT = "prod"
|
|
@@ -42,13 +42,14 @@ class Config:
|
|
|
42
42
|
# Load unified config (handles migration from legacy files)
|
|
43
43
|
self.user_config = self._load_config()
|
|
44
44
|
|
|
45
|
-
# Get region
|
|
46
|
-
|
|
45
|
+
# Get region: env vars take priority (for spot routing), then config, then default
|
|
46
|
+
env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
|
|
47
|
+
if env_region and env_region != self.user_config.get("region"):
|
|
48
|
+
self.aws_region = env_region
|
|
49
|
+
elif self.user_config.get("region"):
|
|
47
50
|
self.aws_region = self.user_config["region"]
|
|
48
51
|
else:
|
|
49
|
-
self.aws_region =
|
|
50
|
-
"AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-2")
|
|
51
|
-
)
|
|
52
|
+
self.aws_region = "us-east-2"
|
|
52
53
|
|
|
53
54
|
os.environ["AWS_DEFAULT_REGION"] = self.aws_region
|
|
54
55
|
|
|
@@ -355,8 +355,21 @@ def unlock_disk(disk_name: str, user_id: str, config: Config) -> bool:
|
|
|
355
355
|
return False
|
|
356
356
|
|
|
357
357
|
if not disk['in_use']:
|
|
358
|
-
|
|
359
|
-
|
|
358
|
+
# DDB says not locked — but check if EBS volume is still physically attached
|
|
359
|
+
try:
|
|
360
|
+
ec2 = config.session.client('ec2', region_name=config.aws_region)
|
|
361
|
+
vols = ec2.describe_volumes(Filters=[
|
|
362
|
+
{"Name": "tag:gpu-dev-user", "Values": [user_id]},
|
|
363
|
+
{"Name": "tag:disk_name", "Values": [disk_name]},
|
|
364
|
+
{"Name": "status", "Values": ["in-use"]},
|
|
365
|
+
]).get("Volumes", [])
|
|
366
|
+
if not vols:
|
|
367
|
+
print(f"Disk '{disk_name}' is not locked")
|
|
368
|
+
return False
|
|
369
|
+
print(f"Disk '{disk_name}' DDB lock is clear but EBS volume is still attached — sending force-detach request")
|
|
370
|
+
except Exception:
|
|
371
|
+
print(f"Disk '{disk_name}' is not locked")
|
|
372
|
+
return False
|
|
360
373
|
|
|
361
374
|
operation_id = str(uuid.uuid4())
|
|
362
375
|
|
|
@@ -52,11 +52,19 @@ def check_interactive_support() -> bool:
|
|
|
52
52
|
|
|
53
53
|
def select_gpu_type_interactive(
|
|
54
54
|
availability_info: Dict[str, Dict[str, Any]],
|
|
55
|
+
_refresh: bool = False,
|
|
55
56
|
) -> Optional[str]:
|
|
56
57
|
"""Interactive GPU type selection with availability table"""
|
|
57
58
|
if not check_interactive_support():
|
|
58
59
|
return None
|
|
59
60
|
|
|
61
|
+
if _refresh:
|
|
62
|
+
from .reservations import ReservationManager
|
|
63
|
+
from .config import load_config
|
|
64
|
+
_cfg = load_config()
|
|
65
|
+
_mgr = ReservationManager(_cfg)
|
|
66
|
+
availability_info = _mgr.get_gpu_availability_by_type() or availability_info
|
|
67
|
+
|
|
60
68
|
# Hide MIG slice SKUs from the top-level selector — reached via the h100 submenu.
|
|
61
69
|
# Direct `--gpu-type h100-mig-1g` still works for non-interactive scripts.
|
|
62
70
|
visible_info = {
|
|
@@ -194,7 +202,7 @@ def select_gpu_type_interactive(
|
|
|
194
202
|
st.add_column("Avail\nNow", style="green")
|
|
195
203
|
st.add_column("Per\nNode", style="bright_green")
|
|
196
204
|
st.add_column("Status", style="magenta")
|
|
197
|
-
st.add_column("
|
|
205
|
+
st.add_column("Spot Discount", style="dim")
|
|
198
206
|
_on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
|
|
199
207
|
for gt, info in spot_gpus.items():
|
|
200
208
|
avail = info.get("available", 0)
|
|
@@ -205,7 +213,7 @@ def select_gpu_type_interactive(
|
|
|
205
213
|
# Availability signal from spot price vs on-demand
|
|
206
214
|
sp = si.get("spot_price", "") if isinstance(si, dict) else ""
|
|
207
215
|
if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
|
|
208
|
-
avail_signal = "[
|
|
216
|
+
avail_signal = "[green]Available[/green]" if avail > 0 else "[dim]No price data[/dim]"
|
|
209
217
|
else:
|
|
210
218
|
try:
|
|
211
219
|
ratio = float(sp) / _on_demand.get(gt, 50)
|
|
@@ -266,37 +274,46 @@ def select_gpu_type_interactive(
|
|
|
266
274
|
si_data = info.get("spot_info", {}) or {}
|
|
267
275
|
sp = si_data.get("spot_price", "") if isinstance(si_data, dict) else ""
|
|
268
276
|
# Derive availability signal
|
|
277
|
+
avail_now = int(info.get("available", 0))
|
|
269
278
|
if not sp or "No spot data" in str(si_data.get("spot_signal", "")):
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
279
|
+
if avail_now > 0:
|
|
280
|
+
signal = f"🟢 {avail_now} available now"
|
|
281
|
+
else:
|
|
282
|
+
continue
|
|
283
|
+
else:
|
|
284
|
+
try:
|
|
285
|
+
ratio = float(sp) / _on_demand.get(gt, 50)
|
|
286
|
+
pct = int((1 - ratio) * 100)
|
|
287
|
+
if ratio < 0.4: signal = f"🟢 High avail ({pct}% off)"
|
|
288
|
+
elif ratio < 0.7: signal = f"🟡 Medium ({pct}% off)"
|
|
289
|
+
else: signal = f"🔴 Low ({pct}% off)"
|
|
290
|
+
except (ValueError, TypeError):
|
|
291
|
+
signal = "availability unknown"
|
|
280
292
|
if avail > 0:
|
|
281
293
|
label = f"✅ {gt.upper()} * ({avail} free, {pn}/node, {signal})"
|
|
282
294
|
else:
|
|
283
295
|
label = f"⚡ {gt.upper()} * ({pn} GPUs/node, {signal})"
|
|
284
296
|
choices.append(questionary.Choice(title=label, value=f"spot:{gt}"))
|
|
285
297
|
|
|
286
|
-
|
|
298
|
+
choices.append(questionary.Separator("───"))
|
|
299
|
+
choices.append(questionary.Choice(title="🔄 Refresh availability", value="_refresh"))
|
|
287
300
|
|
|
288
|
-
|
|
301
|
+
console.print()
|
|
289
302
|
|
|
290
|
-
# Interactive selection
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
303
|
+
# Interactive selection — loop on refresh
|
|
304
|
+
while True:
|
|
305
|
+
try:
|
|
306
|
+
answer = questionary.select(
|
|
307
|
+
"Select GPU type:", choices=choices, style=custom_style
|
|
308
|
+
).ask()
|
|
295
309
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
310
|
+
if answer == "_refresh":
|
|
311
|
+
console.print("[dim]Refreshing...[/dim]")
|
|
312
|
+
return select_gpu_type_interactive(availability_info, _refresh=True)
|
|
313
|
+
return answer
|
|
314
|
+
except (KeyboardInterrupt, EOFError):
|
|
315
|
+
console.print("\n[yellow]Selection cancelled.[/yellow]")
|
|
316
|
+
return None
|
|
300
317
|
|
|
301
318
|
|
|
302
319
|
def _format_eta_seconds(delta_seconds: int) -> str:
|
|
@@ -826,8 +826,20 @@ class ReservationManager:
|
|
|
826
826
|
]
|
|
827
827
|
|
|
828
828
|
if len(matching_reservations) == 0:
|
|
829
|
-
|
|
830
|
-
|
|
829
|
+
# Not found by user_id — try direct lookup (for added users viewing other's reservations)
|
|
830
|
+
try:
|
|
831
|
+
from boto3.dynamodb.conditions import Key
|
|
832
|
+
scan_resp = self.reservations_table.scan(
|
|
833
|
+
FilterExpression="begins_with(reservation_id, :rid)",
|
|
834
|
+
ExpressionAttributeValues={":rid": reservation_id},
|
|
835
|
+
Limit=10,
|
|
836
|
+
)
|
|
837
|
+
matching_reservations = scan_resp.get("Items", [])
|
|
838
|
+
except Exception:
|
|
839
|
+
pass
|
|
840
|
+
if not matching_reservations:
|
|
841
|
+
return None
|
|
842
|
+
if len(matching_reservations) > 1:
|
|
831
843
|
return None # Ambiguous - need longer prefix
|
|
832
844
|
|
|
833
845
|
reservation = matching_reservations[0]
|
|
@@ -1689,6 +1701,7 @@ class ReservationManager:
|
|
|
1689
1701
|
initial_text = f"📡 Starting multinode reservation..." if is_multinode else "🔄 Sending reservation request..."
|
|
1690
1702
|
spinner = Spinner("dots", text=initial_text)
|
|
1691
1703
|
live.update(spinner)
|
|
1704
|
+
poll_delay = 0.5 # start fast, back off over time
|
|
1692
1705
|
|
|
1693
1706
|
while (
|
|
1694
1707
|
(timeout_seconds is None or time.time() -
|
|
@@ -1749,7 +1762,7 @@ class ReservationManager:
|
|
|
1749
1762
|
if not is_multinode:
|
|
1750
1763
|
spinner.text = "📡 Waiting for reservation status update..."
|
|
1751
1764
|
live.update(spinner)
|
|
1752
|
-
time.sleep(
|
|
1765
|
+
time.sleep(0.5)
|
|
1753
1766
|
continue
|
|
1754
1767
|
else:
|
|
1755
1768
|
node_details.append({
|
|
@@ -2281,8 +2294,9 @@ class ReservationManager:
|
|
|
2281
2294
|
|
|
2282
2295
|
return None
|
|
2283
2296
|
|
|
2284
|
-
#
|
|
2285
|
-
time.sleep(
|
|
2297
|
+
# Poll with backoff: 0.5s → 1s → 1.5s → 2s → 3s (cap)
|
|
2298
|
+
time.sleep(poll_delay)
|
|
2299
|
+
poll_delay = min(poll_delay + 0.5, 3.0)
|
|
2286
2300
|
|
|
2287
2301
|
except Exception as e:
|
|
2288
2302
|
console.print(
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.6.0"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|