gpu-dev 0.7.1__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/CLAUDE.md +1 -1
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/PKG-INFO +1 -1
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +43 -12
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +115 -165
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/pyproject.toml +1 -1
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/__init__.py +1 -1
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/.gitignore +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/README.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/admin/README.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/admin/generate_stats.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/admin/requirements.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/docs/SDK_REPRO.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/architecture.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/cli-demo.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/devgpu-features.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/feedback.png +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/gpu-fleet.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/index.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/k8s-under-the-hood.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/multinode.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/osdc-future-plans.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/problem.png +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/sandbox.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/sdk-demo.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/thesis.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/title-vid.mp4 +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/weneedgpus.png +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/presentation/wow.html +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/README.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/examples/batch_multi_gpu.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/examples/interactive_debug.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/examples/parallel_experiments.ipynb +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/examples/quickstart.ipynb +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/examples/run_tests.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/examples/submit_job.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/_backend/aws.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/common/config.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/common/enums.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/common/errors.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/common/models.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/src/gpu_dev/py.typed +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/tests/__init__.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/sdk/python/tests/test_models.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/setup.cfg +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-deck/backend.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-deck/main.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-deck/terraform.tfvars.example +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/build-node.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/lambda.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/pytorch-prebuild.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/terraform-gpu-devservers/warm-pool.tf +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/tests/submit/README.md +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.7.1 → gpu_dev-0.7.3}/tests/submit/success/run.sh +0 -0
|
@@ -318,7 +318,7 @@ module "us_east_1" {
|
|
|
318
318
|
|
|
319
319
|
- **Merge multi-region into single tf state** - HIGH PRIORITY. Kill prod-east1 workspace, refactor into module-per-region in one state. See research notes below. Enables: one `tf apply`, shared AMI (aws_ami_copy), shared Docker (ECR replication already set up), no double builds. Prerequisite for adding west regions.
|
|
320
320
|
- **Add us-west-1 and us-west-2 spot regions** - BLOCKED on single-state refactor. After refactor, adding a region = adding one module block.
|
|
321
|
-
- **Spot UX improvements** - Queue position should be #1 for each type (not cross-type FIFO). Status should show "queued (waiting for capacity)" not just "queued". Interactive picker should show spot GPU counts from east1 not prod.
|
|
321
|
+
- **Spot UX improvements** - Queue position should be #1 for each type (not cross-type FIFO). Status should show "queued (waiting for capacity)" not just "queued". Interactive picker should show spot GPU counts from east1 not prod. NOTE (2026-05-30): spot is now **hidden by default** in `gpu-dev reserve` (interactive picker), `gpu-dev avail`, and watch mode — `cpu-spot` + the us-east-1 spot cluster only appear with `--spot` (reserve/avail flag) or the "⚡ Show spot options" picker entry. Spot was too bloated/half-baked for the default view. CLI-only change (`cli.py` `_show_availability`/`_show_availability_watch`/`avail`/`reserve`, `interactive.py` `select_gpu_type_interactive`).
|
|
322
322
|
- **FQDN for devservers** - Set up proper domain names for development server access
|
|
323
323
|
- **Automated SSH config per reservation** - ✅ DONE - Each reservation now gets `~/.devgpu/<reservation_id>-sshconfig` file, use with `ssh -F ~/.devgpu/<reservation_id>-sshconfig <pod_name>`
|
|
324
324
|
- **Custom Docker image scaffold** - Create Dockerfile with pre-installed packages (Jupyter, etc.)
|
|
@@ -829,9 +829,9 @@ def reserve(
|
|
|
829
829
|
rprint("[red]❌ Could not get GPU availability information[/red]")
|
|
830
830
|
return
|
|
831
831
|
|
|
832
|
-
# Interactive GPU type selection
|
|
832
|
+
# Interactive GPU type selection (spot hidden unless --spot)
|
|
833
833
|
if gpu_type is None:
|
|
834
|
-
gpu_type = select_gpu_type_interactive(availability_info)
|
|
834
|
+
gpu_type = select_gpu_type_interactive(availability_info, show_spot=spot)
|
|
835
835
|
if gpu_type is None:
|
|
836
836
|
rprint("[yellow]Reservation cancelled.[/yellow]")
|
|
837
837
|
return
|
|
@@ -3163,8 +3163,11 @@ def _format_gpu_display(gpu_count, gpu_type):
|
|
|
3163
3163
|
return f"{gpu_count}x {str(gpu_type).upper()}"
|
|
3164
3164
|
|
|
3165
3165
|
|
|
3166
|
-
def _show_availability() -> None:
|
|
3167
|
-
"""Shared function to show GPU availability
|
|
3166
|
+
def _show_availability(show_spot: bool = False) -> None:
|
|
3167
|
+
"""Shared function to show GPU availability.
|
|
3168
|
+
|
|
3169
|
+
Spot SKUs (cpu-spot + the us-east-1 spot cluster) are hidden unless show_spot.
|
|
3170
|
+
"""
|
|
3168
3171
|
try:
|
|
3169
3172
|
with Live(
|
|
3170
3173
|
Spinner("dots", text="📡 Checking GPU availability..."), console=console
|
|
@@ -3181,7 +3184,7 @@ def _show_availability() -> None:
|
|
|
3181
3184
|
_east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
|
|
3182
3185
|
|
|
3183
3186
|
def _fetch_east1_spot():
|
|
3184
|
-
if _env_name != "prod" or not _east1_spot_types:
|
|
3187
|
+
if not show_spot or _env_name != "prod" or not _east1_spot_types:
|
|
3185
3188
|
return {}
|
|
3186
3189
|
east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
|
|
3187
3190
|
east1_table = config.session.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability")
|
|
@@ -3247,8 +3250,16 @@ def _show_availability() -> None:
|
|
|
3247
3250
|
"CPU (arm64)": 6,
|
|
3248
3251
|
}
|
|
3249
3252
|
|
|
3250
|
-
# Split into categories
|
|
3251
|
-
|
|
3253
|
+
# Split into categories. Hide spot SKUs (e.g. cpu-spot) unless --spot,
|
|
3254
|
+
# but never hide everything if the env is spot-only.
|
|
3255
|
+
def _is_spot(k):
|
|
3256
|
+
return k == "cpu-spot" or k.endswith("-spot")
|
|
3257
|
+
_non_spot_exists = any(not _is_spot(k) for k in availability_info if "mig" not in k)
|
|
3258
|
+
_hide_spot = (not show_spot) and _non_spot_exists
|
|
3259
|
+
full_types = {
|
|
3260
|
+
k: v for k, v in availability_info.items()
|
|
3261
|
+
if "mig" not in k and not (_hide_spot and _is_spot(k))
|
|
3262
|
+
}
|
|
3252
3263
|
mig_types = {k: v for k, v in availability_info.items() if "mig" in k}
|
|
3253
3264
|
|
|
3254
3265
|
def _sort_by_arch(items):
|
|
@@ -3344,8 +3355,12 @@ def _show_availability() -> None:
|
|
|
3344
3355
|
rprint(" [green]●[/green]: 1+ full node available - [yellow]●[/yellow]: GPUs available, but no full node - [red]●[/red]: No GPUs available")
|
|
3345
3356
|
|
|
3346
3357
|
# Show usage tip
|
|
3358
|
+
if _hide_spot:
|
|
3359
|
+
rprint(
|
|
3360
|
+
"\n[dim]💡 Spot instances hidden — pass '--spot' to show (us-east-1, ~70% cheaper, may be preempted)[/dim]"
|
|
3361
|
+
)
|
|
3347
3362
|
rprint(
|
|
3348
|
-
"\n[dim]💡 Use 'gpu-dev reserve' (interactive) to see all options including MIG slices
|
|
3363
|
+
"\n[dim]💡 Use 'gpu-dev reserve' (interactive) to see all options including MIG slices[/dim]"
|
|
3349
3364
|
)
|
|
3350
3365
|
|
|
3351
3366
|
else:
|
|
@@ -3355,10 +3370,13 @@ def _show_availability() -> None:
|
|
|
3355
3370
|
rprint(f"[red]❌ Error: {str(e)}[/red]")
|
|
3356
3371
|
|
|
3357
3372
|
|
|
3358
|
-
def _show_availability_watch(interval: int) -> None:
|
|
3373
|
+
def _show_availability_watch(interval: int, show_spot: bool = False) -> None:
|
|
3359
3374
|
_env_name = load_config().user_config.get("environment", "prod")
|
|
3360
3375
|
_spot_types = frozenset(Config.ENVIRONMENTS.get(_env_name, {}).get("spot_types", []))
|
|
3361
3376
|
|
|
3377
|
+
def _is_spot(k):
|
|
3378
|
+
return k == "cpu-spot" or k.endswith("-spot")
|
|
3379
|
+
|
|
3362
3380
|
"""Watch mode for GPU availability with auto-refresh"""
|
|
3363
3381
|
import time
|
|
3364
3382
|
from datetime import datetime
|
|
@@ -3385,6 +3403,13 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
3385
3403
|
# Get availability data
|
|
3386
3404
|
availability_info = reservation_mgr.get_gpu_availability_by_type()
|
|
3387
3405
|
|
|
3406
|
+
# Hide spot SKUs (e.g. cpu-spot) unless --spot, never hide everything.
|
|
3407
|
+
if availability_info and not show_spot:
|
|
3408
|
+
if any(not _is_spot(k) for k in availability_info if "mig" not in k):
|
|
3409
|
+
availability_info = {
|
|
3410
|
+
k: v for k, v in availability_info.items() if not _is_spot(k)
|
|
3411
|
+
}
|
|
3412
|
+
|
|
3388
3413
|
if availability_info:
|
|
3389
3414
|
# GPU architecture mapping (for display)
|
|
3390
3415
|
gpu_architectures = {
|
|
@@ -4024,8 +4049,14 @@ def help(ctx: click.Context) -> None:
|
|
|
4024
4049
|
default=5,
|
|
4025
4050
|
help="Refresh interval in seconds for watch mode (default: 5)",
|
|
4026
4051
|
)
|
|
4052
|
+
@click.option(
|
|
4053
|
+
"--spot",
|
|
4054
|
+
is_flag=True,
|
|
4055
|
+
default=False,
|
|
4056
|
+
help="Also show spot instances (us-east-1, ~70% cheaper, may be preempted). Hidden by default.",
|
|
4057
|
+
)
|
|
4027
4058
|
@click.pass_context
|
|
4028
|
-
def avail(ctx: click.Context, watch: bool, interval: int) -> None:
|
|
4059
|
+
def avail(ctx: click.Context, watch: bool, interval: int, spot: bool) -> None:
|
|
4029
4060
|
"""Show GPU availability by type and queue estimates
|
|
4030
4061
|
|
|
4031
4062
|
Displays real-time information about GPU availability for each GPU type.
|
|
@@ -4045,9 +4076,9 @@ def avail(ctx: click.Context, watch: bool, interval: int) -> None:
|
|
|
4045
4076
|
This helps you choose the right GPU type and understand wait times before reserving.
|
|
4046
4077
|
"""
|
|
4047
4078
|
if watch:
|
|
4048
|
-
_show_availability_watch(interval)
|
|
4079
|
+
_show_availability_watch(interval, show_spot=spot)
|
|
4049
4080
|
else:
|
|
4050
|
-
_show_availability()
|
|
4081
|
+
_show_availability(show_spot=spot)
|
|
4051
4082
|
|
|
4052
4083
|
|
|
4053
4084
|
@main.command()
|
|
@@ -50,11 +50,22 @@ def check_interactive_support() -> bool:
|
|
|
50
50
|
return True
|
|
51
51
|
|
|
52
52
|
|
|
53
|
+
def _is_spot_type(gt: str) -> bool:
|
|
54
|
+
"""Spot SKUs hidden from default views: the cpu-spot type + any `*-spot` type."""
|
|
55
|
+
return gt == "cpu-spot" or gt.endswith("-spot")
|
|
56
|
+
|
|
57
|
+
|
|
53
58
|
def select_gpu_type_interactive(
|
|
54
59
|
availability_info: Dict[str, Dict[str, Any]],
|
|
55
60
|
_refresh: bool = False,
|
|
61
|
+
show_spot: bool = False,
|
|
56
62
|
) -> Optional[str]:
|
|
57
|
-
"""Interactive GPU type selection with availability table
|
|
63
|
+
"""Interactive GPU type selection with availability table.
|
|
64
|
+
|
|
65
|
+
Spot SKUs (cpu-spot + the cross-region us-east-1 spot cluster) are hidden by
|
|
66
|
+
default — pass show_spot=True (CLI `--spot`) or pick the "Show spot options"
|
|
67
|
+
entry to reveal them.
|
|
68
|
+
"""
|
|
58
69
|
if not check_interactive_support():
|
|
59
70
|
return None
|
|
60
71
|
|
|
@@ -65,33 +76,19 @@ def select_gpu_type_interactive(
|
|
|
65
76
|
_mgr = ReservationManager(_cfg)
|
|
66
77
|
availability_info = _mgr.get_gpu_availability_by_type() or availability_info
|
|
67
78
|
|
|
79
|
+
# Don't hide spot when the whole environment is spot-only (nothing left to show).
|
|
80
|
+
_non_spot_exists = any(
|
|
81
|
+
not _is_spot_type(gt) for gt in availability_info if "-mig-" not in gt
|
|
82
|
+
)
|
|
83
|
+
_hide_spot = (not show_spot) and _non_spot_exists
|
|
84
|
+
|
|
68
85
|
# Hide MIG slice SKUs from the top-level selector — reached via the h100 submenu.
|
|
69
86
|
# Direct `--gpu-type h100-mig-1g` still works for non-interactive scripts.
|
|
70
87
|
visible_info = {
|
|
71
88
|
gt: info for gt, info in availability_info.items()
|
|
72
|
-
if "-mig-" not in gt
|
|
89
|
+
if "-mig-" not in gt and not (_hide_spot and _is_spot_type(gt))
|
|
73
90
|
}
|
|
74
91
|
|
|
75
|
-
# Aggregate MIG slice availability per parent type, hinted on the h100/b200 rows.
|
|
76
|
-
def _mig_aggregates(parent: str):
|
|
77
|
-
avail = sum(
|
|
78
|
-
int(info.get("available", 0))
|
|
79
|
-
for gt, info in (availability_info or {}).items()
|
|
80
|
-
if gt.startswith(f"{parent}-mig-")
|
|
81
|
-
)
|
|
82
|
-
cap = sum(
|
|
83
|
-
int(info.get("total", 0))
|
|
84
|
-
for gt, info in (availability_info or {}).items()
|
|
85
|
-
if gt.startswith(f"{parent}-mig-")
|
|
86
|
-
)
|
|
87
|
-
return avail, cap
|
|
88
|
-
|
|
89
|
-
h100_mig_avail, h100_mig_capacity = _mig_aggregates("h100")
|
|
90
|
-
b200_mig_avail, b200_mig_capacity = _mig_aggregates("b200")
|
|
91
|
-
# Backwards-compat aliases for the existing h100 row code below.
|
|
92
|
-
mig_total_available = h100_mig_avail
|
|
93
|
-
mig_total_capacity = h100_mig_capacity
|
|
94
|
-
|
|
95
92
|
# Detect spot types and fetch cross-region spot availability
|
|
96
93
|
from .config import Config, load_config
|
|
97
94
|
_cfg = load_config()
|
|
@@ -102,9 +99,10 @@ def select_gpu_type_interactive(
|
|
|
102
99
|
has_spot_types = len(_spot_types) > 0
|
|
103
100
|
|
|
104
101
|
# Cross-region: if we're on prod, also fetch prod-east1 spot availability
|
|
102
|
+
# (skipped entirely when spot is hidden — saves a DynamoDB scan).
|
|
105
103
|
spot_region_info = {}
|
|
106
104
|
spot_region_name = None
|
|
107
|
-
if _env_name == "prod":
|
|
105
|
+
if _env_name == "prod" and not _hide_spot:
|
|
108
106
|
east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
|
|
109
107
|
if east1_env:
|
|
110
108
|
spot_region_name = "prod-east1"
|
|
@@ -130,16 +128,11 @@ def select_gpu_type_interactive(
|
|
|
130
128
|
except Exception as e:
|
|
131
129
|
pass # east1 not accessible — show without spot
|
|
132
130
|
|
|
133
|
-
#
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
for gt, info in visible_info.items():
|
|
137
|
-
if "mig" in gt:
|
|
138
|
-
mig_gpus[gt] = info
|
|
139
|
-
else:
|
|
140
|
-
full_gpus[gt] = info
|
|
131
|
+
# visible_info already excludes -mig- SKUs and (when hidden) spot, so these are
|
|
132
|
+
# all "full" rows; MIG slices render as a sub-row under their parent.
|
|
133
|
+
full_gpus = dict(visible_info)
|
|
141
134
|
|
|
142
|
-
# Spot types from cross-region (prod-east1)
|
|
135
|
+
# Spot types from cross-region (prod-east1).
|
|
143
136
|
spot_gpus = {k: v for k, v in spot_region_info.items() if k in _spot_types}
|
|
144
137
|
|
|
145
138
|
def _format_wait(available, est_wait):
|
|
@@ -154,162 +147,119 @@ def select_gpu_type_interactive(
|
|
|
154
147
|
return f"{h}h{f' {m}min' if m else ''}", "⏳"
|
|
155
148
|
return "Unknown", "⚠️"
|
|
156
149
|
|
|
157
|
-
def
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
150
|
+
def _mig_breakdown(parent):
|
|
151
|
+
"""Compact per-slice availability for a parent, e.g. (['12×1G','4×2G'], 16, 32)."""
|
|
152
|
+
parts, tot_a, tot_c = [], 0, 0
|
|
153
|
+
for cgt, ci in sorted((availability_info or {}).items()):
|
|
154
|
+
if not cgt.startswith(f"{parent}-mig-"):
|
|
155
|
+
continue
|
|
156
|
+
a, c = int(ci.get("available", 0)), int(ci.get("total", 0))
|
|
157
|
+
tot_a += a
|
|
158
|
+
tot_c += c
|
|
159
|
+
parts.append(f"{a}×{cgt.rsplit('-', 1)[-1].upper()}")
|
|
160
|
+
return parts, tot_a, tot_c
|
|
161
|
+
|
|
162
|
+
# ── The selectable list IS the table ──────────────────────────────────────
|
|
163
|
+
# questionary indents Separators and Choices identically, so a Separator
|
|
164
|
+
# header + aligned column text line up with the selectable rows. Arrow keys
|
|
165
|
+
# move through the table; Enter picks the highlighted row. No separate print.
|
|
166
|
+
def _row_cells(gt, info, is_spot=False):
|
|
167
|
+
avail = int(info.get("available", 0))
|
|
168
|
+
wd, emoji = _format_wait(avail, info.get("estimated_wait_minutes", 0))
|
|
169
|
+
ql = int(info.get("queue_length", 0))
|
|
170
|
+
if ql > 0:
|
|
171
|
+
wd += f" · {ql} queued"
|
|
172
|
+
typ = f"{gt.upper()} *" if is_spot else gt.upper()
|
|
173
|
+
return [typ, str(avail), str(int(info.get("max_reservable", 0))),
|
|
174
|
+
str(int(info.get("total", 0)))], f"{emoji} {wd}"
|
|
175
|
+
|
|
176
|
+
# Rows: (cells[type, avail, maxres, total], status, value, kind).
|
|
177
|
+
data_rows = []
|
|
178
|
+
for gt, info in full_gpus.items():
|
|
179
|
+
if info.get("maintenance", False):
|
|
180
|
+
data_rows.append((
|
|
181
|
+
[gt.upper(), "-", "-", str(int(info.get("total", 0)))],
|
|
182
|
+
f"MAINTENANCE: {info.get('maintenance_reason', '')}", gt, "maint"))
|
|
183
|
+
continue
|
|
184
|
+
cells, status = _row_cells(gt, info)
|
|
185
|
+
data_rows.append((cells, status, gt, "gpu"))
|
|
186
|
+
parts, mig_a, mig_c = _mig_breakdown(gt)
|
|
187
|
+
if parts:
|
|
188
|
+
data_rows.append((
|
|
189
|
+
[" └─ MIG", str(mig_a), "-", str(mig_c)],
|
|
190
|
+
f"{' '.join(parts)} · pick {gt.upper()} ↑", None, "mig"))
|
|
191
|
+
|
|
192
|
+
spot_data = []
|
|
197
193
|
if spot_gpus:
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
st = Table()
|
|
201
|
-
st.add_column("GPU Type", style="cyan")
|
|
202
|
-
st.add_column("Avail\nNow", style="green")
|
|
203
|
-
st.add_column("Per\nNode", style="bright_green")
|
|
204
|
-
st.add_column("Status", style="magenta")
|
|
205
|
-
st.add_column("Spot Discount", style="dim")
|
|
206
|
-
_on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
|
|
194
|
+
_pn = {"b300": 8, "b200": 8, "h200": 8, "h100": 8, "a100": 8, "t4": 4, "l4": 4}
|
|
195
|
+
_od = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
|
|
207
196
|
for gt, info in spot_gpus.items():
|
|
208
|
-
avail = info.get("available", 0)
|
|
209
|
-
pn = spot_per_node.get(gt, 8)
|
|
210
|
-
ad = f"[green]{avail}[/green]" if avail > 0 else "[dim]0[/dim]"
|
|
211
|
-
status = "[green]Node up[/green]" if avail > 0 else "Spins up on reserve (~10 min)"
|
|
197
|
+
avail = int(info.get("available", 0))
|
|
212
198
|
si = info.get("spot_info", {}) or {}
|
|
213
|
-
# Availability signal from spot price vs on-demand
|
|
214
199
|
sp = si.get("spot_price", "") if isinstance(si, dict) else ""
|
|
215
|
-
if not sp or
|
|
216
|
-
|
|
200
|
+
if not sp or "No spot data" in str(si.get("spot_signal", "")):
|
|
201
|
+
if avail <= 0:
|
|
202
|
+
continue
|
|
203
|
+
disc = "available now"
|
|
217
204
|
else:
|
|
218
205
|
try:
|
|
219
|
-
|
|
220
|
-
pct = int((1 - ratio) * 100)
|
|
221
|
-
if ratio < 0.4:
|
|
222
|
-
avail_signal = f"[green]High ({pct}% off)[/green]"
|
|
223
|
-
elif ratio < 0.7:
|
|
224
|
-
avail_signal = f"[yellow]Medium ({pct}% off)[/yellow]"
|
|
225
|
-
else:
|
|
226
|
-
avail_signal = f"[red]Low ({pct}% off)[/red]"
|
|
206
|
+
disc = f"~{int((1 - float(sp) / _od.get(gt, 50)) * 100)}% off"
|
|
227
207
|
except (ValueError, TypeError):
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
208
|
+
disc = "spot price n/a"
|
|
209
|
+
status = ("✅ node up" if avail > 0 else "⚡ spins up ~10min") + f" · {disc}"
|
|
210
|
+
spot_data.append((
|
|
211
|
+
[f"{gt.upper()} *", str(avail), f"{_pn.get(gt, 8)}/node", "-"],
|
|
212
|
+
status, f"spot:{gt}", "spot"))
|
|
233
213
|
|
|
234
|
-
#
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
214
|
+
# Column widths over the 4 text columns (header + all rows).
|
|
215
|
+
headers = ["GPU Type", "Avail", "MaxRes", "Total"]
|
|
216
|
+
_all_cells = [headers] + [r[0] for r in data_rows] + [s[0] for s in spot_data]
|
|
217
|
+
widths = [max(len(str(row[i])) for row in _all_cells) for i in range(4)]
|
|
218
|
+
|
|
219
|
+
def _fmt(cells, status=""):
|
|
220
|
+
body = " ".join(str(c).ljust(widths[i]) for i, c in enumerate(cells))
|
|
221
|
+
return f"{body} {status}".rstrip()
|
|
222
|
+
|
|
223
|
+
console.print()
|
|
224
|
+
choices = [questionary.Separator(_fmt(headers, "Status"))]
|
|
225
|
+
if not data_rows:
|
|
226
|
+
choices.append(questionary.Separator("(no GPU types available)"))
|
|
227
|
+
for cells, status, value, kind in data_rows:
|
|
228
|
+
title = _fmt(cells, status)
|
|
229
|
+
if kind == "mig":
|
|
230
|
+
choices.append(questionary.Separator(title))
|
|
231
|
+
elif kind == "maint":
|
|
232
|
+
choices.append(questionary.Choice(title=title, value=value, disabled="maintenance"))
|
|
248
233
|
else:
|
|
249
|
-
|
|
250
|
-
if ql > 0:
|
|
251
|
-
label += f" - {ql} in queue"
|
|
252
|
-
if gt == "h100" and mig_total_capacity > 0:
|
|
253
|
-
label += f" — also {mig_total_available}/{mig_total_capacity} MIG slices"
|
|
254
|
-
elif gt == "b200" and b200_mig_capacity > 0:
|
|
255
|
-
label += f" — also {b200_mig_avail}/{b200_mig_capacity} MIG slices"
|
|
256
|
-
choices.append(questionary.Choice(title=label, value=gt))
|
|
257
|
-
|
|
258
|
-
if mig_gpus:
|
|
259
|
-
choices.append(questionary.Separator("═══ 🔬 MIG Slices (fractional GPUs) ═══"))
|
|
260
|
-
for gt, info in mig_gpus.items():
|
|
261
|
-
avail = info.get("available", 0)
|
|
262
|
-
total = info.get("total", 0)
|
|
263
|
-
_, si = _format_wait(avail, info.get("estimated_wait_minutes", 0))
|
|
264
|
-
choices.append(questionary.Choice(
|
|
265
|
-
title=f"{si} {gt.upper()} ({avail}/{total} available)", value=gt))
|
|
234
|
+
choices.append(questionary.Choice(title=title, value=value))
|
|
266
235
|
|
|
267
|
-
if
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
for gt, info in spot_gpus.items():
|
|
272
|
-
avail = info.get("available", 0)
|
|
273
|
-
pn = _spot_per_node.get(gt, 8)
|
|
274
|
-
si_data = info.get("spot_info", {}) or {}
|
|
275
|
-
sp = si_data.get("spot_price", "") if isinstance(si_data, dict) else ""
|
|
276
|
-
# Derive availability signal
|
|
277
|
-
avail_now = int(info.get("available", 0))
|
|
278
|
-
if not sp or "No spot data" in str(si_data.get("spot_signal", "")):
|
|
279
|
-
if avail_now > 0:
|
|
280
|
-
signal = f"🟢 {avail_now} available now"
|
|
281
|
-
else:
|
|
282
|
-
continue
|
|
283
|
-
else:
|
|
284
|
-
try:
|
|
285
|
-
ratio = float(sp) / _on_demand.get(gt, 50)
|
|
286
|
-
pct = int((1 - ratio) * 100)
|
|
287
|
-
if ratio < 0.4: signal = f"🟢 High avail ({pct}% off)"
|
|
288
|
-
elif ratio < 0.7: signal = f"🟡 Medium ({pct}% off)"
|
|
289
|
-
else: signal = f"🔴 Low ({pct}% off)"
|
|
290
|
-
except (ValueError, TypeError):
|
|
291
|
-
signal = "availability unknown"
|
|
292
|
-
if avail > 0:
|
|
293
|
-
label = f"✅ {gt.upper()} * ({avail} free, {pn}/node, {signal})"
|
|
294
|
-
else:
|
|
295
|
-
label = f"⚡ {gt.upper()} * ({pn} GPUs/node, {signal})"
|
|
296
|
-
choices.append(questionary.Choice(title=label, value=f"spot:{gt}"))
|
|
236
|
+
if spot_data:
|
|
237
|
+
choices.append(questionary.Separator("⚡ Spot — us-east-1, ~70% cheaper, may be preempted:"))
|
|
238
|
+
for cells, status, value, _kind in spot_data:
|
|
239
|
+
choices.append(questionary.Choice(title=_fmt(cells, status), value=value))
|
|
297
240
|
|
|
298
241
|
choices.append(questionary.Separator("───"))
|
|
242
|
+
if _hide_spot:
|
|
243
|
+
choices.append(questionary.Choice(
|
|
244
|
+
title="⚡ Show spot options (us-east-1, ~70% cheaper, may be preempted)",
|
|
245
|
+
value="_show_spot"))
|
|
299
246
|
choices.append(questionary.Choice(title="🔄 Refresh availability", value="_refresh"))
|
|
300
247
|
|
|
301
248
|
console.print()
|
|
302
249
|
|
|
303
|
-
# Interactive selection — loop on refresh
|
|
250
|
+
# Interactive selection — loop on refresh / spot toggle
|
|
304
251
|
while True:
|
|
305
252
|
try:
|
|
306
253
|
answer = questionary.select(
|
|
307
|
-
"Select GPU type:", choices=choices, style=custom_style
|
|
254
|
+
"Select GPU type (↑/↓, Enter):", choices=choices, style=custom_style
|
|
308
255
|
).ask()
|
|
309
256
|
|
|
310
257
|
if answer == "_refresh":
|
|
311
258
|
console.print("[dim]Refreshing...[/dim]")
|
|
312
|
-
return select_gpu_type_interactive(
|
|
259
|
+
return select_gpu_type_interactive(
|
|
260
|
+
availability_info, _refresh=True, show_spot=show_spot)
|
|
261
|
+
if answer == "_show_spot":
|
|
262
|
+
return select_gpu_type_interactive(availability_info, show_spot=True)
|
|
313
263
|
return answer
|
|
314
264
|
except (KeyboardInterrupt, EOFError):
|
|
315
265
|
console.print("\n[yellow]Selection cancelled.[/yellow]")
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.7.1"
|
|
7
|
+
version = "0.7.3"
|
|
8
8
|
description = "CLI + Python SDK for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|