gpu-dev 0.7.6__tar.gz → 0.7.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_dev-0.7.11/.github/workflows/tests.yml +20 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/.gitignore +11 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/CLAUDE.md +89 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/PKG-INFO +6 -1
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +112 -34
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +26 -3
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +28 -18
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +13 -1
- gpu_dev-0.7.11/conftest.py +92 -0
- gpu_dev-0.7.11/docs/FAST_REPRO_DESIGN.md +141 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/PKG-INFO +6 -1
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/SOURCES.txt +55 -2
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/requires.txt +6 -0
- gpu_dev-0.7.11/post-may-2026.md +185 -0
- gpu_dev-0.7.11/presentation/CLAUDE.md +220 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/cli-demo.html +5 -5
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/gpu-fleet.html +5 -5
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/k8s-under-the-hood.html +8 -8
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/multinode.html +10 -10
- gpu_dev-0.7.11/presentation/pyproject.toml +33 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/sdk-demo.html +6 -6
- gpu_dev-0.7.11/presentation/teaser.html +317 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/pyproject.toml +17 -1
- gpu_dev-0.7.11/sdk/python/examples/parallel_experiments.ipynb +408 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_backend/aws.py +4 -1
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/availability.tf +2 -1
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/Dockerfile +18 -7
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/bashrc +9 -1
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/zshrc +5 -2
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/git-cache.tf +2 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/kubernetes.tf +7 -2
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/availability_updater/index.py +39 -3
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +11 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_processor/index.py +206 -13
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda.tf +16 -1
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/mig-parted-config.yaml +15 -0
- gpu_dev-0.7.11/terraform-gpu-devservers/pytorch-ondemand.tf +178 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/pytorch-prebuild.tf +47 -3
- gpu_dev-0.7.11/tests/integration/README.md +35 -0
- gpu_dev-0.7.11/tests/integration/__init__.py +0 -0
- gpu_dev-0.7.11/tests/integration/conftest.py +131 -0
- gpu_dev-0.7.11/tests/integration/test_claude.py +40 -0
- gpu_dev-0.7.11/tests/integration/test_cpu_lifecycle.py +36 -0
- gpu_dev-0.7.11/tests/integration/test_repro_known_failure.py +54 -0
- gpu_dev-0.7.11/tests/integration/test_t4_lifecycle.py +39 -0
- gpu_dev-0.7.11/tests/integration/test_warm_pool.py +54 -0
- gpu_dev-0.7.11/tests/unit/__init__.py +0 -0
- gpu_dev-0.7.11/tests/unit/cli/__init__.py +0 -0
- gpu_dev-0.7.11/tests/unit/cli/test_auth.py +442 -0
- gpu_dev-0.7.11/tests/unit/cli/test_avail.py +295 -0
- gpu_dev-0.7.11/tests/unit/cli/test_cancel.py +380 -0
- gpu_dev-0.7.11/tests/unit/cli/test_config_cmd.py +187 -0
- gpu_dev-0.7.11/tests/unit/cli/test_config_module.py +476 -0
- gpu_dev-0.7.11/tests/unit/cli/test_connect.py +373 -0
- gpu_dev-0.7.11/tests/unit/cli/test_disks.py +747 -0
- gpu_dev-0.7.11/tests/unit/cli/test_edit.py +321 -0
- gpu_dev-0.7.11/tests/unit/cli/test_interactive.py +489 -0
- gpu_dev-0.7.11/tests/unit/cli/test_list_show.py +547 -0
- gpu_dev-0.7.11/tests/unit/cli/test_name_generator.py +272 -0
- gpu_dev-0.7.11/tests/unit/cli/test_repro.py +454 -0
- gpu_dev-0.7.11/tests/unit/cli/test_reservations_mgr.py +593 -0
- gpu_dev-0.7.11/tests/unit/cli/test_reserve.py +394 -0
- gpu_dev-0.7.11/tests/unit/cli/test_smoke.py +12 -0
- gpu_dev-0.7.11/tests/unit/cli/test_ssh_alias.py +130 -0
- gpu_dev-0.7.11/tests/unit/cli/test_submit.py +401 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/__init__.py +0 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/test_availability.py +488 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/test_cancellation.py +355 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/test_claim.py +348 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/test_mig_gpu_config.py +598 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/test_pod_resources.py +255 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/test_ref_staging.py +292 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/test_smoke.py +12 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/test_version_gate.py +178 -0
- gpu_dev-0.7.11/tests/unit/lambda_fn/test_warm_pool.py +682 -0
- gpu_dev-0.7.11/tests/unit/sdk/__init__.py +0 -0
- gpu_dev-0.7.11/tests/unit/sdk/test_backend_aws.py +790 -0
- gpu_dev-0.7.11/tests/unit/sdk/test_client.py +519 -0
- gpu_dev-0.7.11/tests/unit/sdk/test_errors_enums.py +308 -0
- gpu_dev-0.7.11/tests/unit/sdk/test_models_extra.py +361 -0
- gpu_dev-0.7.11/tests/unit/sdk/test_sandbox.py +352 -0
- gpu_dev-0.7.11/tests/unit/sdk/test_sdk_config.py +258 -0
- gpu_dev-0.7.11/tests/unit/sdk/test_transport_ssh.py +327 -0
- gpu_dev-0.7.6/sdk/python/examples/parallel_experiments.ipynb +0 -362
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/README.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/admin/README.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/admin/generate_stats.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/admin/requirements.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/SDK_REPRO.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/architecture.html +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/devgpu-features.html +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/feedback.png +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/index.html +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/osdc-future-plans.html +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/problem.png +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/sandbox.html +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/thesis.html +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/title-vid.mp4 +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/weneedgpus.png +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/wow.html +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/README.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/batch_multi_gpu.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/interactive_debug.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/quickstart.ipynb +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/run_tests.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/submit_job.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/__init__.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/config.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/enums.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/errors.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/models.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/py.typed +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/tests/test_models.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/setup.cfg +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-deck/backend.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-deck/main.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-deck/terraform.tfvars.example +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/build-node.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/warm-pool.tf +0 -0
- {gpu_dev-0.7.6/sdk/python → gpu_dev-0.7.11}/tests/__init__.py +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/tests/submit/README.md +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.7.6 → gpu_dev-0.7.11}/tests/submit/success/run.sh +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
name: tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
unit:
|
|
9
|
+
name: unit + mocks
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- name: Install uv
|
|
14
|
+
uses: astral-sh/setup-uv@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
- name: Install package + test deps
|
|
18
|
+
run: uv pip install -e ".[test]"
|
|
19
|
+
- name: Run unit + mock tests (integration excluded)
|
|
20
|
+
run: uv run pytest -m "not integration"
|
|
@@ -73,3 +73,14 @@ lambda/*/package/
|
|
|
73
73
|
admin/output/
|
|
74
74
|
|
|
75
75
|
.claude/worktrees/
|
|
76
|
+
.claude/settings.local.json
|
|
77
|
+
.claude/scheduled_tasks.lock
|
|
78
|
+
|
|
79
|
+
# Org-specific (filled in locally; not committed)
|
|
80
|
+
docs/INTERNAL_AUTH.md
|
|
81
|
+
|
|
82
|
+
# Local scratch / staging terraform working dir
|
|
83
|
+
*.pid
|
|
84
|
+
terraform-gpu-devservers/staging/.terraform/
|
|
85
|
+
terraform-gpu-devservers/staging/__pycache__/
|
|
86
|
+
terraform-gpu-devservers/staging/*.log
|
|
@@ -28,6 +28,59 @@ For terraform, we use opentofu, don't ever run tf apply directly. You're free to
|
|
|
28
28
|
- Group imports in standard order: standard library, third-party, local imports
|
|
29
29
|
- Use absolute imports when possible
|
|
30
30
|
|
|
31
|
+
## Testing (DO THIS FOR EVERY CHANGE)
|
|
32
|
+
|
|
33
|
+
There is a real test suite now. **Every change must keep it green, and add/adjust
|
|
34
|
+
tests.** Two tiers:
|
|
35
|
+
|
|
36
|
+
**1. Unit + mocks — ALWAYS run, must stay green (CI runs this on every push/PR).**
|
|
37
|
+
Fully mocked (boto3 / k8s / SSH / subprocess), no network, ~2s.
|
|
38
|
+
```bash
|
|
39
|
+
uv pip install -e ".[test]" # one-time: pytest, moto, kubernetes
|
|
40
|
+
uv run pytest -m "not integration" # ~1140 tests; run before every commit
|
|
41
|
+
```
|
|
42
|
+
- Layout: `tests/unit/{sdk,cli,lambda_fn}/test_*.py`; shared fixtures in the root
|
|
43
|
+
`conftest.py` (`cli_runner`, `lambda_index` = the lambda imported as `index`
|
|
44
|
+
with env pre-set, `aws_mocks` = MagicMock boto3 handles).
|
|
45
|
+
- When you touch CLI / SDK / lambda code, update or add the matching `test_*.py`.
|
|
46
|
+
- CI: `.github/workflows/tests.yml`. Lambda imports need env vars + sys.path — the
|
|
47
|
+
root `conftest.py` already sets both.
|
|
48
|
+
|
|
49
|
+
**2. e2e integration on STAGING — run for anything touching the
|
|
50
|
+
reserve/pod/SSH/lambda path before merging.** Real reservations on the **staging**
|
|
51
|
+
cluster (us-west-1), cpu + t4 only, auto-cancelled. Staging is the DEFAULT target
|
|
52
|
+
and github_user comes from your config, so the bare command is enough:
|
|
53
|
+
```bash
|
|
54
|
+
uv run pytest -m integration --run-integration -v
|
|
55
|
+
```
|
|
56
|
+
- Staging is the default (`GPU_DEV_TEST_ENV` defaults to `staging` → us-west-1,
|
|
57
|
+
standard `pytorch-gpu-dev-*` prefix, tf workspace `default`). The integration
|
|
58
|
+
conftest pins the region so the unit-test us-east-2 default can't leak in. Wired
|
|
59
|
+
in `cli-tools/.../config.py` ENVIRONMENTS.
|
|
60
|
+
- Covers: cpu-x86 + t4 reserve→active→cancel, list-while-active, exec
|
|
61
|
+
(`nproc`/`nvidia-smi`/`torch.cuda`), **`claude -p` answers "Paris"** (pod Claude
|
|
62
|
+
Code/Bedrock), and the **warm pool** (fast warm claim + custom-image
|
|
63
|
+
warm-ineligibility). Each cancels in a `finally` (no leaked pods).
|
|
64
|
+
- Warm-pool tests need `WARM_POOL_TARGETS` deployed on staging — set in
|
|
65
|
+
`lambda.tf` for the `default` workspace (`{t4, cpu-x86, cpu-arm}`). Staging IS the
|
|
66
|
+
tf `default` workspace (us-west-1, environment=test) — there is no `test`/`staging`
|
|
67
|
+
workspace: `tofu workspace select default && tofu apply`. Until then the warm
|
|
68
|
+
tests skip ("came up cold"). Custom-image test: set `GPU_DEV_TEST_IMAGE`.
|
|
69
|
+
- Repro test (`test_repro_known_failure.py`): set `GPU_DEV_REPRO_REF` +
|
|
70
|
+
`GPU_DEV_REPRO_TEST` to a known-red (commit, test). Find one with the
|
|
71
|
+
**treehugger MCP** (`hud`, user-scope — `get_hud_data`/`master_commit_red`).
|
|
72
|
+
Note: prebuilt torch is h100/b200 arch, so a CUDA test on t4 needs a full build;
|
|
73
|
+
prefer a failure that runs on the box's GPU or on cpu.
|
|
74
|
+
- Skips cleanly if staging is unreachable or the runner has no outbound SSH (e.g. a
|
|
75
|
+
sandbox). The reservation role can run queries and use SQS but lacks `DescribeTable`, so the
|
|
76
|
+
reachability probe uses scan+get-queue-url, not describe.
|
|
77
|
+
- Validated live (2026-05-31): cpu + t4 lifecycle PASS; warm-claim test confirmed
|
|
78
|
+
it reaches the real reserve (skips until WARM_POOL_TARGETS is applied).
|
|
79
|
+
|
|
80
|
+
**Rule of thumb:** unit+mocks for *every* change; add e2e coverage when you add a
|
|
81
|
+
new command/flow; run the staging e2e before merging anything that could affect a
|
|
82
|
+
live reservation. Don't say "done/tested" without having run the relevant tier.
|
|
83
|
+
|
|
31
84
|
## Content
|
|
32
85
|
|
|
33
86
|
- torchci - a next.js app containing a PyTorch CI tracker
|
|
@@ -51,6 +104,42 @@ Currently we're working on a developer servers with GPUs in AWS. This means we'l
|
|
|
51
104
|
|
|
52
105
|
# AGENT SECTION
|
|
53
106
|
|
|
107
|
+
## Fast-repro redesign — by-SHA artifact cache + on-demand build (2026-06-01)
|
|
108
|
+
|
|
109
|
+
Goal: `gpu-dev repro <ref>` for any pytorch commit from the last ~72h lands a built,
|
|
110
|
+
importable tree in <2min. Design: `docs/FAST_REPRO_DESIGN.md`. **All merged to main**
|
|
111
|
+
(PRs #186–#189); **needs `tofu apply` (prod, workspace `prod`) + image rebuild**.
|
|
112
|
+
|
|
113
|
+
- **by-SHA artifact cache** (#186): whole *built* trees keyed by commit SHA at
|
|
114
|
+
`/ccache_shared/prebuilt/by-sha/<sha>.tar.{zst,gz}` (`.sha` written last = the
|
|
115
|
+
completion gate). Cron seeds one per viable/strict bump (hardlink, no extra space).
|
|
116
|
+
`stage-pytorch` (cold `--ref`) + `gpu-dev repro` consume on hit → `import torch`
|
|
117
|
+
with ZERO build. `repro` also publishes its in-pod build via `publish-pytorch-build`
|
|
118
|
+
(detached) so the cache fills from real usage. All paths safe-fallback on miss;
|
|
119
|
+
`ls-remote` is `timeout 15`.
|
|
120
|
+
- **retention** (#188): prebuild cron prunes by-sha entries >72h every tick (storage
|
|
121
|
+
budget ~500-650GB on the elastic ccache EFS). The by-sha set IS the snapshot ladder.
|
|
122
|
+
- **mold linker** (#187): Dockerfile installs `mold`; cron + in-pod repro build wrap
|
|
123
|
+
with `mold -run` (guarded on `command -v mold`). Drops the libtorch_cuda.so relink
|
|
124
|
+
~1-3min → ~15s. **Needs image rebuild** to activate (prod runs a stale image; that's
|
|
125
|
+
also why prod publishes gzip not zstd — the Dockerfile has zstd already).
|
|
126
|
+
- **on-demand build worker** (#189, `pytorch-ondemand.tf`): always-on Deployment on
|
|
127
|
+
NodeType=build drains `prebuilt/build-queue/<sha>.req` (own hostPath tree
|
|
128
|
+
`/mnt/ondemand-build` → builds at `/home/dev/pytorch` so build/ paths are
|
|
129
|
+
pod-compatible; mold+ccache), publishes by-sha, writes `.worker-alive` heartbeat.
|
|
130
|
+
`repro` enqueues + polls ONLY when the heartbeat is fresh (else straight to in-pod
|
|
131
|
+
build → zero regression if not deployed). Makes the FIRST repro of an uncached
|
|
132
|
+
commit fast. Coordination 100% via shared EFS — no new networking/RBAC/lambda.
|
|
133
|
+
- cuDNN fidelity (`USE_CUDNN=1`) DEFERRED — forcing it can fail the build if cuDNN
|
|
134
|
+
isn't found under cuda-13.2; needs prod e2e. Base image is cudnn9-devel.
|
|
135
|
+
- Fast path is **prod-arch only** (`sm_90;sm_100` = H100/B200); t4/staging is wrong-arch.
|
|
136
|
+
- Also: SSH alias now keys off reservation id not pod name (#185) so warm/repro pods
|
|
137
|
+
are reachable via `ssh gpu-dev-<resid>` / `connect` (routing is via the FQDN, the
|
|
138
|
+
alias is a local label). CCACHE_MAXSIZE settled at 250G (#184).
|
|
139
|
+
- Prod e2e: `gpu-dev repro <fresh-sha> <test> --gpu-type h100 --no-connect` (first =
|
|
140
|
+
off-pod build + stage; rerun = by-sha HIT zero build). Worker logs:
|
|
141
|
+
`k -n management logs deploy/pytorch-ondemand-builder -f`.
|
|
142
|
+
|
|
54
143
|
## Instant-sandboxes branch — WIP & things to fix (2026-05-29)
|
|
55
144
|
|
|
56
145
|
Big push on warm pools + instant claims + prebuilt pytorch. Tracking state here so it's not lost.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.7.6
|
|
3
|
+
Version: 0.7.11
|
|
4
4
|
Summary: CLI + Python SDK for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -15,6 +15,11 @@ Requires-Dist: questionary>=2.1.1
|
|
|
15
15
|
Requires-Dist: websockets>=12.0
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
|
+
Provides-Extra: test
|
|
19
|
+
Requires-Dist: pytest>=7.4; extra == "test"
|
|
20
|
+
Requires-Dist: pytest-cov>=4.1; extra == "test"
|
|
21
|
+
Requires-Dist: moto[dynamodb,ec2,sqs]>=5.0; extra == "test"
|
|
22
|
+
Requires-Dist: kubernetes>=28.1; extra == "test"
|
|
18
23
|
|
|
19
24
|
# GPU Developer CLI & SDK
|
|
20
25
|
|
|
@@ -319,6 +319,9 @@ def _show_single_reservation(connection_info: dict) -> None:
|
|
|
319
319
|
reservation_id = connection_info["reservation_id"]
|
|
320
320
|
reservation_name = connection_info.get("name")
|
|
321
321
|
pod_name = connection_info.get("pod_name", "")
|
|
322
|
+
# SSH host alias keys off the reservation id (works for warm-claimed pods,
|
|
323
|
+
# whose pod_name != gpu-dev-<resid8>). pod_name is shown separately below.
|
|
324
|
+
host_alias = f"gpu-dev-{short_id}"
|
|
322
325
|
ssh_config_path = get_ssh_config_path(reservation_id, reservation_name)
|
|
323
326
|
use_include = is_ssh_include_enabled()
|
|
324
327
|
|
|
@@ -328,14 +331,14 @@ def _show_single_reservation(connection_info: dict) -> None:
|
|
|
328
331
|
if use_include:
|
|
329
332
|
# User approved Include - show simple commands
|
|
330
333
|
from .reservations import _make_vscode_link
|
|
331
|
-
ssh_command_display = f"[green]ssh {
|
|
332
|
-
vscode_url = _make_vscode_link(
|
|
333
|
-
vscode_cmd_text = f"code --remote ssh-remote+{
|
|
334
|
+
ssh_command_display = f"[green]ssh {host_alias}[/green]"
|
|
335
|
+
vscode_url = _make_vscode_link(host_alias)
|
|
336
|
+
vscode_cmd_text = f"code --remote ssh-remote+{host_alias} /home/dev"
|
|
334
337
|
vscode_command_display = f"[link={vscode_url}][green]{vscode_cmd_text}[/green][/link]"
|
|
335
338
|
vscode_info = f"[blue]VS Code Remote:[/blue] {vscode_command_display}\n"
|
|
336
339
|
else:
|
|
337
340
|
# User declined Include - show commands with -F flag
|
|
338
|
-
ssh_command_display = f"[green]ssh -F {ssh_config_path} {
|
|
341
|
+
ssh_command_display = f"[green]ssh -F {ssh_config_path} {host_alias}[/green]"
|
|
339
342
|
vscode_command_display = f"Add [green]Include ~/.gpu-dev/*-sshconfig[/green] to ~/.ssh/config and ~/.cursor/ssh_config (or: [green]gpu-dev config ssh-include enable[/green])"
|
|
340
343
|
vscode_info = f"[blue]VS Code/Cursor:[/blue] {vscode_command_display}\n"
|
|
341
344
|
else:
|
|
@@ -1554,27 +1557,82 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
|
1554
1557
|
except RuntimeError as e:
|
|
1555
1558
|
rprint(f"[red]❌ {str(e)}[/red]"); return
|
|
1556
1559
|
|
|
1557
|
-
# ref
|
|
1560
|
+
# Resolve the ref in-pod -> WANT (sha, for the by-sha cache) + FREF (fetch ref).
|
|
1561
|
+
# A MERGED pr/N reproduces the actual squash/merge commit on main (the real trunk
|
|
1562
|
+
# state that was red) — NOT pull/N/merge (the PR re-applied onto *current* trunk,
|
|
1563
|
+
# which goes green once the fix lands). Open PRs keep pull/N/merge (= CI's view).
|
|
1558
1564
|
r = ref.strip(); prnum = None
|
|
1559
1565
|
if r.startswith("pr/"): prnum = r[3:]
|
|
1560
1566
|
elif r.startswith("#"): prnum = r[1:]
|
|
1561
1567
|
elif r.isdigit(): prnum = r
|
|
1568
|
+
gh = "https://github.com/pytorch/pytorch.git"
|
|
1562
1569
|
if prnum:
|
|
1563
|
-
|
|
1564
|
-
|
|
1570
|
+
api = f"https://api.github.com/repos/pytorch/pytorch/pulls/{prnum}"
|
|
1571
|
+
resolve = (
|
|
1572
|
+
f"PRJSON=$(curl -s -m 10 -H 'Accept: application/vnd.github+json' -H 'User-Agent: gpu-dev' {api} 2>/dev/null); "
|
|
1573
|
+
"MCS=$(printf '%s' \"$PRJSON\" | grep -oE '\"merge_commit_sha\": *\"[0-9a-f]+\"' | head -1 | cut -d'\"' -f4); "
|
|
1574
|
+
"if printf '%s' \"$PRJSON\" | grep -q '\"merged\": *true' && [ -n \"$MCS\" ]; then "
|
|
1575
|
+
f"WANT=\"$MCS\"; FREF=\"$MCS\"; echo \"[repro] pr/{prnum} is merged -> reproducing trunk commit $MCS\"; "
|
|
1576
|
+
f"else FREF=pull/{prnum}/merge; WANT=$(timeout 15 git ls-remote {gh} $FREF 2>/dev/null | head -1 | cut -f1); "
|
|
1577
|
+
f"[ -n \"$WANT\" ] || {{ FREF=pull/{prnum}/head; WANT=$(timeout 15 git ls-remote {gh} $FREF 2>/dev/null | head -1 | cut -f1); echo '[repro] open PR, no /merge -> /head'; }}; fi; ")
|
|
1565
1578
|
else:
|
|
1566
1579
|
rq = shlex.quote(r)
|
|
1567
|
-
|
|
1580
|
+
resolve = (f"FREF={rq}; WANT=$(timeout 15 git ls-remote {gh} {rq} 2>/dev/null | head -1 | cut -f1); "
|
|
1581
|
+
f"[ -n \"$WANT\" ] || case {rq} in *[!0-9a-fA-F]*) WANT= ;; *) WANT={rq} ;; esac; ")
|
|
1582
|
+
# in-pod fallback checkout (by-sha miss + farm unavailable): fetch the resolved ref,
|
|
1583
|
+
# else check out the sha directly (reachable for a merged-PR land commit / trunk).
|
|
1584
|
+
checkout = ("git fetch origin \"$FREF\" 2>/dev/null && git checkout -f FETCH_HEAD "
|
|
1585
|
+
"|| git checkout -f \"$WANT\" 2>/dev/null "
|
|
1586
|
+
"|| { git fetch --force origin 2>/dev/null && git checkout -f \"$WANT\"; }")
|
|
1568
1587
|
|
|
1569
1588
|
testcmd = " ".join(shlex.quote(a) for a in test_args)
|
|
1589
|
+
# by-sha artifact cache: if a fully-built tree for the resolved SHA already exists
|
|
1590
|
+
# (shared EFS, seeded by the build node + prior repros), stage it -> ZERO build.
|
|
1591
|
+
# Otherwise build, then publish the result so the next dev (anyone) gets it instant.
|
|
1570
1592
|
remote = (
|
|
1571
1593
|
"set -e; cd /home/dev/pytorch; "
|
|
1572
1594
|
"git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true; "
|
|
1573
|
-
|
|
1595
|
+
"BYSHA=/ccache_shared/prebuilt/by-sha; QUEUE=/ccache_shared/prebuilt/build-queue; HIT=; "
|
|
1596
|
+
# bs <sha>: stage a fully-built by-sha tree into /home/dev/pytorch (zero build); 0 on success.
|
|
1597
|
+
# explicit ext check, not a glob: the pod login shell is zsh, where an unmatched glob is a hard error.
|
|
1598
|
+
# require the .sha completion gate (written last) so we never stage a half-published tarball.
|
|
1599
|
+
"bs() { local s=\"$1\" tb=; [ -f \"$BYSHA/$s.sha\" ] || return 1; for e in zst gz; do [ -f \"$BYSHA/$s.tar.$e\" ] && { tb=\"$BYSHA/$s.tar.$e\"; break; }; done; [ -n \"$tb\" ] || return 1; "
|
|
1600
|
+
"rm -rf /home/dev/pytorch.new; mkdir -p /home/dev/pytorch.new; "
|
|
1601
|
+
"case \"$tb\" in *.zst) zstd -dc \"$tb\" 2>/dev/null | tar -C /home/dev/pytorch.new --strip-components=1 -xf - 2>/dev/null ;; "
|
|
1602
|
+
"*) tar -C /home/dev/pytorch.new --strip-components=1 -xzf \"$tb\" 2>/dev/null ;; esac; "
|
|
1603
|
+
"[ -d /home/dev/pytorch.new/.git ] || { rm -rf /home/dev/pytorch.new; return 1; }; "
|
|
1604
|
+
"rm -rf /home/dev/pytorch; mv /home/dev/pytorch.new /home/dev/pytorch; return 0; }; "
|
|
1605
|
+
+ resolve +
|
|
1606
|
+
"echo \"[repro] target ${WANT:-?}\"; "
|
|
1607
|
+
# 1) already cached -> stage it (zero build)
|
|
1608
|
+
"if [ -n \"$WANT\" ] && bs \"$WANT\"; then cd /home/dev/pytorch; HIT=1; echo '[repro] by-sha cache HIT -> staged prebuilt tree (zero build)'; fi; "
|
|
1609
|
+
# 2) not cached, build farm alive -> request an off-pod build, wait, then stage
|
|
1610
|
+
"if [ -z \"$HIT\" ] && [ -n \"$WANT\" ] && [ -n \"$(find \"$QUEUE/.worker-alive\" -mmin -2 2>/dev/null)\" ]; then "
|
|
1611
|
+
"echo \"[repro] no cached build; requesting off-pod build of $WANT (build farm; streaming progress)…\"; printf '%s\\n' \"$FREF\" > \"$QUEUE/$WANT.req\" 2>/dev/null || true; "
|
|
1612
|
+
# poll for the artifact; meanwhile tail the farm's build log (ninja [x/N]) so it's not a silent hang.
|
|
1613
|
+
"i=0; LL=0; while [ $i -lt 400 ]; do [ -f \"$BYSHA/$WANT.sha\" ] && break; [ -f \"$QUEUE/$WANT.req\" ] || break; "
|
|
1614
|
+
"if [ -f \"$QUEUE/$WANT.log\" ]; then NL=$(wc -l < \"$QUEUE/$WANT.log\" 2>/dev/null || echo 0); "
|
|
1615
|
+
"if [ \"$NL\" -gt \"$LL\" ]; then tail -n +$((LL+1)) \"$QUEUE/$WANT.log\" 2>/dev/null | grep -aE '\\[[0-9]+/[0-9]+\\]|Building wheel|Successfully built|error' | tail -1 | sed 's/^/ [farm] /'; LL=$NL; fi; fi; "
|
|
1616
|
+
"sleep 3; i=$((i+1)); done; "
|
|
1617
|
+
"if bs \"$WANT\"; then cd /home/dev/pytorch; HIT=1; echo '[repro] off-pod build ready -> staged (zero build)'; else echo '[repro] off-pod build unavailable, building locally'; fi; fi; "
|
|
1618
|
+
# 3) fall back to in-pod fetch + build (+ cache the result for the next dev)
|
|
1619
|
+
"if [ -z \"$HIT\" ]; then "
|
|
1620
|
+
"echo \"[repro] checking out $FREF\"; " + checkout + "; "
|
|
1574
1621
|
"echo \"[repro] HEAD $(git rev-parse --short HEAD)\"; "
|
|
1575
1622
|
"git -c protocol.file.allow=always submodule update --init --recursive --jobs 8 >/dev/null 2>&1 || true; "
|
|
1576
1623
|
"if ! PYTHONPATH=/home/dev/pytorch python -c 'import torch' 2>/dev/null; then "
|
|
1577
|
-
"echo
|
|
1624
|
+
"echo \"[repro] prebuilt torch != this commit -> rebuilding (ccache-accelerated, but the further this commit is from viable/strict, the more recompiles). checked-out: $(git log -1 --format='%h %ci')\"; "
|
|
1625
|
+
# mold -run routes the libtorch_cuda.so relink through mold (~15s vs minutes); guarded.
|
|
1626
|
+
# Explicit if/else (not `$M pip`): the pod login shell is zsh, which doesn't word-split
|
|
1627
|
+
# unquoted vars. -v streams the cmake/ninja [x/N] progress instead of pip's blind spinner.
|
|
1628
|
+
"if command -v mold >/dev/null 2>&1; then mold -run pip install --break-system-packages -e . --no-build-isolation -v; "
|
|
1629
|
+
"else pip install --break-system-packages -e . --no-build-isolation -v; fi; fi; "
|
|
1630
|
+
# cache this build for the next dev (detached so it survives the ssh session)
|
|
1631
|
+
"SHA=$(git rev-parse HEAD 2>/dev/null); "
|
|
1632
|
+
"if command -v publish-pytorch-build >/dev/null 2>&1 && [ -n \"$SHA\" ] && [ ! -f \"$BYSHA/$SHA.sha\" ]; then "
|
|
1633
|
+
"echo '[repro] caching this build (by-sha) for next time…'; "
|
|
1634
|
+
"setsid publish-pytorch-build \"$SHA\" >/dev/null 2>&1 < /dev/null & fi; "
|
|
1635
|
+
"fi; "
|
|
1578
1636
|
f"echo '[repro] running: python {testcmd}'; "
|
|
1579
1637
|
f"PYTHONPATH=/home/dev/pytorch python {testcmd}"
|
|
1580
1638
|
)
|
|
@@ -1879,7 +1937,9 @@ def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dock
|
|
|
1879
1937
|
sys.exit(1)
|
|
1880
1938
|
create_ssh_config_for_reservation(master_fqdn, master_pod, master_id, master_name)
|
|
1881
1939
|
|
|
1882
|
-
|
|
1940
|
+
# Host alias matches the Host line written by create_ssh_config_for_reservation
|
|
1941
|
+
# (keyed off the reservation id, so warm-claimed masters resolve too).
|
|
1942
|
+
ssh_alias = f"gpu-dev-{master_id[:8]}"
|
|
1883
1943
|
ssh_base = ["ssh", "-F", str(config_file), "-o", "StrictHostKeyChecking=accept-new"]
|
|
1884
1944
|
rsync_e = " ".join(shlex.quote(x) for x in ssh_base)
|
|
1885
1945
|
|
|
@@ -3166,11 +3226,15 @@ def _show_direct_success(res: dict, elapsed: float) -> None:
|
|
|
3166
3226
|
"""Print the success block for an instant warm-pool claim,
|
|
3167
3227
|
matching the normal reserve output (SSH config + VS Code/Cursor remote)."""
|
|
3168
3228
|
from gpu_dev_cli.reservations import (
|
|
3169
|
-
create_ssh_config_for_reservation, _generate_vscode_command,
|
|
3229
|
+
create_ssh_config_for_reservation, _generate_vscode_command,
|
|
3230
|
+
_generate_cursor_command, _make_vscode_link, _make_cursor_link)
|
|
3170
3231
|
rid = res.get("reservation_id", "") or ""
|
|
3171
3232
|
ssh_command = res.get("ssh_command", "") or ""
|
|
3172
3233
|
pod_name = res.get("pod_name", "") or ""
|
|
3173
3234
|
fqdn = res.get("fqdn") or ""
|
|
3235
|
+
# Host alias keys off the reservation id — warm-claimed pods have a pod_name
|
|
3236
|
+
# that is NOT gpu-dev-<resid8>, so we must not use pod_name as the ssh alias.
|
|
3237
|
+
host_alias = f"gpu-dev-{rid[:8]}" if rid else pod_name
|
|
3174
3238
|
|
|
3175
3239
|
rprint(f"\n[green]✅ Instant reservation ready in {elapsed:.1f}s![/green]")
|
|
3176
3240
|
rprint(f"[bold]📋 Reservation ID:[/bold] {rid}")
|
|
@@ -3179,24 +3243,28 @@ def _show_direct_success(res: dict, elapsed: float) -> None:
|
|
|
3179
3243
|
if rid:
|
|
3180
3244
|
rprint(f"[bold]⚡ Quick Connect:[/bold] gpu-dev connect {rid[:8]}")
|
|
3181
3245
|
|
|
3182
|
-
# Build the per-reservation SSH config so `ssh
|
|
3246
|
+
# Build the per-reservation SSH config so `ssh gpu-dev-<resid8>` and connect work cleanly.
|
|
3183
3247
|
use_include = False
|
|
3184
3248
|
if fqdn and pod_name and rid:
|
|
3185
3249
|
try:
|
|
3186
3250
|
_cfg, use_include = create_ssh_config_for_reservation(fqdn, pod_name, rid, None)
|
|
3187
3251
|
except Exception:
|
|
3188
3252
|
pass
|
|
3189
|
-
if
|
|
3190
|
-
rprint(f"[bold]🖥️ SSH Command:[/bold] ssh {
|
|
3191
|
-
|
|
3192
|
-
|
|
3193
|
-
|
|
3194
|
-
|
|
3195
|
-
|
|
3196
|
-
|
|
3197
|
-
|
|
3198
|
-
|
|
3199
|
-
|
|
3253
|
+
if use_include and rid:
|
|
3254
|
+
rprint(f"[bold]🖥️ SSH Command:[/bold] ssh {host_alias}")
|
|
3255
|
+
vscode_url = _make_vscode_link(host_alias)
|
|
3256
|
+
cursor_url = _make_cursor_link(host_alias)
|
|
3257
|
+
rprint(f"[bold]💻 VS Code Remote:[/bold] [link={vscode_url}]code --remote ssh-remote+{host_alias} /home/dev[/link]")
|
|
3258
|
+
rprint(f"[bold]🖥️ Cursor Remote:[/bold] [link={cursor_url}]cursor --remote ssh-remote+{host_alias} /home/dev[/link]")
|
|
3259
|
+
else:
|
|
3260
|
+
if ssh_command:
|
|
3261
|
+
rprint(f"[bold]🖥️ SSH Command:[/bold] {ssh_command}")
|
|
3262
|
+
vsc = _generate_vscode_command(ssh_command) if ssh_command else None
|
|
3263
|
+
cur = _generate_cursor_command(ssh_command) if ssh_command else None
|
|
3264
|
+
if vsc:
|
|
3265
|
+
rprint(f"[bold]💻 VS Code Remote:[/bold] {vsc}")
|
|
3266
|
+
if cur:
|
|
3267
|
+
rprint(f"[bold]🖥️ Cursor Remote:[/bold] {cur}")
|
|
3200
3268
|
|
|
3201
3269
|
|
|
3202
3270
|
def _format_gpu_display(gpu_count, gpu_type):
|
|
@@ -3385,15 +3453,22 @@ def _show_availability(show_spot: bool = False) -> None:
|
|
|
3385
3453
|
spot_table = Table(title="⚡ Spot Instances (us-east-1, ~70% cheaper)")
|
|
3386
3454
|
spot_table.add_column("GPU Type", style="cyan")
|
|
3387
3455
|
spot_table.add_column("Avail\nNow", style="green")
|
|
3456
|
+
spot_table.add_column("In\nUse", style="yellow")
|
|
3388
3457
|
spot_table.add_column("Per\nNode", style="bright_green")
|
|
3389
3458
|
spot_table.add_column("Status", style="magenta")
|
|
3390
3459
|
spot_table.add_column("Spot Discount", style="dim")
|
|
3391
3460
|
_on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
|
|
3392
3461
|
for gt, info in sorted(spot_region_info.items()):
|
|
3393
3462
|
avail = info.get("available", 0)
|
|
3463
|
+
total = info.get("total", 0)
|
|
3464
|
+
in_use = max(0, total - avail) # GPUs on up spot nodes already taken
|
|
3394
3465
|
per_node = spot_gpus_per_node.get(gt, 8)
|
|
3395
3466
|
avail_display = f"[green]{avail}[/green]" if avail > 0 else f"[dim]0[/dim]"
|
|
3396
|
-
|
|
3467
|
+
in_use_display = f"[yellow]{in_use}[/yellow]" if in_use > 0 else f"[dim]0[/dim]"
|
|
3468
|
+
if in_use > 0:
|
|
3469
|
+
status = "[yellow]Node up (in use)[/yellow]" if avail == 0 else "[green]Node up[/green]"
|
|
3470
|
+
else:
|
|
3471
|
+
status = "[green]Node up[/green]" if avail > 0 else "Spins up on reserve (~10 min)"
|
|
3397
3472
|
si = info.get("spot_info", {}) or {}
|
|
3398
3473
|
sp = si.get("spot_price", "") if isinstance(si, dict) else ""
|
|
3399
3474
|
if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
|
|
@@ -3405,7 +3480,7 @@ def _show_availability(show_spot: bool = False) -> None:
|
|
|
3405
3480
|
avail_signal = f"[green]{pct}% off on-demand[/green]" if pct > 0 else "[dim]At on-demand price[/dim]"
|
|
3406
3481
|
except (ValueError, TypeError):
|
|
3407
3482
|
avail_signal = "[yellow]Unknown[/yellow]"
|
|
3408
|
-
spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
|
|
3483
|
+
spot_table.add_row(f"{gt.upper()} *", avail_display, in_use_display, str(per_node), status, avail_signal)
|
|
3409
3484
|
console.print(spot_table)
|
|
3410
3485
|
rprint("[dim]* = spot: ~70% cheaper, AWS can reclaim with 2-min notice, fulfillment not guaranteed.[/dim]")
|
|
3411
3486
|
rprint("[dim] Separate cluster (us-east-1) with separate disks. Select via gpu-dev reserve (interactive).[/dim]")
|
|
@@ -3779,7 +3854,8 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3779
3854
|
for node in nodes:
|
|
3780
3855
|
status_display = "✅ Active" if node.get("status") == "active" else f"⏳ {node.get('status', 'unknown')}"
|
|
3781
3856
|
pod_name = node.get("pod_name", "unknown")
|
|
3782
|
-
|
|
3857
|
+
node_rid = node.get("reservation_id")
|
|
3858
|
+
ssh_cmd_short = f"ssh gpu-dev-{node_rid[:8]}" if node_rid else "N/A"
|
|
3783
3859
|
|
|
3784
3860
|
table.add_row(
|
|
3785
3861
|
f"Node {node.get('node_index', 0) + 1}",
|
|
@@ -4036,10 +4112,11 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
|
|
|
4036
4112
|
)
|
|
4037
4113
|
|
|
4038
4114
|
if config_path:
|
|
4115
|
+
node_alias = f"gpu-dev-{node_res_id[:8]}"
|
|
4039
4116
|
if use_include:
|
|
4040
|
-
rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh {
|
|
4117
|
+
rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh {node_alias}[/cyan]")
|
|
4041
4118
|
else:
|
|
4042
|
-
rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh -F {config_path} {
|
|
4119
|
+
rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh -F {config_path} {node_alias}[/cyan]")
|
|
4043
4120
|
else:
|
|
4044
4121
|
rprint(f"[yellow]⚠️ Node {node_idx + 1}: Failed to create SSH config[/yellow]")
|
|
4045
4122
|
|
|
@@ -4067,12 +4144,13 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
|
|
|
4067
4144
|
)
|
|
4068
4145
|
|
|
4069
4146
|
if config_path:
|
|
4147
|
+
host_alias = f"gpu-dev-{reservation_id[:8]}"
|
|
4070
4148
|
rprint(f"[green]✅ SSH config created:[/green] [cyan]{config_path}[/cyan]\n")
|
|
4071
4149
|
if use_include:
|
|
4072
|
-
rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh {
|
|
4150
|
+
rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh {host_alias}[/cyan]")
|
|
4073
4151
|
rprint(f"[dim] or:[/dim] [cyan]gpu-dev connect {reservation_id[:8]}[/cyan]")
|
|
4074
4152
|
else:
|
|
4075
|
-
rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh -F {config_path} {
|
|
4153
|
+
rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh -F {config_path} {host_alias}[/cyan]")
|
|
4076
4154
|
rprint(f"[dim] or:[/dim] [cyan]gpu-dev connect {reservation_id[:8]}[/cyan]")
|
|
4077
4155
|
else:
|
|
4078
4156
|
rprint("[red]❌ Failed to create SSH config[/red]")
|
|
@@ -4639,13 +4717,13 @@ def ssh_include(action: str):
|
|
|
4639
4717
|
|
|
4640
4718
|
\b
|
|
4641
4719
|
When enabled:
|
|
4642
|
-
• Simple SSH commands: ssh
|
|
4643
|
-
• VS Code Remote works: code --remote ssh-remote
|
|
4720
|
+
• Simple SSH commands: ssh gpu-dev-<reservation-id>
|
|
4721
|
+
• VS Code Remote works: code --remote ssh-remote+gpu-dev-<reservation-id>
|
|
4644
4722
|
• Cursor Remote works: Open Remote SSH in Cursor
|
|
4645
4723
|
|
|
4646
4724
|
\b
|
|
4647
4725
|
When disabled:
|
|
4648
|
-
• Need -F flag: ssh -F ~/.gpu-dev/<id>-sshconfig
|
|
4726
|
+
• Need -F flag: ssh -F ~/.gpu-dev/<id>-sshconfig gpu-dev-<reservation-id>
|
|
4649
4727
|
• VS Code/Cursor requires manual config setup
|
|
4650
4728
|
|
|
4651
4729
|
\b
|
|
@@ -29,6 +29,15 @@ class Config:
|
|
|
29
29
|
"description": "Spot-only us-east-1 environment (T4/L4/CPU)",
|
|
30
30
|
"spot_types": ["b300", "b200", "h200", "h100", "a100", "t4", "l4", "rtxpro6000"],
|
|
31
31
|
},
|
|
32
|
+
# Staging (us-west-1, tf "default" workspace, environment=test). Same
|
|
33
|
+
# standard resource prefix as prod, just a different region — so only the
|
|
34
|
+
# region changes. Live capacity: cpu-x86/arm + t4. Used for integration
|
|
35
|
+
# tests. Select via `GPU_DEV_ENVIRONMENT=staging` (or the "test" env alias).
|
|
36
|
+
"staging": {
|
|
37
|
+
"region": "us-west-1",
|
|
38
|
+
"workspace": "default",
|
|
39
|
+
"description": "Staging (us-west-1, cpu + t4)",
|
|
40
|
+
},
|
|
32
41
|
}
|
|
33
42
|
DEFAULT_ENVIRONMENT = "prod"
|
|
34
43
|
|
|
@@ -43,19 +52,33 @@ class Config:
|
|
|
43
52
|
# Load unified config (handles migration from legacy files)
|
|
44
53
|
self.user_config = self._load_config()
|
|
45
54
|
|
|
46
|
-
#
|
|
55
|
+
# Active environment: GPU_DEV_ENVIRONMENT env wins (handy for tests/CI),
|
|
56
|
+
# then the persisted config, then the default. Its region/prefix back the
|
|
57
|
+
# fallbacks below so e.g. `GPU_DEV_ENVIRONMENT=staging` reaches us-west-1.
|
|
58
|
+
env_override = os.getenv("GPU_DEV_ENVIRONMENT")
|
|
59
|
+
env_name = env_override or self.user_config.get(
|
|
60
|
+
"environment", self.DEFAULT_ENVIRONMENT)
|
|
61
|
+
env_cfg = self.ENVIRONMENTS.get(env_name, {})
|
|
62
|
+
|
|
63
|
+
# Get region: AWS_* env vars win when they differ from the persisted region (for spot routing); then an
|
|
64
|
+
# explicit GPU_DEV_ENVIRONMENT switch uses that env's region (beating the
|
|
65
|
+
# persisted one); then the persisted config; then the env's region; default.
|
|
47
66
|
env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
|
|
48
67
|
if env_region and env_region != self.user_config.get("region"):
|
|
49
68
|
self.aws_region = env_region
|
|
69
|
+
elif env_override and env_cfg.get("region"):
|
|
70
|
+
self.aws_region = env_cfg["region"]
|
|
50
71
|
elif self.user_config.get("region"):
|
|
51
72
|
self.aws_region = self.user_config["region"]
|
|
73
|
+
elif env_cfg.get("region"):
|
|
74
|
+
self.aws_region = env_cfg["region"]
|
|
52
75
|
else:
|
|
53
76
|
self.aws_region = "us-east-2"
|
|
54
77
|
|
|
55
78
|
os.environ["AWS_DEFAULT_REGION"] = self.aws_region
|
|
56
79
|
|
|
57
|
-
# Resource naming convention -
|
|
58
|
-
self.prefix = "pytorch-gpu-dev"
|
|
80
|
+
# Resource naming convention — per-environment prefix (default for prod).
|
|
81
|
+
self.prefix = env_cfg.get("prefix", "pytorch-gpu-dev")
|
|
59
82
|
|
|
60
83
|
# Construct ARNs from convention
|
|
61
84
|
self.queue_name = f"{self.prefix}-reservation-queue"
|
|
@@ -177,12 +177,14 @@ def _generate_cursor_command(ssh_command: str) -> Optional[str]:
|
|
|
177
177
|
return None
|
|
178
178
|
|
|
179
179
|
|
|
180
|
-
def _generate_ssh_config(hostname: str,
|
|
180
|
+
def _generate_ssh_config(hostname: str, host_alias: str) -> str:
|
|
181
181
|
"""Generate SSH config for a reservation
|
|
182
182
|
|
|
183
183
|
Args:
|
|
184
|
-
hostname: The FQDN hostname (e.g., old_bison.devservers.io)
|
|
185
|
-
|
|
184
|
+
hostname: The FQDN hostname (e.g., old_bison.devservers.io). SSH routing
|
|
185
|
+
happens via this HostName (the ProxyCommand routes on the FQDN), so
|
|
186
|
+
host_alias is a purely local label.
|
|
187
|
+
host_alias: The local SSH host alias (e.g., gpu-dev-<resid8>)
|
|
186
188
|
|
|
187
189
|
Returns:
|
|
188
190
|
SSH config content as string
|
|
@@ -196,7 +198,7 @@ def _generate_ssh_config(hostname: str, pod_name: str) -> str:
|
|
|
196
198
|
extra = " AddKeysToAgent yes\n"
|
|
197
199
|
if sys.platform == "darwin":
|
|
198
200
|
extra += " IgnoreUnknown UseKeychain\n UseKeychain yes\n"
|
|
199
|
-
config_content = f"""Host {
|
|
201
|
+
config_content = f"""Host {host_alias}
|
|
200
202
|
HostName {hostname}
|
|
201
203
|
User dev
|
|
202
204
|
ForwardAgent yes
|
|
@@ -255,10 +257,10 @@ def _check_ssh_config_permission() -> bool:
|
|
|
255
257
|
console.print("[dim] • ~/.cursor/ssh_config[/dim]")
|
|
256
258
|
console.print("[dim]Line added: Include ~/.gpu-dev/*-sshconfig[/dim]\n")
|
|
257
259
|
console.print("[green]Benefits:[/green]")
|
|
258
|
-
console.print(" • Simple commands: [green]ssh
|
|
259
|
-
console.print(" • VS Code Remote works: [green]code --remote ssh-remote
|
|
260
|
+
console.print(" • Simple commands: [green]ssh gpu-dev-<reservation-id>[/green]")
|
|
261
|
+
console.print(" • VS Code Remote works: [green]code --remote ssh-remote+gpu-dev-<reservation-id>[/green]")
|
|
260
262
|
console.print(" • Cursor Remote works: Open Remote SSH in Cursor")
|
|
261
|
-
console.print("\n[dim]Without this, you'll need to use: [green]ssh -F ~/.gpu-dev/<id>-sshconfig
|
|
263
|
+
console.print("\n[dim]Without this, you'll need to use: [green]ssh -F ~/.gpu-dev/<id>-sshconfig gpu-dev-<reservation-id>[/green][/dim]")
|
|
262
264
|
console.print("[yellow]━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[/yellow]\n")
|
|
263
265
|
|
|
264
266
|
approved = click.confirm("Add Include directive to SSH config files?", default=True)
|
|
@@ -326,7 +328,8 @@ def create_ssh_config_for_reservation(hostname: str, pod_name: str, reservation_
|
|
|
326
328
|
|
|
327
329
|
Args:
|
|
328
330
|
hostname: The FQDN hostname (e.g., old_bison.devservers.io)
|
|
329
|
-
pod_name: The pod name
|
|
331
|
+
pod_name: The k8s pod name (kept for API compat; no longer used for the
|
|
332
|
+
host alias — warm-claimed pods have a pod_name != gpu-dev-<resid8>)
|
|
330
333
|
reservation_id: The reservation ID (full or short)
|
|
331
334
|
name: Optional reservation name to use for filename (falls back to short ID)
|
|
332
335
|
|
|
@@ -346,8 +349,12 @@ def create_ssh_config_for_reservation(hostname: str, pod_name: str, reservation_
|
|
|
346
349
|
short_id = reservation_id[:8]
|
|
347
350
|
filename = f"{short_id}-sshconfig"
|
|
348
351
|
|
|
352
|
+
# Key the host alias off the reservation id (not pod_name) so warm-claimed pods,
|
|
353
|
+
# whose pod_name differs from gpu-dev-<resid8>, are still reachable as gpu-dev-<resid8>.
|
|
354
|
+
host_alias = f"gpu-dev-{short_id}"
|
|
355
|
+
|
|
349
356
|
config_file = gpu_dev_dir / filename
|
|
350
|
-
config_content = _generate_ssh_config(hostname,
|
|
357
|
+
config_content = _generate_ssh_config(hostname, host_alias)
|
|
351
358
|
|
|
352
359
|
try:
|
|
353
360
|
config_file.write_text(config_content)
|
|
@@ -2220,10 +2227,11 @@ class ReservationManager:
|
|
|
2220
2227
|
console.print(
|
|
2221
2228
|
f"[yellow]⚠️ Could not create SSH config for node {node['index']+1}: {str(e)}[/yellow]")
|
|
2222
2229
|
|
|
2223
|
-
# Show connection info
|
|
2230
|
+
# Show connection info (alias keys off the reservation id)
|
|
2231
|
+
node_alias = f"gpu-dev-{res_id[:8]}" if res_id else pod_name
|
|
2224
2232
|
if config_path and pod_name and use_include:
|
|
2225
2233
|
console.print(
|
|
2226
|
-
f"[cyan]🖥️ Node {node['index']+1}:[/cyan] [green]ssh {
|
|
2234
|
+
f"[cyan]🖥️ Node {node['index']+1}:[/cyan] [green]ssh {node_alias}[/green]")
|
|
2227
2235
|
else:
|
|
2228
2236
|
ssh_command = res.get(
|
|
2229
2237
|
"ssh_command", "ssh user@pending")
|
|
@@ -2321,27 +2329,29 @@ class ReservationManager:
|
|
|
2321
2329
|
console.print(
|
|
2322
2330
|
f"[yellow]⚠️ Could not create SSH config: {str(e)}[/yellow]")
|
|
2323
2331
|
|
|
2324
|
-
# Show SSH command using config file if created, otherwise fallback
|
|
2332
|
+
# Show SSH command using config file if created, otherwise fallback.
|
|
2333
|
+
# Alias keys off the reservation id (works for warm-claimed pods too).
|
|
2334
|
+
host_alias = f"gpu-dev-{short_id}"
|
|
2325
2335
|
if config_path and pod_name:
|
|
2326
2336
|
if use_include:
|
|
2327
2337
|
# User approved Include - show simple commands
|
|
2328
2338
|
console.print(
|
|
2329
|
-
f"[cyan]🖥️ SSH Command:[/cyan] [green]ssh {
|
|
2339
|
+
f"[cyan]🖥️ SSH Command:[/cyan] [green]ssh {host_alias}[/green]")
|
|
2330
2340
|
# Create clickable VS Code link
|
|
2331
|
-
vscode_url = _make_vscode_link(
|
|
2332
|
-
vscode_command = f"code --remote ssh-remote+{
|
|
2341
|
+
vscode_url = _make_vscode_link(host_alias)
|
|
2342
|
+
vscode_command = f"code --remote ssh-remote+{host_alias} /home/dev"
|
|
2333
2343
|
console.print(
|
|
2334
2344
|
f"[cyan]💻 VS Code Remote:[/cyan] [link={vscode_url}][green]{vscode_command}[/green][/link]")
|
|
2335
2345
|
|
|
2336
2346
|
# Create clickable Cursor link
|
|
2337
|
-
cursor_url = _make_cursor_link(
|
|
2338
|
-
cursor_command = f"cursor --remote ssh-remote+{
|
|
2347
|
+
cursor_url = _make_cursor_link(host_alias)
|
|
2348
|
+
cursor_command = f"cursor --remote ssh-remote+{host_alias} /home/dev"
|
|
2339
2349
|
console.print(
|
|
2340
2350
|
f"[cyan]🖥️ Cursor Remote:[/cyan] [link={cursor_url}][green]{cursor_command}[/green][/link]")
|
|
2341
2351
|
else:
|
|
2342
2352
|
# User declined Include - show commands with -F flag
|
|
2343
2353
|
console.print(
|
|
2344
|
-
f"[cyan]🖥️ SSH Command:[/cyan] [green]ssh -F {config_path} {
|
|
2354
|
+
f"[cyan]🖥️ SSH Command:[/cyan] [green]ssh -F {config_path} {host_alias}[/green]")
|
|
2345
2355
|
console.print(
|
|
2346
2356
|
f"[cyan]💻 VS Code/Cursor:[/cyan] Add [green]Include ~/.gpu-dev/*-sshconfig[/green] to ~/.ssh/config and ~/.cursor/ssh_config")
|
|
2347
2357
|
console.print(
|