gpu-dev 0.7.5__tar.gz → 0.7.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_dev-0.7.10/.github/workflows/tests.yml +20 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/.gitignore +11 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/CLAUDE.md +89 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/PKG-INFO +6 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +165 -45
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +26 -3
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +28 -18
- gpu_dev-0.7.10/conftest.py +92 -0
- gpu_dev-0.7.10/docs/FAST_REPRO_DESIGN.md +141 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/SDK_REPRO.md +47 -4
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/PKG-INFO +6 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/SOURCES.txt +55 -2
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/requires.txt +6 -0
- gpu_dev-0.7.10/post-may-2026.md +185 -0
- gpu_dev-0.7.10/presentation/CLAUDE.md +220 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/cli-demo.html +5 -5
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/gpu-fleet.html +5 -5
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/k8s-under-the-hood.html +8 -8
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/multinode.html +10 -10
- gpu_dev-0.7.10/presentation/pyproject.toml +33 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/sdk-demo.html +6 -6
- gpu_dev-0.7.10/presentation/teaser.html +317 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/pyproject.toml +17 -1
- gpu_dev-0.7.10/sdk/python/examples/parallel_experiments.ipynb +408 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/__init__.py +1 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_backend/aws.py +4 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/availability.tf +2 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/Dockerfile +18 -7
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/bashrc +9 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/zshrc +5 -2
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/git-cache.tf +2 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/kubernetes.tf +7 -2
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/availability_updater/index.py +39 -3
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +11 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_processor/index.py +216 -21
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda.tf +16 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/mig-parted-config.yaml +15 -0
- gpu_dev-0.7.10/terraform-gpu-devservers/pytorch-ondemand.tf +178 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/pytorch-prebuild.tf +47 -3
- gpu_dev-0.7.10/tests/integration/README.md +35 -0
- gpu_dev-0.7.10/tests/integration/__init__.py +0 -0
- gpu_dev-0.7.10/tests/integration/conftest.py +131 -0
- gpu_dev-0.7.10/tests/integration/test_claude.py +40 -0
- gpu_dev-0.7.10/tests/integration/test_cpu_lifecycle.py +36 -0
- gpu_dev-0.7.10/tests/integration/test_repro_known_failure.py +54 -0
- gpu_dev-0.7.10/tests/integration/test_t4_lifecycle.py +39 -0
- gpu_dev-0.7.10/tests/integration/test_warm_pool.py +54 -0
- gpu_dev-0.7.10/tests/unit/__init__.py +0 -0
- gpu_dev-0.7.10/tests/unit/cli/__init__.py +0 -0
- gpu_dev-0.7.10/tests/unit/cli/test_auth.py +442 -0
- gpu_dev-0.7.10/tests/unit/cli/test_avail.py +295 -0
- gpu_dev-0.7.10/tests/unit/cli/test_cancel.py +380 -0
- gpu_dev-0.7.10/tests/unit/cli/test_config_cmd.py +187 -0
- gpu_dev-0.7.10/tests/unit/cli/test_config_module.py +476 -0
- gpu_dev-0.7.10/tests/unit/cli/test_connect.py +373 -0
- gpu_dev-0.7.10/tests/unit/cli/test_disks.py +747 -0
- gpu_dev-0.7.10/tests/unit/cli/test_edit.py +321 -0
- gpu_dev-0.7.10/tests/unit/cli/test_interactive.py +489 -0
- gpu_dev-0.7.10/tests/unit/cli/test_list_show.py +547 -0
- gpu_dev-0.7.10/tests/unit/cli/test_name_generator.py +272 -0
- gpu_dev-0.7.10/tests/unit/cli/test_repro.py +454 -0
- gpu_dev-0.7.10/tests/unit/cli/test_reservations_mgr.py +593 -0
- gpu_dev-0.7.10/tests/unit/cli/test_reserve.py +394 -0
- gpu_dev-0.7.10/tests/unit/cli/test_smoke.py +12 -0
- gpu_dev-0.7.10/tests/unit/cli/test_ssh_alias.py +130 -0
- gpu_dev-0.7.10/tests/unit/cli/test_submit.py +401 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/__init__.py +0 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/test_availability.py +488 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/test_cancellation.py +355 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/test_claim.py +348 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/test_mig_gpu_config.py +598 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/test_pod_resources.py +255 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/test_ref_staging.py +292 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/test_smoke.py +12 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/test_version_gate.py +178 -0
- gpu_dev-0.7.10/tests/unit/lambda_fn/test_warm_pool.py +682 -0
- gpu_dev-0.7.10/tests/unit/sdk/__init__.py +0 -0
- gpu_dev-0.7.10/tests/unit/sdk/test_backend_aws.py +790 -0
- gpu_dev-0.7.10/tests/unit/sdk/test_client.py +519 -0
- gpu_dev-0.7.10/tests/unit/sdk/test_errors_enums.py +308 -0
- gpu_dev-0.7.10/tests/unit/sdk/test_models_extra.py +361 -0
- gpu_dev-0.7.10/tests/unit/sdk/test_sandbox.py +352 -0
- gpu_dev-0.7.10/tests/unit/sdk/test_sdk_config.py +258 -0
- gpu_dev-0.7.10/tests/unit/sdk/test_transport_ssh.py +327 -0
- gpu_dev-0.7.5/sdk/python/examples/parallel_experiments.ipynb +0 -362
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/admin/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/admin/generate_stats.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/admin/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/architecture.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/devgpu-features.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/feedback.png +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/index.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/osdc-future-plans.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/problem.png +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/sandbox.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/thesis.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/title-vid.mp4 +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/weneedgpus.png +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/wow.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/batch_multi_gpu.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/interactive_debug.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/quickstart.ipynb +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/run_tests.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/submit_job.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/config.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/enums.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/errors.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/models.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/py.typed +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/tests/test_models.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/setup.cfg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-deck/backend.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-deck/main.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-deck/terraform.tfvars.example +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/build-node.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/warm-pool.tf +0 -0
- {gpu_dev-0.7.5/sdk/python → gpu_dev-0.7.10}/tests/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/tests/submit/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.10}/tests/submit/success/run.sh +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
name: tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
unit:
|
|
9
|
+
name: unit + mocks
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- name: Install uv
|
|
14
|
+
uses: astral-sh/setup-uv@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
- name: Install package + test deps
|
|
18
|
+
run: uv pip install -e ".[test]"
|
|
19
|
+
- name: Run unit + mock tests (integration excluded)
|
|
20
|
+
run: uv run pytest -m "not integration"
|
|
@@ -73,3 +73,14 @@ lambda/*/package/
|
|
|
73
73
|
admin/output/
|
|
74
74
|
|
|
75
75
|
.claude/worktrees/
|
|
76
|
+
.claude/settings.local.json
|
|
77
|
+
.claude/scheduled_tasks.lock
|
|
78
|
+
|
|
79
|
+
# Org-specific (filled in locally; not committed)
|
|
80
|
+
docs/INTERNAL_AUTH.md
|
|
81
|
+
|
|
82
|
+
# Local scratch / staging terraform working dir
|
|
83
|
+
*.pid
|
|
84
|
+
terraform-gpu-devservers/staging/.terraform/
|
|
85
|
+
terraform-gpu-devservers/staging/__pycache__/
|
|
86
|
+
terraform-gpu-devservers/staging/*.log
|
|
@@ -28,6 +28,59 @@ For terraform, we use opentofu, don't ever run tf apply directly. You're free to
|
|
|
28
28
|
- Group imports in standard order: standard library, third-party, local imports
|
|
29
29
|
- Use absolute imports when possible
|
|
30
30
|
|
|
31
|
+
## Testing (DO THIS FOR EVERY CHANGE)
|
|
32
|
+
|
|
33
|
+
There is a real test suite now. **Every change must keep it green, and add/adjust
|
|
34
|
+
tests.** Two tiers:
|
|
35
|
+
|
|
36
|
+
**1. Unit + mocks — ALWAYS run, must stay green (CI runs this on every push/PR).**
|
|
37
|
+
Fully mocked (boto3 / k8s / SSH / subprocess), no network, ~2s.
|
|
38
|
+
```bash
|
|
39
|
+
uv pip install -e ".[test]" # one-time: pytest, moto, kubernetes
|
|
40
|
+
uv run pytest -m "not integration" # ~1140 tests; run before every commit
|
|
41
|
+
```
|
|
42
|
+
- Layout: `tests/unit/{sdk,cli,lambda_fn}/test_*.py`; shared fixtures in the root
|
|
43
|
+
`conftest.py` (`cli_runner`, `lambda_index` = the lambda imported as `index`
|
|
44
|
+
with env pre-set, `aws_mocks` = MagicMock boto3 handles).
|
|
45
|
+
- When you touch CLI / SDK / lambda code, update or add the matching `test_*.py`.
|
|
46
|
+
- CI: `.github/workflows/tests.yml`. Lambda imports need env vars + sys.path — the
|
|
47
|
+
root `conftest.py` already sets both.
|
|
48
|
+
|
|
49
|
+
**2. e2e integration on STAGING — run for anything touching the
|
|
50
|
+
reserve/pod/SSH/lambda path before merging.** Real reservations on the **staging**
|
|
51
|
+
cluster (us-west-1), cpu + t4 only, auto-cancelled. Staging is the DEFAULT target
|
|
52
|
+
and github_user comes from your config, so the bare command is enough:
|
|
53
|
+
```bash
|
|
54
|
+
uv run pytest -m integration --run-integration -v
|
|
55
|
+
```
|
|
56
|
+
- Staging is the default (`GPU_DEV_TEST_ENV` defaults to `staging` → us-west-1,
|
|
57
|
+
standard `pytorch-gpu-dev-*` prefix, tf workspace `default`). The integration
|
|
58
|
+
conftest pins the region so the unit-test us-east-2 default can't leak in. Wired
|
|
59
|
+
in `cli-tools/.../config.py` ENVIRONMENTS.
|
|
60
|
+
- Covers: cpu-x86 + t4 reserve→active→cancel, list-while-active, exec
|
|
61
|
+
(`nproc`/`nvidia-smi`/`torch.cuda`), **`claude -p` answers "Paris"** (pod Claude
|
|
62
|
+
Code/Bedrock), and the **warm pool** (fast warm claim + custom-image
|
|
63
|
+
warm-ineligibility). Each cancels in a `finally` (no leaked pods).
|
|
64
|
+
- Warm-pool tests need `WARM_POOL_TARGETS` deployed on staging — set in
|
|
65
|
+
`lambda.tf` for the `default` workspace (`{t4, cpu-x86, cpu-arm}`). Staging IS the
|
|
66
|
+
tf `default` workspace (us-west-1, environment=test) — there is no `test`/`staging`
|
|
67
|
+
workspace: `tofu workspace select default && tofu apply`. Until then the warm
|
|
68
|
+
tests skip ("came up cold"). Custom-image test: set `GPU_DEV_TEST_IMAGE`.
|
|
69
|
+
- Repro test (`test_repro_known_failure.py`): set `GPU_DEV_REPRO_REF` +
|
|
70
|
+
`GPU_DEV_REPRO_TEST` to a known-red (commit, test). Find one with the
|
|
71
|
+
**treehugger MCP** (`hud`, user-scope — `get_hud_data`/`master_commit_red`).
|
|
72
|
+
Note: prebuilt torch is h100/b200 arch, so a CUDA test on t4 needs a full build;
|
|
73
|
+
prefer a failure that runs on the box's GPU or on cpu.
|
|
74
|
+
- Skips cleanly if staging is unreachable or the runner has no outbound SSH (e.g. a
|
|
75
|
+
sandbox). The reservation role can query/SQS but lacks `DescribeTable`, so the
|
|
76
|
+
reachability probe uses scan+get-queue-url, not describe.
|
|
77
|
+
- Validated live (2026-05-31): cpu + t4 lifecycle PASS; warm-claim test confirmed
|
|
78
|
+
it reaches the real reserve (skips until WARM_POOL_TARGETS is applied).
|
|
79
|
+
|
|
80
|
+
**Rule of thumb:** unit+mocks for *every* change; add e2e coverage when you add a
|
|
81
|
+
new command/flow; run the staging e2e before merging anything that could affect a
|
|
82
|
+
live reservation. Don't say "done/tested" without having run the relevant tier.
|
|
83
|
+
|
|
31
84
|
## Content
|
|
32
85
|
|
|
33
86
|
- torchci - a next.js app containing a PyTorch CI tracker
|
|
@@ -51,6 +104,42 @@ Currently we're working on a developer servers with GPUs in AWS. This means we'l
|
|
|
51
104
|
|
|
52
105
|
# AGENT SECTION
|
|
53
106
|
|
|
107
|
+
## Fast-repro redesign — by-SHA artifact cache + on-demand build (2026-06-01)
|
|
108
|
+
|
|
109
|
+
Goal: `gpu-dev repro <ref>` for any pytorch commit from the last ~72h lands a built,
|
|
110
|
+
importable tree in <2min. Design: `docs/FAST_REPRO_DESIGN.md`. **All merged to main**
|
|
111
|
+
(PRs #186–#189); **needs `tofu apply` (prod, workspace `prod`) + image rebuild**.
|
|
112
|
+
|
|
113
|
+
- **by-SHA artifact cache** (#186): whole *built* trees keyed by commit SHA at
|
|
114
|
+
`/ccache_shared/prebuilt/by-sha/<sha>.tar.{zst,gz}` (`.sha` written last = the
|
|
115
|
+
completion gate). Cron seeds one per viable/strict bump (hardlink, no extra space).
|
|
116
|
+
`stage-pytorch` (cold `--ref`) + `gpu-dev repro` consume on hit → `import torch`
|
|
117
|
+
with ZERO build. `repro` also publishes its in-pod build via `publish-pytorch-build`
|
|
118
|
+
(detached) so the cache fills from real usage. All paths safe-fallback on miss;
|
|
119
|
+
`ls-remote` is `timeout 15`.
|
|
120
|
+
- **retention** (#188): prebuild cron prunes by-sha entries >72h every tick (storage
|
|
121
|
+
budget ~500-650GB on the elastic ccache EFS). The by-sha set IS the snapshot ladder.
|
|
122
|
+
- **mold linker** (#187): Dockerfile installs `mold`; cron + in-pod repro build wrap
|
|
123
|
+
with `mold -run` (guarded on `command -v mold`). Drops the libtorch_cuda.so relink
|
|
124
|
+
~1-3min → ~15s. **Needs image rebuild** to activate (prod runs a stale image; that's
|
|
125
|
+
also why prod publishes gzip not zstd — the Dockerfile has zstd already).
|
|
126
|
+
- **on-demand build worker** (#189, `pytorch-ondemand.tf`): always-on Deployment on
|
|
127
|
+
NodeType=build drains `prebuilt/build-queue/<sha>.req` (own hostPath tree
|
|
128
|
+
`/mnt/ondemand-build` → builds at `/home/dev/pytorch` so build/ paths are
|
|
129
|
+
pod-compatible; mold+ccache), publishes by-sha, writes `.worker-alive` heartbeat.
|
|
130
|
+
`repro` enqueues + polls ONLY when the heartbeat is fresh (else straight to in-pod
|
|
131
|
+
build → zero regression if not deployed). Makes the FIRST repro of an uncached
|
|
132
|
+
commit fast. Coordination 100% via shared EFS — no new networking/RBAC/lambda.
|
|
133
|
+
- cuDNN fidelity (`USE_CUDNN=1`) DEFERRED — forcing it can fail the build if cuDNN
|
|
134
|
+
isn't found under cuda-13.2; needs prod e2e. Base image is cudnn9-devel.
|
|
135
|
+
- Fast path is **prod-arch only** (`sm_90;sm_100` = H100/B200); t4/staging is wrong-arch.
|
|
136
|
+
- Also: SSH alias now keys off reservation id not pod name (#185) so warm/repro pods
|
|
137
|
+
are reachable via `ssh gpu-dev-<resid>` / `connect` (routing is via the FQDN, the
|
|
138
|
+
alias is a local label). CCACHE_MAXSIZE settled at 250G (#184).
|
|
139
|
+
- Prod e2e: `gpu-dev repro <fresh-sha> <test> --gpu-type h100 --no-connect` (first =
|
|
140
|
+
off-pod build + stage; rerun = by-sha HIT zero build). Worker logs:
|
|
141
|
+
`k -n management logs deploy/pytorch-ondemand-builder -f`.
|
|
142
|
+
|
|
54
143
|
## Instant-sandboxes branch — WIP & things to fix (2026-05-29)
|
|
55
144
|
|
|
56
145
|
Big push on warm pools + instant claims + prebuilt pytorch. Tracking state here so it's not lost.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.7.5
|
|
3
|
+
Version: 0.7.10
|
|
4
4
|
Summary: CLI + Python SDK for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -15,6 +15,11 @@ Requires-Dist: questionary>=2.1.1
|
|
|
15
15
|
Requires-Dist: websockets>=12.0
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
|
+
Provides-Extra: test
|
|
19
|
+
Requires-Dist: pytest>=7.4; extra == "test"
|
|
20
|
+
Requires-Dist: pytest-cov>=4.1; extra == "test"
|
|
21
|
+
Requires-Dist: moto[dynamodb,ec2,sqs]>=5.0; extra == "test"
|
|
22
|
+
Requires-Dist: kubernetes>=28.1; extra == "test"
|
|
18
23
|
|
|
19
24
|
# GPU Developer CLI & SDK
|
|
20
25
|
|
|
@@ -319,6 +319,9 @@ def _show_single_reservation(connection_info: dict) -> None:
|
|
|
319
319
|
reservation_id = connection_info["reservation_id"]
|
|
320
320
|
reservation_name = connection_info.get("name")
|
|
321
321
|
pod_name = connection_info.get("pod_name", "")
|
|
322
|
+
# SSH host alias keys off the reservation id (works for warm-claimed pods,
|
|
323
|
+
# whose pod_name != gpu-dev-<resid8>). pod_name is shown separately below.
|
|
324
|
+
host_alias = f"gpu-dev-{short_id}"
|
|
322
325
|
ssh_config_path = get_ssh_config_path(reservation_id, reservation_name)
|
|
323
326
|
use_include = is_ssh_include_enabled()
|
|
324
327
|
|
|
@@ -328,14 +331,14 @@ def _show_single_reservation(connection_info: dict) -> None:
|
|
|
328
331
|
if use_include:
|
|
329
332
|
# User approved Include - show simple commands
|
|
330
333
|
from .reservations import _make_vscode_link
|
|
331
|
-
ssh_command_display = f"[green]ssh {pod_name}[/green]"
|
|
332
|
-
vscode_url = _make_vscode_link(pod_name)
|
|
333
|
-
vscode_cmd_text = f"code --remote ssh-remote+{pod_name} /home/dev"
|
|
334
|
+
ssh_command_display = f"[green]ssh {host_alias}[/green]"
|
|
335
|
+
vscode_url = _make_vscode_link(host_alias)
|
|
336
|
+
vscode_cmd_text = f"code --remote ssh-remote+{host_alias} /home/dev"
|
|
334
337
|
vscode_command_display = f"[link={vscode_url}][green]{vscode_cmd_text}[/green][/link]"
|
|
335
338
|
vscode_info = f"[blue]VS Code Remote:[/blue] {vscode_command_display}\n"
|
|
336
339
|
else:
|
|
337
340
|
# User declined Include - show commands with -F flag
|
|
338
|
-
ssh_command_display = f"[green]ssh -F {ssh_config_path} {pod_name}[/green]"
|
|
341
|
+
ssh_command_display = f"[green]ssh -F {ssh_config_path} {host_alias}[/green]"
|
|
339
342
|
vscode_command_display = f"Add [green]Include ~/.gpu-dev/*-sshconfig[/green] to ~/.ssh/config and ~/.cursor/ssh_config (or: [green]gpu-dev config ssh-include enable[/green])"
|
|
340
343
|
vscode_info = f"[blue]VS Code/Cursor:[/blue] {vscode_command_display}\n"
|
|
341
344
|
else:
|
|
@@ -1523,12 +1526,19 @@ def reserve(
|
|
|
1523
1526
|
@click.option("--gpu-type", default="b200", show_default=True, help="GPU type for the repro box.")
|
|
1524
1527
|
@click.option("--gpus", type=int, default=1, show_default=True)
|
|
1525
1528
|
@click.option("--hours", type=float, default=3.0, show_default=True,
|
|
1526
|
-
help="Lifetime ceiling
|
|
1529
|
+
help="Lifetime ceiling for the box.")
|
|
1530
|
+
@click.option("--no-connect", is_flag=True, default=False,
|
|
1531
|
+
help="CI mode: run the test, auto-cancel, exit code = test result. Default (on a TTY) drops you into the box to iterate.")
|
|
1527
1532
|
@click.option("--keep", is_flag=True, default=False,
|
|
1528
|
-
help="
|
|
1533
|
+
help="Never cancel the box (skip the cancel prompt / auto-cancel).")
|
|
1529
1534
|
@click.pass_context
|
|
1530
|
-
def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
|
|
1531
|
-
"""Reserve a GPU, check out a PR/commit, run a test, then
|
|
1535
|
+
def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
1536
|
+
"""Reserve a GPU, check out a PR/commit, run a test, then drop you into the box.
|
|
1537
|
+
|
|
1538
|
+
By default (in a terminal) repro runs the test and then **connects you into the
|
|
1539
|
+
box** at ~/pytorch — the ref is checked out, so you can fix and re-run. The box
|
|
1540
|
+
stays alive until you cancel it (you're prompted on exit). Use --no-connect for
|
|
1541
|
+
CI/scripts (run the test, auto-cancel, process exit code = the test result).
|
|
1532
1542
|
|
|
1533
1543
|
REF: pr/<N>, #<N>, a bare PR number, a branch, or a commit sha. PRs use
|
|
1534
1544
|
pull/<N>/merge (what CI tests), falling back to /head.
|
|
@@ -1539,6 +1549,7 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
|
|
|
1539
1549
|
"""
|
|
1540
1550
|
import shlex
|
|
1541
1551
|
import subprocess
|
|
1552
|
+
import sys
|
|
1542
1553
|
config = load_config()
|
|
1543
1554
|
reservation_mgr = ReservationManager(config)
|
|
1544
1555
|
try:
|
|
@@ -1546,27 +1557,82 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
|
|
|
1546
1557
|
except RuntimeError as e:
|
|
1547
1558
|
rprint(f"[red]❌ {str(e)}[/red]"); return
|
|
1548
1559
|
|
|
1549
|
-
# ref
|
|
1560
|
+
# Resolve the ref in-pod -> WANT (sha, for the by-sha cache) + FREF (fetch ref).
|
|
1561
|
+
# A MERGED pr/N reproduces the actual squash/merge commit on main (the real trunk
|
|
1562
|
+
# state that was red) — NOT pull/N/merge (the PR re-applied onto *current* trunk,
|
|
1563
|
+
# which goes green once the fix lands). Open PRs keep pull/N/merge (= CI's view).
|
|
1550
1564
|
r = ref.strip(); prnum = None
|
|
1551
1565
|
if r.startswith("pr/"): prnum = r[3:]
|
|
1552
1566
|
elif r.startswith("#"): prnum = r[1:]
|
|
1553
1567
|
elif r.isdigit(): prnum = r
|
|
1568
|
+
gh = "https://github.com/pytorch/pytorch.git"
|
|
1554
1569
|
if prnum:
|
|
1555
|
-
|
|
1556
|
-
|
|
1570
|
+
api = f"https://api.github.com/repos/pytorch/pytorch/pulls/{prnum}"
|
|
1571
|
+
resolve = (
|
|
1572
|
+
f"PRJSON=$(curl -s -m 10 -H 'Accept: application/vnd.github+json' -H 'User-Agent: gpu-dev' {api} 2>/dev/null); "
|
|
1573
|
+
"MCS=$(printf '%s' \"$PRJSON\" | grep -oE '\"merge_commit_sha\": *\"[0-9a-f]+\"' | head -1 | cut -d'\"' -f4); "
|
|
1574
|
+
"if printf '%s' \"$PRJSON\" | grep -q '\"merged\": *true' && [ -n \"$MCS\" ]; then "
|
|
1575
|
+
f"WANT=\"$MCS\"; FREF=\"$MCS\"; echo \"[repro] pr/{prnum} is merged -> reproducing trunk commit $MCS\"; "
|
|
1576
|
+
f"else FREF=pull/{prnum}/merge; WANT=$(timeout 15 git ls-remote {gh} $FREF 2>/dev/null | head -1 | cut -f1); "
|
|
1577
|
+
f"[ -n \"$WANT\" ] || {{ FREF=pull/{prnum}/head; WANT=$(timeout 15 git ls-remote {gh} $FREF 2>/dev/null | head -1 | cut -f1); echo '[repro] open PR, no /merge -> /head'; }}; fi; ")
|
|
1557
1578
|
else:
|
|
1558
1579
|
rq = shlex.quote(r)
|
|
1559
|
-
|
|
1580
|
+
resolve = (f"FREF={rq}; WANT=$(timeout 15 git ls-remote {gh} {rq} 2>/dev/null | head -1 | cut -f1); "
|
|
1581
|
+
f"[ -n \"$WANT\" ] || case {rq} in *[!0-9a-fA-F]*) WANT= ;; *) WANT={rq} ;; esac; ")
|
|
1582
|
+
# in-pod fallback checkout (by-sha miss + farm unavailable): fetch the resolved ref,
|
|
1583
|
+
# else check out the sha directly (reachable for a merged-PR land commit / trunk).
|
|
1584
|
+
checkout = ("git fetch origin \"$FREF\" 2>/dev/null && git checkout -f FETCH_HEAD "
|
|
1585
|
+
"|| git checkout -f \"$WANT\" 2>/dev/null "
|
|
1586
|
+
"|| { git fetch --force origin 2>/dev/null && git checkout -f \"$WANT\"; }")
|
|
1560
1587
|
|
|
1561
1588
|
testcmd = " ".join(shlex.quote(a) for a in test_args)
|
|
1589
|
+
# by-sha artifact cache: if a fully-built tree for the resolved SHA already exists
|
|
1590
|
+
# (shared EFS, seeded by the build node + prior repros), stage it -> ZERO build.
|
|
1591
|
+
# Otherwise build, then publish the result so the next dev (anyone) gets it instant.
|
|
1562
1592
|
remote = (
|
|
1563
1593
|
"set -e; cd /home/dev/pytorch; "
|
|
1564
1594
|
"git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true; "
|
|
1565
|
-
|
|
1595
|
+
"BYSHA=/ccache_shared/prebuilt/by-sha; QUEUE=/ccache_shared/prebuilt/build-queue; HIT=; "
|
|
1596
|
+
# bs <sha>: stage a fully-built by-sha tree into /home/dev/pytorch (zero build); 0 on success.
|
|
1597
|
+
# explicit ext check, not a glob: the pod login shell is zsh, where an unmatched glob is a hard error.
|
|
1598
|
+
# require the .sha completion gate (written last) so we never stage a half-published tarball.
|
|
1599
|
+
"bs() { local s=\"$1\" tb=; [ -f \"$BYSHA/$s.sha\" ] || return 1; for e in zst gz; do [ -f \"$BYSHA/$s.tar.$e\" ] && { tb=\"$BYSHA/$s.tar.$e\"; break; }; done; [ -n \"$tb\" ] || return 1; "
|
|
1600
|
+
"rm -rf /home/dev/pytorch.new; mkdir -p /home/dev/pytorch.new; "
|
|
1601
|
+
"case \"$tb\" in *.zst) zstd -dc \"$tb\" 2>/dev/null | tar -C /home/dev/pytorch.new --strip-components=1 -xf - 2>/dev/null ;; "
|
|
1602
|
+
"*) tar -C /home/dev/pytorch.new --strip-components=1 -xzf \"$tb\" 2>/dev/null ;; esac; "
|
|
1603
|
+
"[ -d /home/dev/pytorch.new/.git ] || { rm -rf /home/dev/pytorch.new; return 1; }; "
|
|
1604
|
+
"rm -rf /home/dev/pytorch; mv /home/dev/pytorch.new /home/dev/pytorch; return 0; }; "
|
|
1605
|
+
+ resolve +
|
|
1606
|
+
"echo \"[repro] target ${WANT:-?}\"; "
|
|
1607
|
+
# 1) already cached -> stage it (zero build)
|
|
1608
|
+
"if [ -n \"$WANT\" ] && bs \"$WANT\"; then cd /home/dev/pytorch; HIT=1; echo '[repro] by-sha cache HIT -> staged prebuilt tree (zero build)'; fi; "
|
|
1609
|
+
# 2) not cached, build farm alive -> request an off-pod build, wait, then stage
|
|
1610
|
+
"if [ -z \"$HIT\" ] && [ -n \"$WANT\" ] && [ -n \"$(find \"$QUEUE/.worker-alive\" -mmin -2 2>/dev/null)\" ]; then "
|
|
1611
|
+
"echo \"[repro] no cached build; requesting off-pod build of $WANT (build farm; streaming progress)…\"; printf '%s\\n' \"$FREF\" > \"$QUEUE/$WANT.req\" 2>/dev/null || true; "
|
|
1612
|
+
# poll for the artifact; meanwhile tail the farm's build log (ninja [x/N]) so it's not a silent hang.
|
|
1613
|
+
"i=0; LL=0; while [ $i -lt 400 ]; do [ -f \"$BYSHA/$WANT.sha\" ] && break; [ -f \"$QUEUE/$WANT.req\" ] || break; "
|
|
1614
|
+
"if [ -f \"$QUEUE/$WANT.log\" ]; then NL=$(wc -l < \"$QUEUE/$WANT.log\" 2>/dev/null || echo 0); "
|
|
1615
|
+
"if [ \"$NL\" -gt \"$LL\" ]; then tail -n +$((LL+1)) \"$QUEUE/$WANT.log\" 2>/dev/null | grep -aE '\\[[0-9]+/[0-9]+\\]|Building wheel|Successfully built|error' | tail -1 | sed 's/^/ [farm] /'; LL=$NL; fi; fi; "
|
|
1616
|
+
"sleep 3; i=$((i+1)); done; "
|
|
1617
|
+
"if bs \"$WANT\"; then cd /home/dev/pytorch; HIT=1; echo '[repro] off-pod build ready -> staged (zero build)'; else echo '[repro] off-pod build unavailable, building locally'; fi; fi; "
|
|
1618
|
+
# 3) fall back to in-pod fetch + build (+ cache the result for the next dev)
|
|
1619
|
+
"if [ -z \"$HIT\" ]; then "
|
|
1620
|
+
"echo \"[repro] checking out $FREF\"; " + checkout + "; "
|
|
1566
1621
|
"echo \"[repro] HEAD $(git rev-parse --short HEAD)\"; "
|
|
1567
1622
|
"git -c protocol.file.allow=always submodule update --init --recursive --jobs 8 >/dev/null 2>&1 || true; "
|
|
1568
1623
|
"if ! PYTHONPATH=/home/dev/pytorch python -c 'import torch' 2>/dev/null; then "
|
|
1569
|
-
"echo
|
|
1624
|
+
"echo \"[repro] prebuilt torch != this commit -> rebuilding (ccache-accelerated, but the further this commit is from viable/strict, the more recompiles). checked-out: $(git log -1 --format='%h %ci')\"; "
|
|
1625
|
+
# mold -run routes the libtorch_cuda.so relink through mold (~15s vs minutes); guarded.
|
|
1626
|
+
# Explicit if/else (not `$M pip`): the pod login shell is zsh, which doesn't word-split
|
|
1627
|
+
# unquoted vars. -v streams the cmake/ninja [x/N] progress instead of pip's blind spinner.
|
|
1628
|
+
"if command -v mold >/dev/null 2>&1; then mold -run pip install --break-system-packages -e . --no-build-isolation -v; "
|
|
1629
|
+
"else pip install --break-system-packages -e . --no-build-isolation -v; fi; fi; "
|
|
1630
|
+
# cache this build for the next dev (detached so it survives the ssh session)
|
|
1631
|
+
"SHA=$(git rev-parse HEAD 2>/dev/null); "
|
|
1632
|
+
"if command -v publish-pytorch-build >/dev/null 2>&1 && [ -n \"$SHA\" ] && [ ! -f \"$BYSHA/$SHA.sha\" ]; then "
|
|
1633
|
+
"echo '[repro] caching this build (by-sha) for next time…'; "
|
|
1634
|
+
"setsid publish-pytorch-build \"$SHA\" >/dev/null 2>&1 < /dev/null & fi; "
|
|
1635
|
+
"fi; "
|
|
1570
1636
|
f"echo '[repro] running: python {testcmd}'; "
|
|
1571
1637
|
f"PYTHONPATH=/home/dev/pytorch python {testcmd}"
|
|
1572
1638
|
)
|
|
@@ -1602,21 +1668,55 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
|
|
|
1602
1668
|
if "StrictHostKeyChecking" not in ssh_cmd:
|
|
1603
1669
|
ssh_cmd = ssh_cmd.replace("ssh ", "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR ", 1)
|
|
1604
1670
|
rprint(f"[dim]→ {ssh_cmd}[/dim]\n")
|
|
1671
|
+
rid8 = str(rid)[:8]
|
|
1605
1672
|
rc = 1
|
|
1606
1673
|
try:
|
|
1607
1674
|
rc = subprocess.run(f"{ssh_cmd} {shlex.quote(remote)}", shell=True).returncode
|
|
1608
1675
|
except KeyboardInterrupt:
|
|
1609
|
-
rprint("\n[yellow]interrupted[/yellow]")
|
|
1610
|
-
|
|
1676
|
+
rprint("\n[yellow]interrupted[/yellow]"); rc = 130
|
|
1677
|
+
|
|
1678
|
+
verdict = "[green]✓ test passed[/green]" if rc == 0 else f"[red]✗ test failed (exit {rc})[/red]"
|
|
1679
|
+
|
|
1680
|
+
# Default (TTY): drop into the box so you can fix and re-run. --no-connect is the
|
|
1681
|
+
# CI path: auto-cancel and exit with the test's code.
|
|
1682
|
+
connect = (not no_connect) and sys.stdout.isatty()
|
|
1683
|
+
if connect:
|
|
1684
|
+
rprint(f"\n{verdict} — dropping you into the box at ~/pytorch ({ref} checked out).")
|
|
1685
|
+
rprint(f"[dim] re-run: python {testcmd}[/dim]")
|
|
1686
|
+
rprint(f"[dim] finish: gpu-dev cancel (from inside) • or exit this shell[/dim]\n")
|
|
1687
|
+
shell_cmd = f"{ssh_cmd} -t {shlex.quote('cd /home/dev/pytorch 2>/dev/null; exec ${SHELL:-bash} -l')}"
|
|
1688
|
+
try:
|
|
1689
|
+
subprocess.run(shell_cmd, shell=True)
|
|
1690
|
+
except KeyboardInterrupt:
|
|
1691
|
+
pass
|
|
1611
1692
|
if keep:
|
|
1612
|
-
rprint(f"[cyan]📌
|
|
1613
|
-
|
|
1693
|
+
rprint(f"[cyan]📌 left {rid8} running — connect: gpu-dev connect {rid8} • cancel: gpu-dev cancel {rid8}[/cyan]")
|
|
1694
|
+
return
|
|
1695
|
+
try:
|
|
1696
|
+
drop = click.confirm(f"Cancel repro box {rid8}?", default=True)
|
|
1697
|
+
except (KeyboardInterrupt, EOFError, click.Abort):
|
|
1698
|
+
drop = False
|
|
1699
|
+
if drop:
|
|
1614
1700
|
try:
|
|
1615
1701
|
reservation_mgr.cancel_reservation(rid, user_info["user_id"])
|
|
1616
|
-
rprint(f"[green]🧹 cancelled
|
|
1702
|
+
rprint(f"[green]🧹 cancelled {rid8}[/green]")
|
|
1617
1703
|
except Exception as e:
|
|
1618
|
-
rprint(f"[yellow]
|
|
1619
|
-
|
|
1704
|
+
rprint(f"[yellow]cancel failed for {rid8}: {e}[/yellow]")
|
|
1705
|
+
else:
|
|
1706
|
+
rprint(f"[cyan]📌 left {rid8} running — connect: gpu-dev connect {rid8} • cancel: gpu-dev cancel {rid8}[/cyan]")
|
|
1707
|
+
return
|
|
1708
|
+
|
|
1709
|
+
# --no-connect / non-TTY: auto-cancel unless --keep, exit code = test result.
|
|
1710
|
+
if keep:
|
|
1711
|
+
rprint(f"[cyan]📌 kept {rid8} — gpu-dev connect {rid8} • gpu-dev cancel {rid8}[/cyan]")
|
|
1712
|
+
else:
|
|
1713
|
+
try:
|
|
1714
|
+
reservation_mgr.cancel_reservation(rid, user_info["user_id"])
|
|
1715
|
+
rprint(f"[green]🧹 cancelled repro box {rid8}[/green]")
|
|
1716
|
+
except Exception as e:
|
|
1717
|
+
rprint(f"[yellow]auto-cancel failed for {rid8}: {e}[/yellow]")
|
|
1718
|
+
rprint(f"\n[bold]repro exit code: {rc}[/bold] ({verdict})")
|
|
1719
|
+
sys.exit(rc)
|
|
1620
1720
|
|
|
1621
1721
|
|
|
1622
1722
|
_SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
@@ -1837,7 +1937,9 @@ def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dock
|
|
|
1837
1937
|
sys.exit(1)
|
|
1838
1938
|
create_ssh_config_for_reservation(master_fqdn, master_pod, master_id, master_name)
|
|
1839
1939
|
|
|
1840
|
-
|
|
1940
|
+
# Host alias matches the Host line written by create_ssh_config_for_reservation
|
|
1941
|
+
# (keyed off the reservation id, so warm-claimed masters resolve too).
|
|
1942
|
+
ssh_alias = f"gpu-dev-{master_id[:8]}"
|
|
1841
1943
|
ssh_base = ["ssh", "-F", str(config_file), "-o", "StrictHostKeyChecking=accept-new"]
|
|
1842
1944
|
rsync_e = " ".join(shlex.quote(x) for x in ssh_base)
|
|
1843
1945
|
|
|
@@ -3124,11 +3226,15 @@ def _show_direct_success(res: dict, elapsed: float) -> None:
|
|
|
3124
3226
|
"""Print the success block for an instant warm-pool claim,
|
|
3125
3227
|
matching the normal reserve output (SSH config + VS Code/Cursor remote)."""
|
|
3126
3228
|
from gpu_dev_cli.reservations import (
|
|
3127
|
-
create_ssh_config_for_reservation, _generate_vscode_command,
|
|
3229
|
+
create_ssh_config_for_reservation, _generate_vscode_command,
|
|
3230
|
+
_generate_cursor_command, _make_vscode_link, _make_cursor_link)
|
|
3128
3231
|
rid = res.get("reservation_id", "") or ""
|
|
3129
3232
|
ssh_command = res.get("ssh_command", "") or ""
|
|
3130
3233
|
pod_name = res.get("pod_name", "") or ""
|
|
3131
3234
|
fqdn = res.get("fqdn") or ""
|
|
3235
|
+
# Host alias keys off the reservation id — warm-claimed pods have a pod_name
|
|
3236
|
+
# that is NOT gpu-dev-<resid8>, so we must not use pod_name as the ssh alias.
|
|
3237
|
+
host_alias = f"gpu-dev-{rid[:8]}" if rid else pod_name
|
|
3132
3238
|
|
|
3133
3239
|
rprint(f"\n[green]✅ Instant reservation ready in {elapsed:.1f}s![/green]")
|
|
3134
3240
|
rprint(f"[bold]📋 Reservation ID:[/bold] {rid}")
|
|
@@ -3137,24 +3243,28 @@ def _show_direct_success(res: dict, elapsed: float) -> None:
|
|
|
3137
3243
|
if rid:
|
|
3138
3244
|
rprint(f"[bold]⚡ Quick Connect:[/bold] gpu-dev connect {rid[:8]}")
|
|
3139
3245
|
|
|
3140
|
-
# Build the per-reservation SSH config so `ssh
|
|
3246
|
+
# Build the per-reservation SSH config so `ssh gpu-dev-<resid8>` and connect work cleanly.
|
|
3141
3247
|
use_include = False
|
|
3142
3248
|
if fqdn and pod_name and rid:
|
|
3143
3249
|
try:
|
|
3144
3250
|
_cfg, use_include = create_ssh_config_for_reservation(fqdn, pod_name, rid, None)
|
|
3145
3251
|
except Exception:
|
|
3146
3252
|
pass
|
|
3147
|
-
if
|
|
3148
|
-
rprint(f"[bold]🖥️ SSH Command:[/bold] ssh {
|
|
3149
|
-
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
|
|
3154
|
-
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
|
|
3253
|
+
if use_include and rid:
|
|
3254
|
+
rprint(f"[bold]🖥️ SSH Command:[/bold] ssh {host_alias}")
|
|
3255
|
+
vscode_url = _make_vscode_link(host_alias)
|
|
3256
|
+
cursor_url = _make_cursor_link(host_alias)
|
|
3257
|
+
rprint(f"[bold]💻 VS Code Remote:[/bold] [link={vscode_url}]code --remote ssh-remote+{host_alias} /home/dev[/link]")
|
|
3258
|
+
rprint(f"[bold]🖥️ Cursor Remote:[/bold] [link={cursor_url}]cursor --remote ssh-remote+{host_alias} /home/dev[/link]")
|
|
3259
|
+
else:
|
|
3260
|
+
if ssh_command:
|
|
3261
|
+
rprint(f"[bold]🖥️ SSH Command:[/bold] {ssh_command}")
|
|
3262
|
+
vsc = _generate_vscode_command(ssh_command) if ssh_command else None
|
|
3263
|
+
cur = _generate_cursor_command(ssh_command) if ssh_command else None
|
|
3264
|
+
if vsc:
|
|
3265
|
+
rprint(f"[bold]💻 VS Code Remote:[/bold] {vsc}")
|
|
3266
|
+
if cur:
|
|
3267
|
+
rprint(f"[bold]🖥️ Cursor Remote:[/bold] {cur}")
|
|
3158
3268
|
|
|
3159
3269
|
|
|
3160
3270
|
def _format_gpu_display(gpu_count, gpu_type):
|
|
@@ -3343,15 +3453,22 @@ def _show_availability(show_spot: bool = False) -> None:
|
|
|
3343
3453
|
spot_table = Table(title="⚡ Spot Instances (us-east-1, ~70% cheaper)")
|
|
3344
3454
|
spot_table.add_column("GPU Type", style="cyan")
|
|
3345
3455
|
spot_table.add_column("Avail\nNow", style="green")
|
|
3456
|
+
spot_table.add_column("In\nUse", style="yellow")
|
|
3346
3457
|
spot_table.add_column("Per\nNode", style="bright_green")
|
|
3347
3458
|
spot_table.add_column("Status", style="magenta")
|
|
3348
3459
|
spot_table.add_column("Spot Discount", style="dim")
|
|
3349
3460
|
_on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
|
|
3350
3461
|
for gt, info in sorted(spot_region_info.items()):
|
|
3351
3462
|
avail = info.get("available", 0)
|
|
3463
|
+
total = info.get("total", 0)
|
|
3464
|
+
in_use = max(0, total - avail) # GPUs on up spot nodes already taken
|
|
3352
3465
|
per_node = spot_gpus_per_node.get(gt, 8)
|
|
3353
3466
|
avail_display = f"[green]{avail}[/green]" if avail > 0 else f"[dim]0[/dim]"
|
|
3354
|
-
|
|
3467
|
+
in_use_display = f"[yellow]{in_use}[/yellow]" if in_use > 0 else f"[dim]0[/dim]"
|
|
3468
|
+
if in_use > 0:
|
|
3469
|
+
status = "[yellow]Node up (in use)[/yellow]" if avail == 0 else "[green]Node up[/green]"
|
|
3470
|
+
else:
|
|
3471
|
+
status = "[green]Node up[/green]" if avail > 0 else "Spins up on reserve (~10 min)"
|
|
3355
3472
|
si = info.get("spot_info", {}) or {}
|
|
3356
3473
|
sp = si.get("spot_price", "") if isinstance(si, dict) else ""
|
|
3357
3474
|
if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
|
|
@@ -3363,7 +3480,7 @@ def _show_availability(show_spot: bool = False) -> None:
|
|
|
3363
3480
|
avail_signal = f"[green]{pct}% off on-demand[/green]" if pct > 0 else "[dim]At on-demand price[/dim]"
|
|
3364
3481
|
except (ValueError, TypeError):
|
|
3365
3482
|
avail_signal = "[yellow]Unknown[/yellow]"
|
|
3366
|
-
spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
|
|
3483
|
+
spot_table.add_row(f"{gt.upper()} *", avail_display, in_use_display, str(per_node), status, avail_signal)
|
|
3367
3484
|
console.print(spot_table)
|
|
3368
3485
|
rprint("[dim]* = spot: ~70% cheaper, AWS can reclaim with 2-min notice, fulfillment not guaranteed.[/dim]")
|
|
3369
3486
|
rprint("[dim] Separate cluster (us-east-1) with separate disks. Select via gpu-dev reserve (interactive).[/dim]")
|
|
@@ -3737,7 +3854,8 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3737
3854
|
for node in nodes:
|
|
3738
3855
|
status_display = "✅ Active" if node.get("status") == "active" else f"⏳ {node.get('status', 'unknown')}"
|
|
3739
3856
|
pod_name = node.get("pod_name", "unknown")
|
|
3740
|
-
|
|
3857
|
+
node_rid = node.get("reservation_id")
|
|
3858
|
+
ssh_cmd_short = f"ssh gpu-dev-{node_rid[:8]}" if node_rid else "N/A"
|
|
3741
3859
|
|
|
3742
3860
|
table.add_row(
|
|
3743
3861
|
f"Node {node.get('node_index', 0) + 1}",
|
|
@@ -3994,10 +4112,11 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
|
|
|
3994
4112
|
)
|
|
3995
4113
|
|
|
3996
4114
|
if config_path:
|
|
4115
|
+
node_alias = f"gpu-dev-{node_res_id[:8]}"
|
|
3997
4116
|
if use_include:
|
|
3998
|
-
rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh {
|
|
4117
|
+
rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh {node_alias}[/cyan]")
|
|
3999
4118
|
else:
|
|
4000
|
-
rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh -F {config_path} {
|
|
4119
|
+
rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh -F {config_path} {node_alias}[/cyan]")
|
|
4001
4120
|
else:
|
|
4002
4121
|
rprint(f"[yellow]⚠️ Node {node_idx + 1}: Failed to create SSH config[/yellow]")
|
|
4003
4122
|
|
|
@@ -4025,12 +4144,13 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
|
|
|
4025
4144
|
)
|
|
4026
4145
|
|
|
4027
4146
|
if config_path:
|
|
4147
|
+
host_alias = f"gpu-dev-{reservation_id[:8]}"
|
|
4028
4148
|
rprint(f"[green]✅ SSH config created:[/green] [cyan]{config_path}[/cyan]\n")
|
|
4029
4149
|
if use_include:
|
|
4030
|
-
rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh {
|
|
4150
|
+
rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh {host_alias}[/cyan]")
|
|
4031
4151
|
rprint(f"[dim] or:[/dim] [cyan]gpu-dev connect {reservation_id[:8]}[/cyan]")
|
|
4032
4152
|
else:
|
|
4033
|
-
rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh -F {config_path} {
|
|
4153
|
+
rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh -F {config_path} {host_alias}[/cyan]")
|
|
4034
4154
|
rprint(f"[dim] or:[/dim] [cyan]gpu-dev connect {reservation_id[:8]}[/cyan]")
|
|
4035
4155
|
else:
|
|
4036
4156
|
rprint("[red]❌ Failed to create SSH config[/red]")
|
|
@@ -4597,13 +4717,13 @@ def ssh_include(action: str):
|
|
|
4597
4717
|
|
|
4598
4718
|
\b
|
|
4599
4719
|
When enabled:
|
|
4600
|
-
• Simple SSH commands: ssh
|
|
4601
|
-
• VS Code Remote works: code --remote ssh-remote
|
|
4720
|
+
• Simple SSH commands: ssh gpu-dev-<reservation-id>
|
|
4721
|
+
• VS Code Remote works: code --remote ssh-remote+gpu-dev-<reservation-id>
|
|
4602
4722
|
• Cursor Remote works: Open Remote SSH in Cursor
|
|
4603
4723
|
|
|
4604
4724
|
\b
|
|
4605
4725
|
When disabled:
|
|
4606
|
-
• Need -F flag: ssh -F ~/.gpu-dev/<id>-sshconfig
|
|
4726
|
+
• Need -F flag: ssh -F ~/.gpu-dev/<id>-sshconfig gpu-dev-<reservation-id>
|
|
4607
4727
|
• VS Code/Cursor requires manual config setup
|
|
4608
4728
|
|
|
4609
4729
|
\b
|
|
@@ -29,6 +29,15 @@ class Config:
|
|
|
29
29
|
"description": "Spot-only us-east-1 environment (T4/L4/CPU)",
|
|
30
30
|
"spot_types": ["b300", "b200", "h200", "h100", "a100", "t4", "l4", "rtxpro6000"],
|
|
31
31
|
},
|
|
32
|
+
# Staging (us-west-1, tf "default" workspace, environment=test). Same
|
|
33
|
+
# standard resource prefix as prod, just a different region — so only the
|
|
34
|
+
# region changes. Live capacity: cpu-x86/arm + t4. Used for integration
|
|
35
|
+
# tests. Select via `GPU_DEV_ENVIRONMENT=staging` (or the "test" env alias).
|
|
36
|
+
"staging": {
|
|
37
|
+
"region": "us-west-1",
|
|
38
|
+
"workspace": "default",
|
|
39
|
+
"description": "Staging (us-west-1, cpu + t4)",
|
|
40
|
+
},
|
|
32
41
|
}
|
|
33
42
|
DEFAULT_ENVIRONMENT = "prod"
|
|
34
43
|
|
|
@@ -43,19 +52,33 @@ class Config:
|
|
|
43
52
|
# Load unified config (handles migration from legacy files)
|
|
44
53
|
self.user_config = self._load_config()
|
|
45
54
|
|
|
46
|
-
#
|
|
55
|
+
# Active environment: GPU_DEV_ENVIRONMENT env wins (handy for tests/CI),
|
|
56
|
+
# then the persisted config, then the default. Its region/prefix back the
|
|
57
|
+
# fallbacks below so e.g. `GPU_DEV_ENVIRONMENT=staging` reaches us-west-1.
|
|
58
|
+
env_override = os.getenv("GPU_DEV_ENVIRONMENT")
|
|
59
|
+
env_name = env_override or self.user_config.get(
|
|
60
|
+
"environment", self.DEFAULT_ENVIRONMENT)
|
|
61
|
+
env_cfg = self.ENVIRONMENTS.get(env_name, {})
|
|
62
|
+
|
|
63
|
+
# Get region: AWS_* env vars take priority (for spot routing); then an
|
|
64
|
+
# explicit GPU_DEV_ENVIRONMENT switch uses that env's region (beating the
|
|
65
|
+
# persisted one); then the persisted config; then the env's region; default.
|
|
47
66
|
env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
|
|
48
67
|
if env_region and env_region != self.user_config.get("region"):
|
|
49
68
|
self.aws_region = env_region
|
|
69
|
+
elif env_override and env_cfg.get("region"):
|
|
70
|
+
self.aws_region = env_cfg["region"]
|
|
50
71
|
elif self.user_config.get("region"):
|
|
51
72
|
self.aws_region = self.user_config["region"]
|
|
73
|
+
elif env_cfg.get("region"):
|
|
74
|
+
self.aws_region = env_cfg["region"]
|
|
52
75
|
else:
|
|
53
76
|
self.aws_region = "us-east-2"
|
|
54
77
|
|
|
55
78
|
os.environ["AWS_DEFAULT_REGION"] = self.aws_region
|
|
56
79
|
|
|
57
|
-
# Resource naming convention -
|
|
58
|
-
self.prefix = "pytorch-gpu-dev"
|
|
80
|
+
# Resource naming convention — per-environment prefix (default for prod).
|
|
81
|
+
self.prefix = env_cfg.get("prefix", "pytorch-gpu-dev")
|
|
59
82
|
|
|
60
83
|
# Construct ARNs from convention
|
|
61
84
|
self.queue_name = f"{self.prefix}-reservation-queue"
|