gpu-dev 0.7.12__tar.gz → 0.7.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/PKG-INFO +1 -1
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +227 -9
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +26 -2
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/gpu_dev.egg-info/SOURCES.txt +3 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/pyproject.toml +1 -1
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/Dockerfile +24 -9
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +148 -11
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_processor/index.py +109 -4
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda.tf +37 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/main.tf +2 -4
- gpu_dev-0.7.13/tests/unit/cli/test_debug.py +155 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_repro.py +75 -1
- gpu_dev-0.7.13/tests/unit/lambda_fn/test_dead_pod_cleanup.py +177 -0
- gpu_dev-0.7.13/tests/unit/lambda_fn/test_get_logs.py +59 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_mig_gpu_config.py +8 -7
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_pod_resources.py +17 -8
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/.github/workflows/tests.yml +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/.gitignore +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/CLAUDE.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/README.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/admin/README.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/admin/generate_stats.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/admin/requirements.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/conftest.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/docs/FAST_REPRO_DESIGN.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/docs/GPU_DEV_SUBMIT.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/docs/SDK_REPRO.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/post-may-2026.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/CLAUDE.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/architecture.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/cli-demo.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/devgpu-features.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/feedback.png +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/gpu-fleet.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/index.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/k8s-under-the-hood.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/multinode.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/osdc-future-plans.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/problem.png +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/pyproject.toml +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/sandbox.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/sdk-demo.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/teaser.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/thesis.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/title-vid.mp4 +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/weneedgpus.png +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/presentation/wow.html +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/README.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/examples/batch_multi_gpu.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/examples/interactive_debug.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/examples/parallel_experiments.ipynb +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/examples/quickstart.ipynb +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/examples/run_tests.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/examples/submit_job.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_backend/aws.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/config.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/enums.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/errors.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/models.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/py.typed +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/sdk/python/tests/test_models.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/setup.cfg +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-deck/backend.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-deck/main.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-deck/terraform.tfvars.example +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/build-node.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/pytorch-ondemand.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/pytorch-prebuild.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/warm-pool.tf +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/integration/README.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/integration/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/integration/conftest.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/integration/test_claude.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/integration/test_cpu_lifecycle.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/integration/test_repro_known_failure.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/integration/test_t4_lifecycle.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/integration/test_warm_pool.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/submit/README.md +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/submit/success/run.sh +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_auth.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_avail.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_cancel.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_config_cmd.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_config_module.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_connect.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_disks.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_edit.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_interactive.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_list_show.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_name_generator.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_reservations_mgr.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_reserve.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_smoke.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_ssh_alias.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/cli/test_submit.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_availability.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_cancellation.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_claim.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_finalize_no_ssh.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_ref_staging.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_smoke.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_version_gate.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_warm_pool.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/sdk/__init__.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/sdk/test_backend_aws.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/sdk/test_client.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/sdk/test_errors_enums.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/sdk/test_models_extra.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/sdk/test_sandbox.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/sdk/test_sdk_config.py +0 -0
- {gpu_dev-0.7.12 → gpu_dev-0.7.13}/tests/unit/sdk/test_transport_ssh.py +0 -0
|
@@ -1521,19 +1521,23 @@ def reserve(
|
|
|
1521
1521
|
|
|
1522
1522
|
|
|
1523
1523
|
@main.command(context_settings={"ignore_unknown_options": True})
|
|
1524
|
-
@click.argument("ref")
|
|
1525
|
-
@click.argument("test_args", nargs=-1, required=
|
|
1526
|
-
@click.option("--
|
|
1524
|
+
@click.argument("ref", required=False)
|
|
1525
|
+
@click.argument("test_args", nargs=-1, required=False)
|
|
1526
|
+
@click.option("--lint", is_flag=True, default=False,
|
|
1527
|
+
help="Run a PyTorch lint job (lintrunner) on a CPU box instead of a python test. "
|
|
1528
|
+
"Defaults to --gpu-type cpu-x86 and skips the torch build. Extra args go to "
|
|
1529
|
+
"lintrunner (default: --merge-base-with origin/main, i.e. the PR diff like CI).")
|
|
1530
|
+
@click.option("--gpu-type", default=None, help="GPU type for the repro box (default: b200; cpu-x86 with --lint).")
|
|
1527
1531
|
@click.option("--gpus", type=int, default=1, show_default=True)
|
|
1528
1532
|
@click.option("--hours", type=float, default=3.0, show_default=True,
|
|
1529
1533
|
help="Lifetime ceiling for the box.")
|
|
1530
1534
|
@click.option("--no-connect", is_flag=True, default=False,
|
|
1531
|
-
help="CI mode: run the test, auto-cancel, exit code =
|
|
1535
|
+
help="CI mode: run the test/lint, auto-cancel, exit code = result. Default (on a TTY) drops you into the box to iterate.")
|
|
1532
1536
|
@click.option("--keep", is_flag=True, default=False,
|
|
1533
1537
|
help="Never cancel the box (skip the cancel prompt / auto-cancel).")
|
|
1534
1538
|
@click.pass_context
|
|
1535
|
-
def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
1536
|
-
"""Reserve a
|
|
1539
|
+
def repro(ctx, ref, test_args, lint, gpu_type, gpus, hours, no_connect, keep):
|
|
1540
|
+
"""Reserve a box, check out a PR/commit, run a test (or lint), then drop you in.
|
|
1537
1541
|
|
|
1538
1542
|
By default (in a terminal) repro runs the test and then **connects you into the
|
|
1539
1543
|
box** at ~/pytorch — the ref is checked out, so you can fix and re-run. The box
|
|
@@ -1546,10 +1550,30 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
|
1546
1550
|
TEST_ARGS are passed straight to `python` inside ~/pytorch, e.g.
|
|
1547
1551
|
|
|
1548
1552
|
gpu-dev repro pr/185264 test/inductor/test_flex_attention.py TestFlexAttentionCUDA.test_large_kv_int64_pointer_math_cuda
|
|
1553
|
+
|
|
1554
|
+
--lint runs lintrunner on a CPU box instead (no GPU, no torch build), e.g.
|
|
1555
|
+
|
|
1556
|
+
gpu-dev repro --lint # lint main (all files)
|
|
1557
|
+
gpu-dev repro --lint pr/185264 # lint the PR diff (CI-equivalent)
|
|
1558
|
+
gpu-dev repro --lint pr/185264 --all-files # lint everything
|
|
1559
|
+
|
|
1560
|
+
The box stays up after the run: on a TTY you're dropped in and prompted to
|
|
1561
|
+
cancel on exit (use --keep to leave it running; --no-connect auto-cancels).
|
|
1549
1562
|
"""
|
|
1550
1563
|
import shlex
|
|
1551
1564
|
import subprocess
|
|
1552
1565
|
import sys
|
|
1566
|
+
if not ref:
|
|
1567
|
+
if not lint:
|
|
1568
|
+
rprint("[red]❌ Provide a REF (pr/N, branch, or commit) — or use --lint to lint main.[/red]")
|
|
1569
|
+
sys.exit(2)
|
|
1570
|
+
ref = "main" # bare `repro --lint` lints current main
|
|
1571
|
+
if not lint and not test_args:
|
|
1572
|
+
rprint("[red]❌ Provide a test, e.g. gpu-dev repro pr/123 test/foo.py — or pass --lint for a lint job.[/red]")
|
|
1573
|
+
sys.exit(2)
|
|
1574
|
+
gpu_type = (gpu_type or ("cpu-x86" if lint else "b200")).lower()
|
|
1575
|
+
if gpu_type.startswith("cpu"):
|
|
1576
|
+
gpus = 0 # CPU reservations must have gpu_count=0
|
|
1553
1577
|
config = load_config()
|
|
1554
1578
|
reservation_mgr = ReservationManager(config)
|
|
1555
1579
|
try:
|
|
@@ -1637,9 +1661,37 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
|
1637
1661
|
f"PYTHONPATH=/home/dev/pytorch python {testcmd}"
|
|
1638
1662
|
)
|
|
1639
1663
|
|
|
1664
|
+
runlabel, rerun_hint = "test", f"python {testcmd}"
|
|
1665
|
+
if lint:
|
|
1666
|
+
# Lint needs the source tree at the ref but NO torch build. Most pods already
|
|
1667
|
+
# have /home/dev/pytorch; CPU pods may not, so clone (partial) as a fallback.
|
|
1668
|
+
# origin/main is fetched so --merge-base-with works (the PR-diff scope CI lints).
|
|
1669
|
+
# PR ref -> lint the diff (CI-equivalent); main/branch/sha -> lint everything
|
|
1670
|
+
# (merge-base-with origin/main would be empty when you ARE main).
|
|
1671
|
+
lint_default = "--merge-base-with origin/main" if prnum else "--all-files"
|
|
1672
|
+
lintargs = " ".join(shlex.quote(a) for a in test_args) or lint_default
|
|
1673
|
+
runlabel, rerun_hint = "lint", f"lintrunner {lintargs}"
|
|
1674
|
+
remote = (
|
|
1675
|
+
"set -e; "
|
|
1676
|
+
"git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true; "
|
|
1677
|
+
"if [ ! -d /home/dev/pytorch/.git ]; then echo '[lint] no pytorch tree on this pod — cloning (partial)…'; "
|
|
1678
|
+
"rm -rf /home/dev/pytorch; git clone --filter=blob:none https://github.com/pytorch/pytorch.git /home/dev/pytorch; fi; "
|
|
1679
|
+
"cd /home/dev/pytorch; "
|
|
1680
|
+
+ resolve +
|
|
1681
|
+
"echo \"[lint] target ${WANT:-?}\"; "
|
|
1682
|
+
"git fetch origin main 2>/dev/null || true; "
|
|
1683
|
+
"echo \"[lint] checking out $FREF\"; " + checkout + "; "
|
|
1684
|
+
"echo \"[lint] HEAD $(git rev-parse --short HEAD)\"; "
|
|
1685
|
+
"command -v lintrunner >/dev/null 2>&1 || pip install --break-system-packages -q lintrunner; "
|
|
1686
|
+
"echo '[lint] lintrunner init (downloading linters)…'; lintrunner init; "
|
|
1687
|
+
f"echo '[lint] running: lintrunner {lintargs}'; "
|
|
1688
|
+
f"lintrunner {lintargs}"
|
|
1689
|
+
)
|
|
1690
|
+
|
|
1640
1691
|
# Reserve — warm claim (instant) first, else cold ephemeral. Always no-persist
|
|
1641
1692
|
# (so the prebuilt tree is staged; a default disk would skip staging).
|
|
1642
|
-
|
|
1693
|
+
desc = f"{gpus}x {gpu_type}" if gpus else gpu_type
|
|
1694
|
+
rprint(f"[cyan]🔬 repro: reserving {desc} (warm if available)…[/cyan]")
|
|
1643
1695
|
rid = ssh_cmd = None
|
|
1644
1696
|
try:
|
|
1645
1697
|
res = reservation_mgr.claim_direct(
|
|
@@ -1675,14 +1727,14 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
|
1675
1727
|
except KeyboardInterrupt:
|
|
1676
1728
|
rprint("\n[yellow]interrupted[/yellow]"); rc = 130
|
|
1677
1729
|
|
|
1678
|
-
verdict = "[green]✓
|
|
1730
|
+
verdict = f"[green]✓ {runlabel} passed[/green]" if rc == 0 else f"[red]✗ {runlabel} failed (exit {rc})[/red]"
|
|
1679
1731
|
|
|
1680
1732
|
# Default (TTY): drop into the box so you can fix and re-run. --no-connect is the
|
|
1681
1733
|
# CI path: auto-cancel and exit with the test's code.
|
|
1682
1734
|
connect = (not no_connect) and sys.stdout.isatty()
|
|
1683
1735
|
if connect:
|
|
1684
1736
|
rprint(f"\n{verdict} — dropping you into the box at ~/pytorch ({ref} checked out).")
|
|
1685
|
-
rprint(f"[dim] re-run:
|
|
1737
|
+
rprint(f"[dim] re-run: {rerun_hint}[/dim]")
|
|
1686
1738
|
rprint(f"[dim] finish: gpu-dev cancel (from inside) • or exit this shell[/dim]\n")
|
|
1687
1739
|
shell_cmd = f"{ssh_cmd} -t {shlex.quote('cd /home/dev/pytorch 2>/dev/null; exec ${SHELL:-bash} -l')}"
|
|
1688
1740
|
try:
|
|
@@ -3232,6 +3284,172 @@ def show(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3232
3284
|
rprint(f"[red]❌ Error: {str(e)}[/red]")
|
|
3233
3285
|
|
|
3234
3286
|
|
|
3287
|
+
def _print_recovery_hints(connection_info: dict) -> None:
|
|
3288
|
+
"""Tell the user how to unblock/recover their own reservation based on status."""
|
|
3289
|
+
status = (connection_info.get("status") or "").lower()
|
|
3290
|
+
disk_name = connection_info.get("disk_name") or ""
|
|
3291
|
+
res_id = connection_info.get("reservation_id", "") or ""
|
|
3292
|
+
short_id = res_id[:8] if res_id else "<id>"
|
|
3293
|
+
hints = []
|
|
3294
|
+
if status in ("failed", "expired", "cancelled"):
|
|
3295
|
+
if disk_name:
|
|
3296
|
+
hints.append(
|
|
3297
|
+
f"Your data on disk '{disk_name}' is preserved — re-reserve with: "
|
|
3298
|
+
f"gpu-dev reserve --disk {disk_name}")
|
|
3299
|
+
hints.append(f"If that disk is stuck locked: gpu-dev disk unlock {disk_name}")
|
|
3300
|
+
else:
|
|
3301
|
+
hints.append("Re-reserve a new box with: gpu-dev reserve")
|
|
3302
|
+
elif status == "active":
|
|
3303
|
+
hints.append(
|
|
3304
|
+
f"If status is 'active' but you can't SSH, the pod likely died (e.g. OOM). "
|
|
3305
|
+
f"Free it (and your disk) with: gpu-dev cancel {short_id} — then re-reserve.")
|
|
3306
|
+
if disk_name:
|
|
3307
|
+
hints.append(f"If the disk stays locked after cancel: gpu-dev disk unlock {disk_name}")
|
|
3308
|
+
if hints:
|
|
3309
|
+
rprint("\n[bold]Recovery:[/bold]")
|
|
3310
|
+
for h in hints:
|
|
3311
|
+
rprint(f" • {h}")
|
|
3312
|
+
|
|
3313
|
+
|
|
3314
|
+
def _show_diagnostics(connection_info: dict) -> None:
|
|
3315
|
+
"""Render the extra diagnostics `gpu-dev debug` adds on top of the status panel:
|
|
3316
|
+
failure reason, OOM events, the full status-history timeline, captured pod logs,
|
|
3317
|
+
and recovery hints. All sourced from data the lambdas write to DynamoDB, so it
|
|
3318
|
+
needs no cluster/lambda access."""
|
|
3319
|
+
from rich.text import Text
|
|
3320
|
+
|
|
3321
|
+
status = (connection_info.get("status") or "").lower()
|
|
3322
|
+
|
|
3323
|
+
# Failure reason / latest detailed status — shown for ANY status (the normal
|
|
3324
|
+
# `show` only surfaces failure_reason on 'failed'; for an active-but-dead pod
|
|
3325
|
+
# this is exactly what the user needs).
|
|
3326
|
+
failure_reason = (connection_info.get("failure_reason") or "").strip()
|
|
3327
|
+
detailed = (connection_info.get("current_detailed_status") or "").strip()
|
|
3328
|
+
if failure_reason:
|
|
3329
|
+
rprint(f"\n[bold red]Why it ended:[/bold red] {failure_reason}")
|
|
3330
|
+
elif detailed and status != "active":
|
|
3331
|
+
rprint(f"\n[bold]Latest status:[/bold] {detailed}")
|
|
3332
|
+
|
|
3333
|
+
# OOM events
|
|
3334
|
+
oom_count = int(connection_info.get("oom_count", 0) or 0)
|
|
3335
|
+
if oom_count > 0:
|
|
3336
|
+
last = connection_info.get("last_oom_at") or "unknown"
|
|
3337
|
+
cont = connection_info.get("oom_container") or "?"
|
|
3338
|
+
rprint(f"[red]⚠️ OOM:[/red] {oom_count} event(s) — last {last} (container: {cont})")
|
|
3339
|
+
|
|
3340
|
+
# Status-history timeline (the gold for "what happened to my reservation")
|
|
3341
|
+
history = connection_info.get("status_history") or []
|
|
3342
|
+
if history:
|
|
3343
|
+
table = Table(title="Status timeline (most recent last)", show_header=True,
|
|
3344
|
+
header_style="bold", box=None, pad_edge=False)
|
|
3345
|
+
table.add_column("Time", style="dim", no_wrap=True)
|
|
3346
|
+
table.add_column("Event")
|
|
3347
|
+
for entry in history[-40:]:
|
|
3348
|
+
if isinstance(entry, dict):
|
|
3349
|
+
table.add_row(str(entry.get("timestamp", "")), str(entry.get("message", "")))
|
|
3350
|
+
console.print("")
|
|
3351
|
+
console.print(table)
|
|
3352
|
+
else:
|
|
3353
|
+
rprint("\n[dim]No status history recorded for this reservation.[/dim]")
|
|
3354
|
+
|
|
3355
|
+
# Captured pod logs (lambda snapshot — last lines around the failure)
|
|
3356
|
+
pod_logs = (connection_info.get("pod_logs") or "").strip()
|
|
3357
|
+
if pod_logs:
|
|
3358
|
+
console.print(Panel(Text(pod_logs[-4000:]), title="Captured pod logs (snapshot)",
|
|
3359
|
+
border_style="yellow"))
|
|
3360
|
+
|
|
3361
|
+
_print_recovery_hints(connection_info)
|
|
3362
|
+
|
|
3363
|
+
|
|
3364
|
+
def _show_lambda_logs(reservation_mgr, reservation_id: str, user_id: str) -> None:
|
|
3365
|
+
"""Fetch + render the raw lambda (CloudWatch) logs for a reservation."""
|
|
3366
|
+
from rich.text import Text
|
|
3367
|
+
rprint("\n[bold]Fetching lambda logs from CloudWatch…[/bold] [dim](a few seconds)[/dim]")
|
|
3368
|
+
result = reservation_mgr.get_reservation_logs(reservation_id, user_id)
|
|
3369
|
+
if result is None:
|
|
3370
|
+
rprint("[yellow]Could not reach the log backend (it may not be deployed yet, "
|
|
3371
|
+
"or you lack lambda:InvokeFunctionUrl access).[/yellow]")
|
|
3372
|
+
return
|
|
3373
|
+
if result.get("error"):
|
|
3374
|
+
rprint(f"[yellow]Log query: {result['error']}[/yellow]")
|
|
3375
|
+
lines = result.get("lines") or []
|
|
3376
|
+
if not lines:
|
|
3377
|
+
rprint("[dim]No lambda log lines found for this reservation (outside the "
|
|
3378
|
+
"retention window, or none recorded).[/dim]")
|
|
3379
|
+
return
|
|
3380
|
+
body = "\n".join(f"{ln.get('timestamp','')} {ln.get('message','')}".rstrip()
|
|
3381
|
+
for ln in lines)
|
|
3382
|
+
console.print(Panel(Text(body[-16000:]),
|
|
3383
|
+
title=f"Lambda logs · {len(lines)} line(s)", border_style="cyan"))
|
|
3384
|
+
|
|
3385
|
+
|
|
3386
|
+
@main.command()
@click.argument("reservation_id", required=False)
@click.option("--logs", "show_logs", is_flag=True,
              help="Also fetch the raw lambda logs for this reservation from CloudWatch.")
@click.pass_context
def debug(ctx: click.Context, reservation_id: Optional[str], show_logs: bool) -> None:
    """Diagnose your own reservation — why a box died or won't connect.

    Shows the status timeline, failure reason, OOM events, and captured pod logs,
    plus recovery steps — all without needing cluster or lambda access. Add --logs
    to also pull the raw reservation/expiry lambda logs from CloudWatch.

    \b
    Examples:
        gpu-dev debug # pick from your active reservations
        gpu-dev debug abc12345 # a specific reservation (id prefix ok)
        gpu-dev debug abc12345 --logs # + raw lambda logs from CloudWatch

    For a recently failed/expired box, find its id with 'gpu-dev list' then
    'gpu-dev debug <id>'.
    """
    # NOTE: the docstring above is the command's --help text; keep it user-facing.
    try:
        config = load_config()
        user_info = authenticate_user(config)
        reservation_mgr = ReservationManager(config)
        user_id = user_info["user_id"]

        # Resolution order for the target reservation:
        #   1. explicit CLI argument
        #   2. GPU_DEV_RESERVATION_ID from the environment (in-pod fast path)
        #   3. the caller's live reservations (auto-pick if exactly one,
        #      otherwise an interactive picker)
        if reservation_id is None:
            env_reservation = os.environ.get("GPU_DEV_RESERVATION_ID")
            reservation_id = env_reservation if env_reservation else None

        if reservation_id is None:
            candidates = _fetch_reservations_cross_region(
                reservation_mgr, user_id,
                ["active", "preparing", "queued", "pending"], config)
            if not candidates:
                rprint("[yellow]📋 No active reservations.[/yellow] To debug a recent "
                       "failed/expired one, find its id with [bold]gpu-dev list[/bold] "
                       "then run [bold]gpu-dev debug <id>[/bold].")
                return
            if len(candidates) != 1:
                choice = select_reservation_interactive(candidates, "debug")
                # The picker's quit/all sentinels make no sense for a
                # single-reservation diagnosis, so treat them as a cancel.
                if not choice or choice in ("__QUIT__", "__ALL__"):
                    rprint("[yellow]Cancelled.[/yellow]")
                    return
                reservation_id = choice
            else:
                reservation_id = candidates[0].get("reservation_id")

        connection_info = reservation_mgr.get_connection_info(reservation_id, user_id)
        if not connection_info:
            rprint(f"[red]❌ No reservation found matching '{reservation_id}'[/red] "
                   "(try a longer id prefix, or check 'gpu-dev list').")
            return

        _show_single_reservation(connection_info)
        _show_diagnostics(connection_info)
        if show_logs:
            # Use the id from the fetched record rather than the value the
            # user typed (which may be a prefix).
            _show_lambda_logs(reservation_mgr, connection_info["reservation_id"], user_id)

    except RuntimeError as err:
        rprint(f"[red]❌ {err!s}[/red]")
    except Exception as err:
        rprint(f"[red]❌ Error: {err!s}[/red]")
|
|
3451
|
+
|
|
3452
|
+
|
|
3235
3453
|
|
|
3236
3454
|
def _maybe_show_sdk_tip() -> None:
|
|
3237
3455
|
"""For a user's first few reservations, nudge them toward the Python SDK +
|
|
@@ -613,7 +613,7 @@ class ReservationManager:
|
|
|
613
613
|
pass
|
|
614
614
|
return self._direct_url or None
|
|
615
615
|
|
|
616
|
-
def _signed_post(self, url: str, payload: dict) -> Optional[dict]:
|
|
616
|
+
def _signed_post(self, url: str, payload: dict, timeout: int = 20) -> Optional[dict]:
|
|
617
617
|
"""SigV4-signed POST to the Function URL. Returns parsed JSON or None."""
|
|
618
618
|
try:
|
|
619
619
|
creds = self.config.session.get_credentials()
|
|
@@ -623,13 +623,29 @@ class ReservationManager:
|
|
|
623
623
|
aws_req = AWSRequest(method="POST", url=url, data=data,
|
|
624
624
|
headers={"Content-Type": "application/json"})
|
|
625
625
|
SigV4Auth(creds, "lambda", self.config.aws_region).add_auth(aws_req)
|
|
626
|
-
resp = requests.post(url, data=data, headers=dict(aws_req.headers), timeout=
|
|
626
|
+
resp = requests.post(url, data=data, headers=dict(aws_req.headers), timeout=timeout)
|
|
627
627
|
if resp.status_code != 200:
|
|
628
628
|
return None
|
|
629
629
|
return resp.json()
|
|
630
630
|
except Exception:
|
|
631
631
|
return None
|
|
632
632
|
|
|
633
|
+
def get_reservation_logs(self, reservation_id: str, user_id: str) -> Optional[Dict[str, Any]]:
    """Request a reservation's lambda logs through the processor Function URL.

    Sends a signed ``get_logs`` action carrying the reservation id, the user
    id and the client version. Used by `gpu-dev debug --logs`.

    Returns:
        Whatever ``_signed_post`` returns (documented by the caller as
        {"lines": [...]} / {"error": ...}), or None when no Function URL is
        available.
    """
    direct_url = self._get_direct_url()
    if direct_url:
        request_body = {
            "action": "get_logs",
            "reservation_id": reservation_id,
            "user_id": user_id,
            "version": get_version(),
        }
        # CloudWatch Logs Insights queries take longer than a claim — allow ~70s
        # instead of the default request timeout.
        return self._signed_post(direct_url, request_body, timeout=70)
    return None
|
|
648
|
+
|
|
633
649
|
def claim_direct(self, *, user_id: str, gpu_count: int, gpu_type: str,
|
|
634
650
|
duration_hours: Union[int, float], name: Optional[str] = None,
|
|
635
651
|
github_user: Optional[str] = None, ref: Optional[str] = None) -> Optional[Dict[str, Any]]:
|
|
@@ -999,11 +1015,19 @@ class ReservationManager:
|
|
|
999
1015
|
"jupyter_enabled": reservation.get("jupyter_enabled", False),
|
|
1000
1016
|
"jupyter_error": reservation.get("jupyter_error", ""),
|
|
1001
1017
|
"ebs_volume_id": reservation.get("ebs_volume_id", ""),
|
|
1018
|
+
"disk_name": reservation.get("disk_name", ""),
|
|
1002
1019
|
"secondary_users": reservation.get("secondary_users", []),
|
|
1003
1020
|
"warning": reservation.get("warning", ""),
|
|
1004
1021
|
"is_multinode": is_multinode,
|
|
1005
1022
|
"pod_ip": reservation.get("pod_ip", ""),
|
|
1023
|
+
"node_ip": reservation.get("node_ip", ""),
|
|
1024
|
+
"node_name": reservation.get("node_name", ""),
|
|
1006
1025
|
"fqdn": reservation.get("fqdn", ""),
|
|
1026
|
+
# Health/diagnostics (surfaced by `gpu-dev debug`); written by the
|
|
1027
|
+
# reservation + expiry lambdas. Present off the raw item, not always set.
|
|
1028
|
+
"oom_count": int(reservation.get("oom_count", 0) or 0),
|
|
1029
|
+
"last_oom_at": reservation.get("last_oom_at", ""),
|
|
1030
|
+
"oom_container": reservation.get("oom_container", ""),
|
|
1007
1031
|
}
|
|
1008
1032
|
|
|
1009
1033
|
# If multi-node, fetch all nodes in the group
|
|
@@ -198,6 +198,7 @@ tests/unit/cli/test_cancel.py
|
|
|
198
198
|
tests/unit/cli/test_config_cmd.py
|
|
199
199
|
tests/unit/cli/test_config_module.py
|
|
200
200
|
tests/unit/cli/test_connect.py
|
|
201
|
+
tests/unit/cli/test_debug.py
|
|
201
202
|
tests/unit/cli/test_disks.py
|
|
202
203
|
tests/unit/cli/test_edit.py
|
|
203
204
|
tests/unit/cli/test_interactive.py
|
|
@@ -213,7 +214,9 @@ tests/unit/lambda_fn/__init__.py
|
|
|
213
214
|
tests/unit/lambda_fn/test_availability.py
|
|
214
215
|
tests/unit/lambda_fn/test_cancellation.py
|
|
215
216
|
tests/unit/lambda_fn/test_claim.py
|
|
217
|
+
tests/unit/lambda_fn/test_dead_pod_cleanup.py
|
|
216
218
|
tests/unit/lambda_fn/test_finalize_no_ssh.py
|
|
219
|
+
tests/unit/lambda_fn/test_get_logs.py
|
|
217
220
|
tests/unit/lambda_fn/test_mig_gpu_config.py
|
|
218
221
|
tests/unit/lambda_fn/test_pod_resources.py
|
|
219
222
|
tests/unit/lambda_fn/test_ref_staging.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.7.
|
|
7
|
+
version = "0.7.13"
|
|
8
8
|
description = "CLI + Python SDK for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -46,8 +46,12 @@ RUN for attempt in 1 2 3; do \
|
|
|
46
46
|
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
|
|
47
47
|
apt-get install -y nodejs
|
|
48
48
|
|
|
49
|
-
# Install
|
|
49
|
+
# Install additional CUDA toolkits alongside base CUDA 13.2
|
|
50
50
|
# Base image already has NVIDIA repo configured, no need for cuda-keyring
|
|
51
|
+
# NOTE: cuda-toolkit-13-3 is intentionally NOT here. CUDA 13.3 ships a unified
|
|
52
|
+
# `cccl-13-3` package that `Breaks` `cuda-cccl-12-8`/`-12-9`, so 13.3 cannot coexist
|
|
53
|
+
# with the 12.8/12.9 toolkits in one image. To add 13.3 we'd have to drop 12.8/12.9
|
|
54
|
+
# (or hand-curate 13.3 sub-packages that exclude cccl). Kept 12.8-13.2 for now.
|
|
51
55
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
52
56
|
cuda-toolkit-12-8 \
|
|
53
57
|
cuda-toolkit-12-9 \
|
|
@@ -163,21 +167,32 @@ WORKDIR /home/dev
|
|
|
163
167
|
RUN mkdir -p ~/.npm-global && \
|
|
164
168
|
npm config set prefix ~/.npm-global
|
|
165
169
|
|
|
166
|
-
# OpenAI Codex CLI on
|
|
167
|
-
#
|
|
168
|
-
#
|
|
169
|
-
# key
|
|
170
|
-
# the
|
|
171
|
-
#
|
|
172
|
-
#
|
|
170
|
+
# OpenAI Codex CLI on OpenAI gpt-5.x via AWS Bedrock. Installed system-wide (parallels
|
|
171
|
+
# Claude above), then /usr/local/bin/codex is replaced with a thin wrapper that auths via
|
|
172
|
+
# the pod IRSA — it mints a short-lived Bedrock bearer token (AWS_BEARER_TOKEN_BEDROCK), no
|
|
173
|
+
# per-user key. The wrapper uses codex's NATIVE `amazon-bedrock` model provider (the Bedrock
|
|
174
|
+
# Mantle path serves the OpenAI Responses API for supported OpenAI models — per the official
|
|
175
|
+
# Codex/Bedrock docs), so NO custom endpoint/wire_api config is needed. Model via CODEX_MODEL
|
|
176
|
+
# (default openai.gpt-5.4), effort via CODEX_EFFORT (default high). The wrapper forces
|
|
177
|
+
# AWS_REGION=us-east-1.
|
|
178
|
+
#
|
|
179
|
+
# Why gpt-5.4 default (2026-06-16): gpt-5.5 is mid-rollout on Bedrock us-east-1 — it works
|
|
180
|
+
# intermittently but ~30% of calls still 404 "Engine not found" (us-east-2 fails outright).
|
|
181
|
+
# gpt-5.4 is rock-solid in us-east-1. To switch to 5.5 once AWS's rollout stabilizes, change
|
|
182
|
+
# the default above to openai.gpt-5.5 (one line) — region is already us-east-1. Users can opt
|
|
183
|
+
# in early with CODEX_MODEL=openai.gpt-5.5. The wrapper rewrites ~/.codex/config.toml each
|
|
184
|
+
# launch. IAM already in place (pod IRSA: bedrock-mantle:* — native Mantle path does NOT need
|
|
185
|
+
# bedrock:CallWithBearerToken).
|
|
173
186
|
USER root
|
|
187
|
+
# Always install the latest codex (the native amazon-bedrock provider is stable across
|
|
188
|
+
# releases, so no need to pin — each image rebuild tracks latest). Validated on 0.140.0.
|
|
174
189
|
RUN npm install -g --prefix /usr/local @openai/codex || echo "Codex CLI install failed (non-fatal at build time)"
|
|
175
190
|
# Bedrock wrapper, base64-embedded to avoid heredoc/quoting fragility. It execs the real
|
|
176
191
|
# launcher at /usr/local/lib/node_modules/@openai/codex/bin/codex.js. CRITICAL: `npm install`
|
|
177
192
|
# leaves /usr/local/bin/codex as a SYMLINK to that codex.js, so we must `rm -f` it first —
|
|
178
193
|
# writing through the symlink would clobber codex.js itself, making the wrapper exec itself
|
|
179
194
|
# (infinite recursion -> codex hangs on launch).
|
|
180
|
-
RUN rm -f /usr/local/bin/codex && echo '
|
|
195
|
+
RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IG9uIE9wZW5BSSBncHQtNS54IHZpYSBBV1MgQmVkcm9jayB1c2luZyBjb2RleCdzIE5BVElWRSBgYW1hem9uLWJlZHJvY2tgCiMgcHJvdmlkZXIuIFJlZ2lvbiB1cy1lYXN0LTEgKGdwdC01LnggTWFudGxlIHJlZ2lvbikuIEF1dGg6IGEgc2hvcnQtbGl2ZWQgQmVkcm9jawojIGJlYXJlciB0b2tlbiBtaW50ZWQgZnJvbSB0aGUgcG9kIElSU0EgKG5vIHBlci11c2VyIGtleSkuIE1vZGVsIHZpYSBDT0RFWF9NT0RFTAojIChkZWZhdWx0IG9wZW5haS5ncHQtNS40KSwgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgKGhpZ2gpLgojCiMgbW9kZWxfY29udGV4dF93aW5kb3cgaXMgc2V0IGV4cGxpY2l0bHkgYmVjYXVzZSBjb2RleCdzIGNhdGFsb2cgZG9lc24ndCBrbm93IHRoZQojIEJlZHJvY2stcHJlZml4ZWQgaWQgIm9wZW5haS5ncHQtNS54IiBhbmQgb3RoZXJ3aXNlIHdhcm5zICJNb2RlbCBtZXRhZGF0YSBub3QgZm91bmQsCiMgZGVmYXVsdGluZyB0byBmYWxsYmFjayBtZXRhZGF0YSIuIDI3MjAwMCBpcyBncHQtNS41J3MgYnVuZGxlZCBjb250ZXh0IHdpbmRvdy4KIwojIGdwdC01LjUgbm90ZSAoMjAyNi0wNi0xNik6IHByb3Zpc2lvbmVkIGluIHVzLWVhc3QtMSBidXQgbWlkLXJvbGxvdXQg4oCUIH4zMCUgb2YgY2FsbHMKIyBzdGlsbCA0MDQgIkVuZ2luZSBub3QgZm91bmQiLiBEZWZhdWx0IHN0YXlzIGdwdC01LjQgKHNvbGlkKTsgc3dpdGNoIHRoZSBkZWZhdWx0IHRvCiMgb3BlbmFpLmdwdC01LjUgb25jZSBBV1Mgc3RhYmlsaXplcywgb3Igb3B0IGluIG5vdyB3aXRoIENPREVYX01PREVMPW9wZW5haS5ncHQtNS41LgpzZXQgK2UKTU9ERUw9IiR7Q09ERVhfTU9ERUw6LW9wZW5haS5ncHQtNS40fSIKRUZGT1JUPSIke0NPREVYX0VGRk9SVDotaGlnaH0iCmV4cG9ydCBBV1NfUkVHSU9OPXVzLWVhc3QtMSBBV1NfREVGQVVMVF9SRUdJT049dXMtZWFzdC0xCm1rZGlyIC1wICIkSE9NRS8uY29kZXgiCmNhdCA+ICIkSE9NRS8uY29kZXgvY29uZmlnLnRvbWwiIDw8Q0ZHCm1vZGVsX3Byb3ZpZGVyID0gImFtYXpvbi1iZWRyb2NrIgptb2RlbCA9ICIkTU9ERUwiCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKbW9kZWxfY29udGV4dF93aW5kb3cgPSAyNzIwMDAKd2ViX3NlYXJjaCA9ICJkaXNhYmxlZCIKQ0ZHClRPSz0iJCgvdXNyL2Jpbi9weXRob24zIC1jICdmcm9tIGF3c19iZWRyb2NrX3Rva2VuX2dlbmVyYXRvciBpbXBvcnQgcHJvdmlkZV90b2tlbjsgcHJpbnQocHJvdmlkZV90b2tlbihyZWdpb249InVzLWVhc3QtMSIpKScgMj4vZGV2L251bGwpIgpbIC1uICIkVE9LIiBdICYmIGV4cG9ydCBBV1NfQkVBUkVSX1RPS0VOX0JFRFJPQ0s9IiRUT0siCmV4ZWMgL3Vzci9sb2NhbC9saWIvbm9kZV9tb2R1bGVzL0BvcGVuYWkvY29kZXgvYmluL2NvZGV4LmpzICIkQCIK' | base64 -d > /usr/local/bin/codex && chmod 
0755 /usr/local/bin/codex
|
|
181
196
|
|
|
182
197
|
USER dev
|
|
183
198
|
|
{gpu_dev-0.7.12 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
@@ -45,6 +45,9 @@ DISKS_TABLE = os.environ.get("DISKS_TABLE_NAME", "pytorch-gpu-dev-disks")
|
|
|
45
45
|
EKS_CLUSTER_NAME = os.environ["EKS_CLUSTER_NAME"]
|
|
46
46
|
REGION = os.environ["REGION"]
|
|
47
47
|
|
|
48
|
+
# Name of the main dev container in every reservation pod (the one users SSH into).
|
|
49
|
+
MAIN_CONTAINER = "gpu-dev"
|
|
50
|
+
|
|
48
51
|
# Global Kubernetes client (reused across Lambda execution)
|
|
49
52
|
_k8s_client = None
|
|
50
53
|
|
|
@@ -499,18 +502,36 @@ def handler(event, context):
|
|
|
499
502
|
f"Could not parse launched_at for reservation {reservation_id}: {e}"
|
|
500
503
|
)
|
|
501
504
|
|
|
502
|
-
if not skip_pod_check
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
try:
|
|
507
|
-
expire_reservation_due_to_missing_pod(reservation)
|
|
508
|
-
expired_count += 1
|
|
509
|
-
continue # Skip warning processing for this reservation
|
|
510
|
-
except Exception as e:
|
|
511
|
-
logger.error(
|
|
512
|
-
f"Failed to expire reservation {reservation_id} due to missing pod: {e}"
|
|
505
|
+
if not skip_pod_check:
|
|
506
|
+
if not check_pod_exists(pod_name):
|
|
507
|
+
logger.warning(
|
|
508
|
+
f"Pod {pod_name} for active reservation {reservation_id} no longer exists - marking as expired"
|
|
513
509
|
)
|
|
510
|
+
try:
|
|
511
|
+
expire_reservation_due_to_missing_pod(reservation)
|
|
512
|
+
expired_count += 1
|
|
513
|
+
continue # Skip warning processing for this reservation
|
|
514
|
+
except Exception as e:
|
|
515
|
+
logger.error(
|
|
516
|
+
f"Failed to expire reservation {reservation_id} due to missing pod: {e}"
|
|
517
|
+
)
|
|
518
|
+
else:
|
|
519
|
+
# Pod exists but may have died in place (node-pressure
|
|
520
|
+
# eviction / preemption). A container-level OOMKill is
|
|
521
|
+
# recoverable (kubelet restarts it) and returns None here.
|
|
522
|
+
dead_reason = check_pod_dead(pod_name)
|
|
523
|
+
if dead_reason:
|
|
524
|
+
logger.warning(
|
|
525
|
+
f"Pod {pod_name} for active reservation {reservation_id} is dead ({dead_reason}) - finalizing"
|
|
526
|
+
)
|
|
527
|
+
try:
|
|
528
|
+
expire_reservation_due_to_dead_pod(reservation, dead_reason)
|
|
529
|
+
expired_count += 1
|
|
530
|
+
continue # Skip warning processing for this reservation
|
|
531
|
+
except Exception as e:
|
|
532
|
+
logger.error(
|
|
533
|
+
f"Failed to finalize dead-pod reservation {reservation_id}: {e}"
|
|
534
|
+
)
|
|
514
535
|
|
|
515
536
|
minutes_until_expiry = (expires_at - current_time) // 60
|
|
516
537
|
warnings_sent = reservation.get("warnings_sent", {})
|
|
@@ -996,6 +1017,70 @@ def check_pod_oom_status(pod_name: str, namespace: str = "gpu-dev") -> dict:
|
|
|
996
1017
|
return result
|
|
997
1018
|
|
|
998
1019
|
|
|
1020
|
+
def check_pod_dead(pod_name: str, namespace: str = "gpu-dev") -> str | None:
    """Explain why a pod is terminally dead, or return None when it is not.

    A pod is reported as dead in two situations:
      - its phase is Failed or Succeeded (e.g. kubelet sets phase=Failed with
        reason=Evicted on node-pressure eviction);
      - it carries a DisruptionTarget condition with status "True" and the main
        container (MAIN_CONTAINER) is not in a running state.

    Container-level crashes/OOMKills are deliberately NOT reported: kubelet
    restarts the container in place, so the box recovers by itself. Pod-level
    failures are different because dev pods are bare pods that nothing
    recreates, so the reservation would otherwise stay 'active' with a dead box.

    Args:
        pod_name: Name of the pod to inspect.
        namespace: Namespace the pod lives in.

    Returns:
        A short human-readable reason string, or None for healthy, Pending,
        merely-restarting, missing (404) or unreadable pods.
    """
    try:
        api_client = get_k8s_client()
        core_api = client.CoreV1Api(api_client)
        pod = core_api.read_namespaced_pod(name=pod_name, namespace=namespace)
    except Exception as exc:
        # A 404 is silent: a missing pod is handled by the separate missing-pod
        # path. Every other failure is logged, and we stay conservative (None)
        # rather than finalizing a reservation we could not inspect.
        pod_missing = isinstance(exc, client.exceptions.ApiException) and exc.status == 404
        if not pod_missing:
            logger.warning(f"Error reading pod {pod_name} for dead check: {exc}")
        return None

    pod_status = pod.status
    if not pod_status:
        return None

    def _first_line(text) -> str:
        # First line of the stripped message, capped at 300 characters.
        cleaned = (text or "").strip()
        if not cleaned:
            return ""
        return cleaned.partition("\n")[0][:300]

    # Terminal phase: eviction (Failed/Evicted), node loss, or clean exit.
    phase = pod_status.phase or ""
    if phase in ("Failed", "Succeeded"):
        reason = getattr(pod_status, "reason", None) or phase
        detail = _first_line(getattr(pod_status, "message", None))
        summary = f"Pod {phase.lower()} ({reason})"
        return f"{summary}: {detail}" if detail else summary

    # Look for a DisruptionTarget condition; if several exist, the last one
    # listed is the one that counts.
    disruption = None
    for condition in (pod_status.conditions or []):
        if condition.type == "DisruptionTarget":
            disruption = condition
    if disruption is None or getattr(disruption, "status", "") != "True":
        return None

    # Flagged for disruption but the main container is still running: the pod
    # is not dead (yet), so leave it alone.
    for container in (pod_status.container_statuses or []):
        if container.name == MAIN_CONTAINER and container.state and container.state.running:
            return None

    d_reason = getattr(disruption, "reason", None) or "Disrupted"
    d_detail = _first_line(getattr(disruption, "message", None))
    summary = f"Pod disrupted ({d_reason})"
    return f"{summary}: {d_detail}" if d_detail else summary
|
|
1082
|
+
|
|
1083
|
+
|
|
999
1084
|
def mark_disk_not_in_use(user_id: str, disk_name: str) -> None:
|
|
1000
1085
|
"""
|
|
1001
1086
|
Mark a disk as not in use in the disks table.
|
|
@@ -1365,6 +1450,58 @@ def expire_reservation_due_to_missing_pod(reservation: dict[str, Any]) -> None:
|
|
|
1365
1450
|
)
|
|
1366
1451
|
|
|
1367
1452
|
|
|
1453
|
+
def expire_reservation_due_to_dead_pod(reservation: dict[str, Any], reason: str) -> None:
    """Mark a reservation 'failed' because its pod died in place, then clean up.

    Steps, in order:
      1. Write status='failed', failed_at, reservation_ended and failure_reason
         to the reservations table.
      2. Run cleanup_pod for the reservation's pod (best effort; errors are logged).
      3. Release the persistent-disk lock via mark_disk_not_in_use (best effort),
         looking the disk up by reservation when the record has no disk_name.

    Args:
        reservation: Reservation record; must contain "reservation_id". The keys
            "pod_name", "namespace", "user_id" and "disk_name" are read if present.
        reason: Human-readable cause of death, stored as failure_reason.

    Raises:
        Whatever the table update or the disk lookup raises; only the pod
        cleanup and the lock release are wrapped in try/except.
    """
    reservation_id = reservation["reservation_id"]
    logger.info(f"Finalizing reservation {reservation_id} as failed due to dead pod: {reason}")

    ended_at = datetime.utcnow().isoformat()
    table = dynamodb.Table(RESERVATIONS_TABLE)

    # The status flip goes first so the reservation leaves the active set even
    # if a later cleanup step fails part-way (mirrors process_cancellation_request ordering).
    attribute_values = {
        ":status": "failed",
        ":failed_at": ended_at,
        ":reservation_ended": ended_at,
        ":reason": reason,
    }
    table.update_item(
        Key={"reservation_id": reservation_id},
        UpdateExpression="SET #status = :status, failed_at = :failed_at, reservation_ended = :reservation_ended, failure_reason = :reason",
        # "#status" is aliased through ExpressionAttributeNames.
        ExpressionAttributeNames={"#status": "status"},
        ExpressionAttributeValues=attribute_values,
    )

    # Snapshot + delete the pod. Per the original author's note, cleanup_pod
    # snapshots the EBS volume before deleting, so the user's data survives
    # even though the container is already dead.
    dead_pod = reservation.get("pod_name")
    if dead_pod:
        pod_namespace = reservation.get("namespace", "gpu-dev")
        try:
            cleanup_pod(dead_pod, pod_namespace, reservation_data=reservation)
            logger.info(f"Cleaned up dead pod {dead_pod} for reservation {reservation_id}")
        except Exception as cleanup_error:
            logger.error(f"Pod cleanup failed for dead-pod reservation {reservation_id}: {cleanup_error}")

    # Release the disk lock so the persistent disk can be reused right away.
    owner = reservation.get("user_id")
    disk = reservation.get("disk_name")
    if owner:
        if not disk:
            # The record has no disk name: fall back to a lookup by reservation.
            disk = find_disk_by_reservation(owner, reservation_id)
        if disk:
            try:
                mark_disk_not_in_use(owner, disk)
                logger.info(f"Cleared disk lock for '{disk}' after dead-pod cleanup of {reservation_id[:8]}")
            except Exception as lock_error:
                logger.warning(f"Failed to clear disk lock during dead-pod cleanup: {lock_error}")

    logger.info(f"Successfully finalized dead-pod reservation {reservation_id} as failed")
|
|
1503
|
+
|
|
1504
|
+
|
|
1368
1505
|
def expire_stuck_preparing_reservation(reservation: dict[str, Any]) -> None:
|
|
1369
1506
|
"""Mark stuck preparing reservation as failed when it's been preparing too long"""
|
|
1370
1507
|
try:
|