gpu-dev 0.7.11__tar.gz → 0.7.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/PKG-INFO +1 -1
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +278 -13
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +26 -2
- gpu_dev-0.7.13/docs/GPU_DEV_SUBMIT.md +89 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/gpu_dev.egg-info/SOURCES.txt +5 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/pyproject.toml +1 -1
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/Dockerfile +26 -10
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +148 -11
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_processor/index.py +172 -44
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda.tf +37 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/main.tf +2 -4
- gpu_dev-0.7.13/tests/unit/cli/test_debug.py +155 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_repro.py +75 -1
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_submit.py +47 -1
- gpu_dev-0.7.13/tests/unit/lambda_fn/test_dead_pod_cleanup.py +177 -0
- gpu_dev-0.7.13/tests/unit/lambda_fn/test_finalize_no_ssh.py +24 -0
- gpu_dev-0.7.13/tests/unit/lambda_fn/test_get_logs.py +59 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_mig_gpu_config.py +8 -7
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_pod_resources.py +17 -8
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/.github/workflows/tests.yml +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/.gitignore +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/CLAUDE.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/README.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/admin/README.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/admin/generate_stats.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/admin/requirements.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/conftest.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/docs/FAST_REPRO_DESIGN.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/docs/SDK_REPRO.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/post-may-2026.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/CLAUDE.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/architecture.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/cli-demo.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/devgpu-features.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/feedback.png +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/gpu-fleet.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/index.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/k8s-under-the-hood.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/multinode.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/osdc-future-plans.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/problem.png +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/pyproject.toml +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/sandbox.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/sdk-demo.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/teaser.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/thesis.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/title-vid.mp4 +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/weneedgpus.png +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/presentation/wow.html +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/README.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/examples/batch_multi_gpu.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/examples/interactive_debug.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/examples/parallel_experiments.ipynb +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/examples/quickstart.ipynb +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/examples/run_tests.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/examples/submit_job.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_backend/aws.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/config.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/enums.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/errors.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/common/models.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/src/gpu_dev/py.typed +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/sdk/python/tests/test_models.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/setup.cfg +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-deck/backend.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-deck/main.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-deck/terraform.tfvars.example +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/build-node.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/pytorch-ondemand.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/pytorch-prebuild.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/terraform-gpu-devservers/warm-pool.tf +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/integration/README.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/integration/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/integration/conftest.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/integration/test_claude.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/integration/test_cpu_lifecycle.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/integration/test_repro_known_failure.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/integration/test_t4_lifecycle.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/integration/test_warm_pool.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/submit/README.md +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/submit/success/run.sh +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_auth.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_avail.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_cancel.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_config_cmd.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_config_module.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_connect.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_disks.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_edit.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_interactive.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_list_show.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_name_generator.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_reservations_mgr.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_reserve.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_smoke.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/cli/test_ssh_alias.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_availability.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_cancellation.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_claim.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_ref_staging.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_smoke.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_version_gate.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/lambda_fn/test_warm_pool.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/sdk/__init__.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/sdk/test_backend_aws.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/sdk/test_client.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/sdk/test_errors_enums.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/sdk/test_models_extra.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/sdk/test_sandbox.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/sdk/test_sdk_config.py +0 -0
- {gpu_dev-0.7.11 → gpu_dev-0.7.13}/tests/unit/sdk/test_transport_ssh.py +0 -0
|
@@ -1521,19 +1521,23 @@ def reserve(
|
|
|
1521
1521
|
|
|
1522
1522
|
|
|
1523
1523
|
@main.command(context_settings={"ignore_unknown_options": True})
|
|
1524
|
-
@click.argument("ref")
|
|
1525
|
-
@click.argument("test_args", nargs=-1, required=
|
|
1526
|
-
@click.option("--
|
|
1524
|
+
@click.argument("ref", required=False)
|
|
1525
|
+
@click.argument("test_args", nargs=-1, required=False)
|
|
1526
|
+
@click.option("--lint", is_flag=True, default=False,
|
|
1527
|
+
help="Run a PyTorch lint job (lintrunner) on a CPU box instead of a python test. "
|
|
1528
|
+
"Defaults to --gpu-type cpu-x86 and skips the torch build. Extra args go to "
|
|
1529
|
+
"lintrunner (default: --merge-base-with origin/main, i.e. the PR diff like CI).")
|
|
1530
|
+
@click.option("--gpu-type", default=None, help="GPU type for the repro box (default: b200; cpu-x86 with --lint).")
|
|
1527
1531
|
@click.option("--gpus", type=int, default=1, show_default=True)
|
|
1528
1532
|
@click.option("--hours", type=float, default=3.0, show_default=True,
|
|
1529
1533
|
help="Lifetime ceiling for the box.")
|
|
1530
1534
|
@click.option("--no-connect", is_flag=True, default=False,
|
|
1531
|
-
help="CI mode: run the test, auto-cancel, exit code =
|
|
1535
|
+
help="CI mode: run the test/lint, auto-cancel, exit code = result. Default (on a TTY) drops you into the box to iterate.")
|
|
1532
1536
|
@click.option("--keep", is_flag=True, default=False,
|
|
1533
1537
|
help="Never cancel the box (skip the cancel prompt / auto-cancel).")
|
|
1534
1538
|
@click.pass_context
|
|
1535
|
-
def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
1536
|
-
"""Reserve a
|
|
1539
|
+
def repro(ctx, ref, test_args, lint, gpu_type, gpus, hours, no_connect, keep):
|
|
1540
|
+
"""Reserve a box, check out a PR/commit, run a test (or lint), then drop you in.
|
|
1537
1541
|
|
|
1538
1542
|
By default (in a terminal) repro runs the test and then **connects you into the
|
|
1539
1543
|
box** at ~/pytorch — the ref is checked out, so you can fix and re-run. The box
|
|
@@ -1546,10 +1550,30 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
|
1546
1550
|
TEST_ARGS are passed straight to `python` inside ~/pytorch, e.g.
|
|
1547
1551
|
|
|
1548
1552
|
gpu-dev repro pr/185264 test/inductor/test_flex_attention.py TestFlexAttentionCUDA.test_large_kv_int64_pointer_math_cuda
|
|
1553
|
+
|
|
1554
|
+
--lint runs lintrunner on a CPU box instead (no GPU, no torch build), e.g.
|
|
1555
|
+
|
|
1556
|
+
gpu-dev repro --lint # lint main (all files)
|
|
1557
|
+
gpu-dev repro --lint pr/185264 # lint the PR diff (CI-equivalent)
|
|
1558
|
+
gpu-dev repro --lint pr/185264 --all-files # lint everything
|
|
1559
|
+
|
|
1560
|
+
The box stays up after the run: on a TTY you're dropped in and prompted to
|
|
1561
|
+
cancel on exit (use --keep to leave it running; --no-connect auto-cancels).
|
|
1549
1562
|
"""
|
|
1550
1563
|
import shlex
|
|
1551
1564
|
import subprocess
|
|
1552
1565
|
import sys
|
|
1566
|
+
if not ref:
|
|
1567
|
+
if not lint:
|
|
1568
|
+
rprint("[red]❌ Provide a REF (pr/N, branch, or commit) — or use --lint to lint main.[/red]")
|
|
1569
|
+
sys.exit(2)
|
|
1570
|
+
ref = "main" # bare `repro --lint` lints current main
|
|
1571
|
+
if not lint and not test_args:
|
|
1572
|
+
rprint("[red]❌ Provide a test, e.g. gpu-dev repro pr/123 test/foo.py — or pass --lint for a lint job.[/red]")
|
|
1573
|
+
sys.exit(2)
|
|
1574
|
+
gpu_type = (gpu_type or ("cpu-x86" if lint else "b200")).lower()
|
|
1575
|
+
if gpu_type.startswith("cpu"):
|
|
1576
|
+
gpus = 0 # CPU reservations must have gpu_count=0
|
|
1553
1577
|
config = load_config()
|
|
1554
1578
|
reservation_mgr = ReservationManager(config)
|
|
1555
1579
|
try:
|
|
@@ -1637,9 +1661,37 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
|
1637
1661
|
f"PYTHONPATH=/home/dev/pytorch python {testcmd}"
|
|
1638
1662
|
)
|
|
1639
1663
|
|
|
1664
|
+
runlabel, rerun_hint = "test", f"python {testcmd}"
|
|
1665
|
+
if lint:
|
|
1666
|
+
# Lint needs the source tree at the ref but NO torch build. Most pods already
|
|
1667
|
+
# have /home/dev/pytorch; CPU pods may not, so clone (partial) as a fallback.
|
|
1668
|
+
# origin/main is fetched so --merge-base-with works (the PR-diff scope CI lints).
|
|
1669
|
+
# PR ref -> lint the diff (CI-equivalent); main/branch/sha -> lint everything
|
|
1670
|
+
# (merge-base-with origin/main would be empty when you ARE main).
|
|
1671
|
+
lint_default = "--merge-base-with origin/main" if prnum else "--all-files"
|
|
1672
|
+
lintargs = " ".join(shlex.quote(a) for a in test_args) or lint_default
|
|
1673
|
+
runlabel, rerun_hint = "lint", f"lintrunner {lintargs}"
|
|
1674
|
+
remote = (
|
|
1675
|
+
"set -e; "
|
|
1676
|
+
"git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true; "
|
|
1677
|
+
"if [ ! -d /home/dev/pytorch/.git ]; then echo '[lint] no pytorch tree on this pod — cloning (partial)…'; "
|
|
1678
|
+
"rm -rf /home/dev/pytorch; git clone --filter=blob:none https://github.com/pytorch/pytorch.git /home/dev/pytorch; fi; "
|
|
1679
|
+
"cd /home/dev/pytorch; "
|
|
1680
|
+
+ resolve +
|
|
1681
|
+
"echo \"[lint] target ${WANT:-?}\"; "
|
|
1682
|
+
"git fetch origin main 2>/dev/null || true; "
|
|
1683
|
+
"echo \"[lint] checking out $FREF\"; " + checkout + "; "
|
|
1684
|
+
"echo \"[lint] HEAD $(git rev-parse --short HEAD)\"; "
|
|
1685
|
+
"command -v lintrunner >/dev/null 2>&1 || pip install --break-system-packages -q lintrunner; "
|
|
1686
|
+
"echo '[lint] lintrunner init (downloading linters)…'; lintrunner init; "
|
|
1687
|
+
f"echo '[lint] running: lintrunner {lintargs}'; "
|
|
1688
|
+
f"lintrunner {lintargs}"
|
|
1689
|
+
)
|
|
1690
|
+
|
|
1640
1691
|
# Reserve — warm claim (instant) first, else cold ephemeral. Always no-persist
|
|
1641
1692
|
# (so the prebuilt tree is staged; a default disk would skip staging).
|
|
1642
|
-
|
|
1693
|
+
desc = f"{gpus}x {gpu_type}" if gpus else gpu_type
|
|
1694
|
+
rprint(f"[cyan]🔬 repro: reserving {desc} (warm if available)…[/cyan]")
|
|
1643
1695
|
rid = ssh_cmd = None
|
|
1644
1696
|
try:
|
|
1645
1697
|
res = reservation_mgr.claim_direct(
|
|
@@ -1675,14 +1727,14 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
|
1675
1727
|
except KeyboardInterrupt:
|
|
1676
1728
|
rprint("\n[yellow]interrupted[/yellow]"); rc = 130
|
|
1677
1729
|
|
|
1678
|
-
verdict = "[green]✓
|
|
1730
|
+
verdict = f"[green]✓ {runlabel} passed[/green]" if rc == 0 else f"[red]✗ {runlabel} failed (exit {rc})[/red]"
|
|
1679
1731
|
|
|
1680
1732
|
# Default (TTY): drop into the box so you can fix and re-run. --no-connect is the
|
|
1681
1733
|
# CI path: auto-cancel and exit with the test's code.
|
|
1682
1734
|
connect = (not no_connect) and sys.stdout.isatty()
|
|
1683
1735
|
if connect:
|
|
1684
1736
|
rprint(f"\n{verdict} — dropping you into the box at ~/pytorch ({ref} checked out).")
|
|
1685
|
-
rprint(f"[dim] re-run:
|
|
1737
|
+
rprint(f"[dim] re-run: {rerun_hint}[/dim]")
|
|
1686
1738
|
rprint(f"[dim] finish: gpu-dev cancel (from inside) • or exit this shell[/dim]\n")
|
|
1687
1739
|
shell_cmd = f"{ssh_cmd} -t {shlex.quote('cd /home/dev/pytorch 2>/dev/null; exec ${SHELL:-bash} -l')}"
|
|
1688
1740
|
try:
|
|
@@ -1724,6 +1776,47 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
|
|
|
1724
1776
|
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"]
|
|
1725
1777
|
|
|
1726
1778
|
|
|
1779
|
+
def _build_submit_remote_script(workdir: str, remote_cmd: str, ref: Optional[str],
|
|
1780
|
+
no_build: bool) -> str:
|
|
1781
|
+
"""Build the remote shell script `submit` runs over SSH (under `bash -lc`).
|
|
1782
|
+
|
|
1783
|
+
Without --ref this is just `cd <workdir> && <cmd>`. With --ref the pytorch
|
|
1784
|
+
tree is staged in the *background* in-pod (stage-pytorch &), and the tree is
|
|
1785
|
+
only chowned to dev + the ref fully checked out at the very end. Running the
|
|
1786
|
+
user command before that finishes is the footgun Driss hit: a root-owned tree
|
|
1787
|
+
(git "dubious ownership") and a source/installed-torch mismatch (the ref is
|
|
1788
|
+
checked out but the prebuilt .so is the stale base build -> `import torch`
|
|
1789
|
+
fails). So with --ref we prepend a preamble that:
|
|
1790
|
+
1. waits for staging to finish (`.pytorch-staging` marker removed at end),
|
|
1791
|
+
2. marks /home/dev/pytorch a git safe.directory for the dev user,
|
|
1792
|
+
3. unless --no-build, rebuilds incrementally so installed torch == the
|
|
1793
|
+
checked-out source (warm build/ -> ~tens of seconds; a rebuild failure
|
|
1794
|
+
exits 90 before the user command runs).
|
|
1795
|
+
The rebuild/safe.directory only touch pytorch when staging actually ran
|
|
1796
|
+
(`.pytorch-ready` present), so --disk reservations (ref ignored, no staging)
|
|
1797
|
+
are unaffected.
|
|
1798
|
+
"""
|
|
1799
|
+
import shlex
|
|
1800
|
+
cd_run = f"cd {shlex.quote(workdir)} && {remote_cmd}"
|
|
1801
|
+
if not ref:
|
|
1802
|
+
return cd_run
|
|
1803
|
+
lines = [
|
|
1804
|
+
'if [ -e /home/dev/.pytorch-staging ]; then',
|
|
1805
|
+
' echo "[gpu-dev] waiting for background pytorch --ref staging to finish…"',
|
|
1806
|
+
' for _i in $(seq 1 3600); do [ -e /home/dev/.pytorch-staging ] || break; sleep 1; done',
|
|
1807
|
+
'fi',
|
|
1808
|
+
'if [ -f /home/dev/.pytorch-ready ]; then',
|
|
1809
|
+
' git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true',
|
|
1810
|
+
]
|
|
1811
|
+
if not no_build:
|
|
1812
|
+
lines += [
|
|
1813
|
+
' echo "[gpu-dev] rebuilding torch to match --ref (pip install -e . --no-build-isolation)…"',
|
|
1814
|
+
' ( cd /home/dev/pytorch && pip install -e . --no-build-isolation ) || { echo "[gpu-dev] torch rebuild failed"; exit 90; }',
|
|
1815
|
+
]
|
|
1816
|
+
lines += ['fi', cd_run]
|
|
1817
|
+
return "\n".join(lines)
|
|
1818
|
+
|
|
1819
|
+
|
|
1727
1820
|
@main.command(context_settings={"ignore_unknown_options": True})
|
|
1728
1821
|
@click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
|
|
1729
1822
|
@click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
|
|
@@ -1743,6 +1836,8 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
|
|
|
1743
1836
|
@click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
|
|
1744
1837
|
help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
|
|
1745
1838
|
@click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
|
|
1839
|
+
@click.option("--no-build", is_flag=True,
|
|
1840
|
+
help="With --ref, skip the incremental torch rebuild before the command (Python-only PRs / quick checks). Default: rebuild so `import torch` reflects the ref.")
|
|
1746
1841
|
@click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
|
|
1747
1842
|
@click.option("--name", type=str, default=None, help="Reservation name.")
|
|
1748
1843
|
@click.option("--timeout", type=int, default=24 * 60, show_default=True,
|
|
@@ -1750,7 +1845,7 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
|
|
|
1750
1845
|
@click.argument("command", nargs=-1, required=True)
|
|
1751
1846
|
@click.pass_context
|
|
1752
1847
|
def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dockerfile, dockerimage, preserve_entrypoint,
|
|
1753
|
-
runtime, no_pull, keep_alive, name, timeout, command):
|
|
1848
|
+
runtime, no_pull, no_build, keep_alive, name, timeout, command):
|
|
1754
1849
|
"""Submit a job: reserve, sync code, run, sync results back, auto-cancel.
|
|
1755
1850
|
|
|
1756
1851
|
\b
|
|
@@ -1961,11 +2056,15 @@ def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dock
|
|
|
1961
2056
|
else:
|
|
1962
2057
|
workdir = "/home/dev"
|
|
1963
2058
|
|
|
1964
|
-
# Run remote command via login shell so MULTINODE_* etc. are loaded
|
|
2059
|
+
# Run remote command via login shell so MULTINODE_* etc. are loaded. With
|
|
2060
|
+
# --ref, the script first waits for background pytorch staging + rebuilds
|
|
2061
|
+
# so `import torch` matches the checked-out ref (see helper docstring).
|
|
1965
2062
|
remote_cmd = " ".join(shlex.quote(c) for c in command)
|
|
1966
2063
|
rprint(f"[cyan]🚀 Running on {ssh_alias}: {remote_cmd}[/cyan]\n")
|
|
1967
|
-
|
|
1968
|
-
|
|
2064
|
+
if ref and not no_build:
|
|
2065
|
+
rprint("[dim] (--ref: will wait for staging + rebuild torch first; pass --no-build to skip)[/dim]")
|
|
2066
|
+
remote_script = _build_submit_remote_script(workdir, remote_cmd, ref, no_build)
|
|
2067
|
+
ssh_run = ssh_base + [ssh_alias, f"bash -lc {shlex.quote(remote_script)}"]
|
|
1969
2068
|
rc = subprocess.call(ssh_run)
|
|
1970
2069
|
rprint(f"\n[dim]Job exited with code {rc}[/dim]")
|
|
1971
2070
|
|
|
@@ -3185,6 +3284,172 @@ def show(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3185
3284
|
rprint(f"[red]❌ Error: {str(e)}[/red]")
|
|
3186
3285
|
|
|
3187
3286
|
|
|
3287
|
+
def _print_recovery_hints(connection_info: dict) -> None:
    """Print status-specific steps the user can take to recover a reservation.

    Reads ``status``, ``disk_name`` and ``reservation_id`` from
    ``connection_info`` and prints a "Recovery:" section:

    * failed / expired / cancelled: how to re-reserve (re-attaching the named
      disk when there is one) and how to unlock a stuck disk.
    * active: how to cancel a reservation whose pod appears dead, plus the
      disk-unlock command when a disk is attached.

    Nothing is printed for any other status.
    """
    # Local import, matching how the sibling helpers pull in rich.text.Text.
    from rich.markup import escape

    status = (connection_info.get("status") or "").lower()
    # rprint() parses its argument as Rich console markup, so values that come
    # from the reservation record are escaped: a disk name such as "[old]"
    # would otherwise be consumed as a style tag instead of being shown.
    disk_name = escape(str(connection_info.get("disk_name") or ""))
    res_id = str(connection_info.get("reservation_id", "") or "")
    short_id = escape(res_id[:8]) if res_id else "<id>"

    hints = []
    if status in ("failed", "expired", "cancelled"):
        if disk_name:
            hints.append(
                f"Your data on disk '{disk_name}' is preserved — re-reserve with: "
                f"gpu-dev reserve --disk {disk_name}")
            hints.append(f"If that disk is stuck locked: gpu-dev disk unlock {disk_name}")
        else:
            hints.append("Re-reserve a new box with: gpu-dev reserve")
    elif status == "active":
        hints.append(
            "If status is 'active' but you can't SSH, the pod likely died (e.g. OOM). "
            f"Free it (and your disk) with: gpu-dev cancel {short_id} — then re-reserve.")
        if disk_name:
            hints.append(f"If the disk stays locked after cancel: gpu-dev disk unlock {disk_name}")

    if hints:
        rprint("\n[bold]Recovery:[/bold]")
        for h in hints:
            rprint(f"  • {h}")
|
|
3312
|
+
|
|
3313
|
+
|
|
3314
|
+
def _show_diagnostics(connection_info: dict) -> None:
    """Render the extra diagnostics `gpu-dev debug` adds on top of the status panel.

    In order: failure reason (or latest detailed status), OOM events, the
    status-history timeline, captured pod logs, and recovery hints. Everything
    is read from ``connection_info``; this function makes no network calls of
    its own.

    Free-form values taken from the reservation record are never interpreted
    as Rich markup: inline values are escaped and table/panel content is
    wrapped in ``Text``, so brackets in a message are displayed literally.
    """
    from rich.markup import escape
    from rich.text import Text

    status = (connection_info.get("status") or "").lower()

    # Failure reason / latest detailed status — shown for ANY status (the normal
    # `show` only surfaces failure_reason on 'failed'; for an active-but-dead pod
    # this is exactly what the user needs).
    failure_reason = (connection_info.get("failure_reason") or "").strip()
    detailed = (connection_info.get("current_detailed_status") or "").strip()
    if failure_reason:
        rprint(f"\n[bold red]Why it ended:[/bold red] {escape(failure_reason)}")
    elif detailed and status != "active":
        rprint(f"\n[bold]Latest status:[/bold] {escape(detailed)}")

    # OOM events
    oom_count = int(connection_info.get("oom_count", 0) or 0)
    if oom_count > 0:
        last = escape(str(connection_info.get("last_oom_at") or "unknown"))
        cont = escape(str(connection_info.get("oom_container") or "?"))
        rprint(f"[red]⚠️  OOM:[/red] {oom_count} event(s) — last {last} (container: {cont})")

    # Status-history timeline (the gold for "what happened to my reservation")
    history = connection_info.get("status_history") or []
    if history:
        table = Table(title="Status timeline (most recent last)", show_header=True,
                      header_style="bold", box=None, pad_edge=False)
        table.add_column("Time", style="dim", no_wrap=True)
        table.add_column("Event")
        # Only the 40 most recent entries are shown; non-dict entries are skipped.
        for entry in history[-40:]:
            if isinstance(entry, dict):
                # Text cells are rendered verbatim; plain str cells would be
                # parsed as markup and could drop bracketed parts of a message.
                table.add_row(Text(str(entry.get("timestamp", ""))),
                              Text(str(entry.get("message", ""))))
        console.print("")
        console.print(table)
    else:
        rprint("\n[dim]No status history recorded for this reservation.[/dim]")

    # Captured pod logs (lambda snapshot — last lines around the failure);
    # only the trailing 4000 characters are displayed.
    pod_logs = (connection_info.get("pod_logs") or "").strip()
    if pod_logs:
        console.print(Panel(Text(pod_logs[-4000:]), title="Captured pod logs (snapshot)",
                            border_style="yellow"))

    _print_recovery_hints(connection_info)
|
|
3362
|
+
|
|
3363
|
+
|
|
3364
|
+
def _show_lambda_logs(reservation_mgr, reservation_id: str, user_id: str) -> None:
    """Fetch + render the raw lambda (CloudWatch) logs for a reservation.

    Calls ``reservation_mgr.get_reservation_logs(reservation_id, user_id)`` and
    handles its three outcomes:

    * ``None``: the backend could not be reached; print a notice and return.
    * a dict with ``"error"``: print the error (and still show any lines).
    * a dict with ``"lines"``: print them in a panel, keeping only the trailing
      16000 characters of the joined text.
    """
    from rich.markup import escape
    from rich.text import Text

    rprint("\n[bold]Fetching lambda logs from CloudWatch…[/bold] [dim](a few seconds)[/dim]")
    result = reservation_mgr.get_reservation_logs(reservation_id, user_id)
    if result is None:
        rprint("[yellow]Could not reach the log backend (it may not be deployed yet, "
               "or you lack lambda:InvokeFunctionUrl access).[/yellow]")
        return
    if result.get("error"):
        # The error text comes from the backend; escape it so brackets in the
        # message are printed rather than parsed as Rich style tags.
        rprint(f"[yellow]Log query: {escape(str(result['error']))}[/yellow]")
    lines = result.get("lines") or []
    if not lines:
        rprint("[dim]No lambda log lines found for this reservation (outside the "
               "retention window, or none recorded).[/dim]")
        return

    rendered = []
    for ln in lines:
        if isinstance(ln, dict):
            rendered.append(f"{ln.get('timestamp', '')} {ln.get('message', '')}".rstrip())
        else:
            # Tolerate entries that are not dicts (e.g. plain strings) instead
            # of failing on .get(); they are shown as-is.
            rendered.append(str(ln).rstrip())
    body = "\n".join(rendered)
    console.print(Panel(Text(body[-16000:]),
                        title=f"Lambda logs · {len(lines)} line(s)", border_style="cyan"))
|
|
3384
|
+
|
|
3385
|
+
|
|
3386
|
+
@main.command()
@click.argument("reservation_id", required=False)
@click.option("--logs", "show_logs", is_flag=True,
              help="Also fetch the raw lambda logs for this reservation from CloudWatch.")
@click.pass_context
def debug(ctx: click.Context, reservation_id: Optional[str], show_logs: bool) -> None:
    """Diagnose your own reservation — why a box died or won't connect.

    Shows the status timeline, failure reason, OOM events, and captured pod logs,
    plus recovery steps — all without needing cluster or lambda access. Add --logs
    to also pull the raw reservation/expiry lambda logs from CloudWatch.

    \b
    Examples:
        gpu-dev debug                      # pick from your active reservations
        gpu-dev debug abc12345             # a specific reservation (id prefix ok)
        gpu-dev debug abc12345 --logs      # + raw lambda logs from CloudWatch

    For a recently failed/expired box, find its id with 'gpu-dev list' then
    'gpu-dev debug <id>'.
    """
    # User-typed ids and exception text are escaped before being embedded in
    # Rich markup: an unbalanced tag such as "[/x]" would otherwise raise
    # MarkupError from inside the error handler itself.
    from rich.markup import escape

    try:
        config = load_config()
        user_info = authenticate_user(config)
        reservation_mgr = ReservationManager(config)

        # In-pod fast path: the pod's own reservation id is on the env.
        if reservation_id is None:
            reservation_id = os.environ.get("GPU_DEV_RESERVATION_ID") or None

        # Still no id: resolve it from the user's live reservations.
        if reservation_id is None:
            reservations = _fetch_reservations_cross_region(
                reservation_mgr, user_info["user_id"],
                ["active", "preparing", "queued", "pending"], config)
            if not reservations:
                rprint("[yellow]📋 No active reservations.[/yellow] To debug a recent "
                       "failed/expired one, find its id with [bold]gpu-dev list[/bold] "
                       "then run [bold]gpu-dev debug <id>[/bold].")
                return
            if len(reservations) == 1:
                reservation_id = reservations[0].get("reservation_id")
            else:
                selected = select_reservation_interactive(reservations, "debug")
                # The picker's quit / "all" sentinels are treated as a cancel.
                if not selected or selected in ("__QUIT__", "__ALL__"):
                    rprint("[yellow]Cancelled.[/yellow]")
                    return
                reservation_id = selected

        connection_info = reservation_mgr.get_connection_info(
            reservation_id, user_info["user_id"])
        if not connection_info:
            rprint(f"[red]❌ No reservation found matching '{escape(str(reservation_id))}'[/red] "
                   "(try a longer id prefix, or check 'gpu-dev list').")
            return

        _show_single_reservation(connection_info)
        _show_diagnostics(connection_info)
        if show_logs:
            # Use the id from the fetched record rather than the (possibly
            # abbreviated) id the user typed.
            _show_lambda_logs(reservation_mgr, connection_info["reservation_id"],
                              user_info["user_id"])

    except RuntimeError as e:
        rprint(f"[red]❌ {escape(str(e))}[/red]")
    except Exception as e:
        rprint(f"[red]❌ Error: {escape(str(e))}[/red]")
|
|
3451
|
+
|
|
3452
|
+
|
|
3188
3453
|
|
|
3189
3454
|
def _maybe_show_sdk_tip() -> None:
|
|
3190
3455
|
"""For a user's first few reservations, nudge them toward the Python SDK +
|
|
@@ -613,7 +613,7 @@ class ReservationManager:
|
|
|
613
613
|
pass
|
|
614
614
|
return self._direct_url or None
|
|
615
615
|
|
|
616
|
-
def _signed_post(self, url: str, payload: dict) -> Optional[dict]:
|
|
616
|
+
def _signed_post(self, url: str, payload: dict, timeout: int = 20) -> Optional[dict]:
|
|
617
617
|
"""SigV4-signed POST to the Function URL. Returns parsed JSON or None."""
|
|
618
618
|
try:
|
|
619
619
|
creds = self.config.session.get_credentials()
|
|
@@ -623,13 +623,29 @@ class ReservationManager:
|
|
|
623
623
|
aws_req = AWSRequest(method="POST", url=url, data=data,
|
|
624
624
|
headers={"Content-Type": "application/json"})
|
|
625
625
|
SigV4Auth(creds, "lambda", self.config.aws_region).add_auth(aws_req)
|
|
626
|
-
resp = requests.post(url, data=data, headers=dict(aws_req.headers), timeout=
|
|
626
|
+
resp = requests.post(url, data=data, headers=dict(aws_req.headers), timeout=timeout)
|
|
627
627
|
if resp.status_code != 200:
|
|
628
628
|
return None
|
|
629
629
|
return resp.json()
|
|
630
630
|
except Exception:
|
|
631
631
|
return None
|
|
632
632
|
|
|
633
|
+
def get_reservation_logs(self, reservation_id: str, user_id: str) -> Optional[Dict[str, Any]]:
    """Ask the processor Function URL for a reservation's lambda logs.

    Sends a signed ``get_logs`` request carrying the reservation id, user id
    and client version, and returns whatever ``_signed_post`` returns. Returns
    None without sending anything when no Function URL is available.
    Used by `gpu-dev debug --logs`.
    """
    function_url = self._get_direct_url()
    if function_url:
        request_body = dict(
            action="get_logs",
            reservation_id=reservation_id,
            user_id=user_id,
            version=get_version(),
        )
        # Log queries are slower than a claim, hence the longer 70s timeout.
        return self._signed_post(function_url, request_body, timeout=70)
    return None
|
|
648
|
+
|
|
633
649
|
def claim_direct(self, *, user_id: str, gpu_count: int, gpu_type: str,
|
|
634
650
|
duration_hours: Union[int, float], name: Optional[str] = None,
|
|
635
651
|
github_user: Optional[str] = None, ref: Optional[str] = None) -> Optional[Dict[str, Any]]:
|
|
@@ -999,11 +1015,19 @@ class ReservationManager:
|
|
|
999
1015
|
"jupyter_enabled": reservation.get("jupyter_enabled", False),
|
|
1000
1016
|
"jupyter_error": reservation.get("jupyter_error", ""),
|
|
1001
1017
|
"ebs_volume_id": reservation.get("ebs_volume_id", ""),
|
|
1018
|
+
"disk_name": reservation.get("disk_name", ""),
|
|
1002
1019
|
"secondary_users": reservation.get("secondary_users", []),
|
|
1003
1020
|
"warning": reservation.get("warning", ""),
|
|
1004
1021
|
"is_multinode": is_multinode,
|
|
1005
1022
|
"pod_ip": reservation.get("pod_ip", ""),
|
|
1023
|
+
"node_ip": reservation.get("node_ip", ""),
|
|
1024
|
+
"node_name": reservation.get("node_name", ""),
|
|
1006
1025
|
"fqdn": reservation.get("fqdn", ""),
|
|
1026
|
+
# Health/diagnostics (surfaced by `gpu-dev debug`); written by the
|
|
1027
|
+
# reservation + expiry lambdas. Present off the raw item, not always set.
|
|
1028
|
+
"oom_count": int(reservation.get("oom_count", 0) or 0),
|
|
1029
|
+
"last_oom_at": reservation.get("last_oom_at", ""),
|
|
1030
|
+
"oom_container": reservation.get("oom_container", ""),
|
|
1007
1031
|
}
|
|
1008
1032
|
|
|
1009
1033
|
# If multi-node, fetch all nodes in the group
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# `gpu-dev submit` — guide & footguns
|
|
2
|
+
|
|
3
|
+
`gpu-dev submit` reserves a box, (optionally) rsyncs a local dir up, runs your
|
|
4
|
+
command over SSH, syncs results back, and auto-cancels. It's the non-interactive
|
|
5
|
+
sibling of `gpu-dev reserve` — good for CI-style validation, one-shot test runs,
|
|
6
|
+
and scripted repros.
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
# run a script in a local dir on 1x H100, sync results back, auto-cancel
|
|
10
|
+
gpu-dev submit --runtime ./ --gpu-type h100 -- bash run.sh
|
|
11
|
+
|
|
12
|
+
# validate a PyTorch PR's tests on H100 (stages + builds the PR for you)
|
|
13
|
+
gpu-dev submit --gpu-type h100 --no-persistent-disk --ref pr/186015 -- \
|
|
14
|
+
python test/test_foo.py -k some_test
|
|
15
|
+
|
|
16
|
+
# keep the box after the job (debug a failure interactively)
|
|
17
|
+
gpu-dev submit --keep-alive --gpu-type h100 -- pytest test/test_x.py
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
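Exit code = your command's exit code (so it composes in scripts/CI). The one exception: with `--ref`, a failed torch rebuild exits **90** before your command runs (see footgun 2).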
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Footguns (read before your first `--ref` run)
|
|
25
|
+
|
|
26
|
+
### 1. `--ref` stages PyTorch in the background — `submit` now waits for it
|
|
27
|
+
With `--ref`, the in-pod startup checks out your ref into `/home/dev/pytorch`
|
|
28
|
+
**in the background** and only chowns the tree to `dev` + finishes the checkout
|
|
29
|
+
at the very end. Historically `submit` could SSH in and run your command before
|
|
30
|
+
that finished, so you'd hit:
|
|
31
|
+
- a **root-owned** `/home/dev/pytorch` (git: *"detected dubious ownership"*), and
|
|
32
|
+
- a **source/installed-torch mismatch** → `import torch` fails (the ref source is
|
|
33
|
+
checked out but the importable `.so` is still the stale prebuilt base).
|
|
34
|
+
|
|
35
|
+
`submit` now **waits for staging to complete**, marks the tree a git
|
|
36
|
+
`safe.directory`, and (by default) **rebuilds incrementally** so the installed
|
|
37
|
+
torch matches the checked-out ref before your command runs. You don't need the
|
|
38
|
+
`sudo chown` / `safe.directory` workaround anymore.
|
|
39
|
+
|
|
40
|
+
### 2. `--ref` rebuilds torch by default — use `--no-build` to skip
|
|
41
|
+
The dropped-in `build/` + `.so` come from the **base** tree, not your ref. To make
|
|
42
|
+
`import torch` reflect your ref's compiled (C++/CUDA) changes, `submit --ref`
|
|
43
|
+
runs `pip install -e . --no-build-isolation` (incremental, warm `build/` →
|
|
44
|
+
typically tens of seconds; a cold/cross-arch build is much longer).
|
|
45
|
+
|
|
46
|
+
- Pass **`--no-build`** for Python-only PRs or quick checks — skips the rebuild
|
|
47
|
+
(import still works; it just won't include compiled changes).
|
|
48
|
+
- A rebuild failure exits **90** *before* your command runs (so a broken build
|
|
49
|
+
doesn't masquerade as a test failure).
|
|
50
|
+
|
|
51
|
+
### 3. Prebuilt fast path is **prod-arch only** (H100 / B200)
|
|
52
|
+
The by-SHA / viable-strict prebuilt trees are compiled for `sm_90;sm_100`
|
|
53
|
+
(H100/B200). On other GPU types (t4, a100, l4, …) or staging there's no matching
|
|
54
|
+
prebuilt, so `--ref` falls back to a **full from-scratch build** — slow. Validate
|
|
55
|
+
ref-based jobs on `--gpu-type h100` (or `b200`).
|
|
56
|
+
|
|
57
|
+
### 4. `--ref` is ignored with `--disk`
|
|
58
|
+
A persistent disk brings its own `/home/dev/pytorch`; `--ref` does **not** stage
|
|
59
|
+
onto a `--disk` reservation (and `submit` won't rebuild it). Use
|
|
60
|
+
`--no-persistent-disk` (or omit `--disk`) when you want a ref staged.
|
|
61
|
+
|
|
62
|
+
### 5. `--preserve-entrypoint` needs SSH
|
|
63
|
+
`submit` runs your command over SSH, so a custom image with
|
|
64
|
+
`--preserve-entrypoint` must still expose the SSH harness or `submit` can't reach
|
|
65
|
+
it. For pure entrypoint containers, use `reserve`, not `submit`.
|
|
66
|
+
|
|
67
|
+
### 6. Results sync-back is best-effort
|
|
68
|
+
With `--runtime`, output is rsync'd back to your local dir when the job exits
|
|
69
|
+
(unless `--no-pull`). If the box dies mid-job (spot reclaim, expiry) the sync-back
|
|
70
|
+
may be partial — you'll see a warning. For long jobs prefer `--keep-alive` and
|
|
71
|
+
pull manually, or write important artifacts to `/shared-personal` (persists
|
|
72
|
+
across reservations).
|
|
73
|
+
|
|
74
|
+
### 7. `--hours` is a ceiling, not the runtime
|
|
75
|
+
It's the reservation lifetime cap; the reservation is auto-cancelled as soon as your command
|
|
76
|
+
exits (unless `--keep-alive`). Set it high enough that queueing + build + run fit.
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## Finding footguns early
|
|
81
|
+
|
|
82
|
+
- `gpu-dev submit --keep-alive … -- true` then `gpu-dev connect <id>` — get a
|
|
83
|
+
box in the exact submit state and poke around before committing a real run.
|
|
84
|
+
- With `--ref`, watch staging directly: `tail -f /home/dev/.pytorch-staging.log`
|
|
85
|
+
in the pod; `.pytorch-ready` (HEAD sha) is written when staging is done.
|
|
86
|
+
- `python -c "import torch; print(torch.__file__, torch.version.git_version)"`
|
|
87
|
+
confirms which torch you're actually importing vs. the ref you asked for.
|
|
88
|
+
|
|
89
|
+
Found a new one? Add it here and ping `oncall:pytorch_release_engineering`.
|
|
@@ -24,6 +24,7 @@ cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py
|
|
|
24
24
|
cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py
|
|
25
25
|
cli-tools/scripts/clear_stale_disk_locks.py
|
|
26
26
|
docs/FAST_REPRO_DESIGN.md
|
|
27
|
+
docs/GPU_DEV_SUBMIT.md
|
|
27
28
|
docs/SDK_REPRO.md
|
|
28
29
|
docs/USER_GUIDE.md
|
|
29
30
|
docs/devgpu-features.html
|
|
@@ -197,6 +198,7 @@ tests/unit/cli/test_cancel.py
|
|
|
197
198
|
tests/unit/cli/test_config_cmd.py
|
|
198
199
|
tests/unit/cli/test_config_module.py
|
|
199
200
|
tests/unit/cli/test_connect.py
|
|
201
|
+
tests/unit/cli/test_debug.py
|
|
200
202
|
tests/unit/cli/test_disks.py
|
|
201
203
|
tests/unit/cli/test_edit.py
|
|
202
204
|
tests/unit/cli/test_interactive.py
|
|
@@ -212,6 +214,9 @@ tests/unit/lambda_fn/__init__.py
|
|
|
212
214
|
tests/unit/lambda_fn/test_availability.py
|
|
213
215
|
tests/unit/lambda_fn/test_cancellation.py
|
|
214
216
|
tests/unit/lambda_fn/test_claim.py
|
|
217
|
+
tests/unit/lambda_fn/test_dead_pod_cleanup.py
|
|
218
|
+
tests/unit/lambda_fn/test_finalize_no_ssh.py
|
|
219
|
+
tests/unit/lambda_fn/test_get_logs.py
|
|
215
220
|
tests/unit/lambda_fn/test_mig_gpu_config.py
|
|
216
221
|
tests/unit/lambda_fn/test_pod_resources.py
|
|
217
222
|
tests/unit/lambda_fn/test_ref_staging.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.7.11"
|
|
7
|
+
version = "0.7.13"
|
|
8
8
|
description = "CLI + Python SDK for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|