gpu-dev 0.7.10__tar.gz → 0.7.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/PKG-INFO +1 -1
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +51 -4
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +13 -1
- gpu_dev-0.7.12/docs/GPU_DEV_SUBMIT.md +89 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/SOURCES.txt +2 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/pyproject.toml +1 -1
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/Dockerfile +3 -2
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_processor/index.py +63 -40
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_submit.py +47 -1
- gpu_dev-0.7.12/tests/unit/lambda_fn/test_finalize_no_ssh.py +24 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/.github/workflows/tests.yml +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/.gitignore +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/CLAUDE.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/README.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/admin/README.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/admin/generate_stats.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/admin/requirements.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/conftest.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/FAST_REPRO_DESIGN.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/SDK_REPRO.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/post-may-2026.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/CLAUDE.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/architecture.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/cli-demo.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/devgpu-features.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/feedback.png +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/gpu-fleet.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/index.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/k8s-under-the-hood.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/multinode.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/osdc-future-plans.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/problem.png +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/pyproject.toml +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/sandbox.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/sdk-demo.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/teaser.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/thesis.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/title-vid.mp4 +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/weneedgpus.png +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/wow.html +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/README.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/batch_multi_gpu.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/interactive_debug.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/parallel_experiments.ipynb +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/quickstart.ipynb +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/run_tests.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/submit_job.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_backend/aws.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/config.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/enums.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/errors.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/models.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/py.typed +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/tests/test_models.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/setup.cfg +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-deck/backend.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-deck/main.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-deck/terraform.tfvars.example +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/build-node.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/pytorch-ondemand.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/pytorch-prebuild.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/warm-pool.tf +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/README.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/conftest.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_claude.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_cpu_lifecycle.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_repro_known_failure.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_t4_lifecycle.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_warm_pool.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/submit/README.md +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/submit/success/run.sh +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_auth.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_avail.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_cancel.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_config_cmd.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_config_module.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_connect.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_disks.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_edit.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_interactive.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_list_show.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_name_generator.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_repro.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_reservations_mgr.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_reserve.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_smoke.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_ssh_alias.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_availability.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_cancellation.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_claim.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_mig_gpu_config.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_pod_resources.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_ref_staging.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_smoke.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_version_gate.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_warm_pool.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/__init__.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_backend_aws.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_client.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_errors_enums.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_models_extra.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_sandbox.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_sdk_config.py +0 -0
- {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_transport_ssh.py +0 -0
|
@@ -1724,6 +1724,47 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
|
|
|
1724
1724
|
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"]
|
|
1725
1725
|
|
|
1726
1726
|
|
|
1727
|
+
def _build_submit_remote_script(workdir: str, remote_cmd: str, ref: Optional[str],
|
|
1728
|
+
no_build: bool) -> str:
|
|
1729
|
+
"""Build the remote shell script `submit` runs over SSH (under `bash -lc`).
|
|
1730
|
+
|
|
1731
|
+
Without --ref this is just `cd <workdir> && <cmd>`. With --ref the pytorch
|
|
1732
|
+
tree is staged in the *background* in-pod (stage-pytorch &), and the tree is
|
|
1733
|
+
only chowned to dev + the ref fully checked out at the very end. Running the
|
|
1734
|
+
user command before that finishes is the footgun Driss hit: a root-owned tree
|
|
1735
|
+
(git "dubious ownership") and a source/installed-torch mismatch (the ref is
|
|
1736
|
+
checked out but the prebuilt .so is the stale base build -> `import torch`
|
|
1737
|
+
fails). So with --ref we prepend a preamble that:
|
|
1738
|
+
1. waits for staging to finish (`.pytorch-staging` marker removed at end),
|
|
1739
|
+
2. marks /home/dev/pytorch a git safe.directory for the dev user,
|
|
1740
|
+
3. unless --no-build, rebuilds incrementally so installed torch == the
|
|
1741
|
+
checked-out source (warm build/ -> ~tens of seconds; a rebuild failure
|
|
1742
|
+
exits 90 before the user command runs).
|
|
1743
|
+
The rebuild/safe.directory only touch pytorch when staging actually ran
|
|
1744
|
+
(`.pytorch-ready` present), so --disk reservations (ref ignored, no staging)
|
|
1745
|
+
are unaffected.
|
|
1746
|
+
"""
|
|
1747
|
+
import shlex
|
|
1748
|
+
cd_run = f"cd {shlex.quote(workdir)} && {remote_cmd}"
|
|
1749
|
+
if not ref:
|
|
1750
|
+
return cd_run
|
|
1751
|
+
lines = [
|
|
1752
|
+
'if [ -e /home/dev/.pytorch-staging ]; then',
|
|
1753
|
+
' echo "[gpu-dev] waiting for background pytorch --ref staging to finish…"',
|
|
1754
|
+
' for _i in $(seq 1 3600); do [ -e /home/dev/.pytorch-staging ] || break; sleep 1; done',
|
|
1755
|
+
'fi',
|
|
1756
|
+
'if [ -f /home/dev/.pytorch-ready ]; then',
|
|
1757
|
+
' git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true',
|
|
1758
|
+
]
|
|
1759
|
+
if not no_build:
|
|
1760
|
+
lines += [
|
|
1761
|
+
' echo "[gpu-dev] rebuilding torch to match --ref (pip install -e . --no-build-isolation)…"',
|
|
1762
|
+
' ( cd /home/dev/pytorch && pip install -e . --no-build-isolation ) || { echo "[gpu-dev] torch rebuild failed"; exit 90; }',
|
|
1763
|
+
]
|
|
1764
|
+
lines += ['fi', cd_run]
|
|
1765
|
+
return "\n".join(lines)
|
|
1766
|
+
|
|
1767
|
+
|
|
1727
1768
|
@main.command(context_settings={"ignore_unknown_options": True})
|
|
1728
1769
|
@click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
|
|
1729
1770
|
@click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
|
|
@@ -1743,6 +1784,8 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
|
|
|
1743
1784
|
@click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
|
|
1744
1785
|
help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
|
|
1745
1786
|
@click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
|
|
1787
|
+
@click.option("--no-build", is_flag=True,
|
|
1788
|
+
help="With --ref, skip the incremental torch rebuild before the command (Python-only PRs / quick checks). Default: rebuild so `import torch` reflects the ref.")
|
|
1746
1789
|
@click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
|
|
1747
1790
|
@click.option("--name", type=str, default=None, help="Reservation name.")
|
|
1748
1791
|
@click.option("--timeout", type=int, default=24 * 60, show_default=True,
|
|
@@ -1750,7 +1793,7 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
|
|
|
1750
1793
|
@click.argument("command", nargs=-1, required=True)
|
|
1751
1794
|
@click.pass_context
|
|
1752
1795
|
def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dockerfile, dockerimage, preserve_entrypoint,
|
|
1753
|
-
runtime, no_pull, keep_alive, name, timeout, command):
|
|
1796
|
+
runtime, no_pull, no_build, keep_alive, name, timeout, command):
|
|
1754
1797
|
"""Submit a job: reserve, sync code, run, sync results back, auto-cancel.
|
|
1755
1798
|
|
|
1756
1799
|
\b
|
|
@@ -1961,11 +2004,15 @@ def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dock
|
|
|
1961
2004
|
else:
|
|
1962
2005
|
workdir = "/home/dev"
|
|
1963
2006
|
|
|
1964
|
-
# Run remote command via login shell so MULTINODE_* etc. are loaded
|
|
2007
|
+
# Run remote command via login shell so MULTINODE_* etc. are loaded. With
|
|
2008
|
+
# --ref, the script first waits for background pytorch staging + rebuilds
|
|
2009
|
+
# so `import torch` matches the checked-out ref (see helper docstring).
|
|
1965
2010
|
remote_cmd = " ".join(shlex.quote(c) for c in command)
|
|
1966
2011
|
rprint(f"[cyan]🚀 Running on {ssh_alias}: {remote_cmd}[/cyan]\n")
|
|
1967
|
-
|
|
1968
|
-
|
|
2012
|
+
if ref and not no_build:
|
|
2013
|
+
rprint("[dim] (--ref: will wait for staging + rebuild torch first; pass --no-build to skip)[/dim]")
|
|
2014
|
+
remote_script = _build_submit_remote_script(workdir, remote_cmd, ref, no_build)
|
|
2015
|
+
ssh_run = ssh_base + [ssh_alias, f"bash -lc {shlex.quote(remote_script)}"]
|
|
1969
2016
|
rc = subprocess.call(ssh_run)
|
|
1970
2017
|
rprint(f"\n[dim]Job exited with code {rc}[/dim]")
|
|
1971
2018
|
|
|
@@ -55,11 +55,23 @@ async def tunnel_ssh(target_host: str, target_port: int):
|
|
|
55
55
|
# WebSocket URL - wss:// for secure WebSocket
|
|
56
56
|
ws_url = f"wss://{proxy_host}/tunnel/{target_host}"
|
|
57
57
|
|
|
58
|
+
# Verify TLS against certifi's CA bundle. The default SSL context uses the OS
|
|
59
|
+
# trust store, which on macOS python.org builds is often empty
|
|
60
|
+
# ("unable to get local issuer certificate" / CERTIFICATE_VERIFY_FAILED).
|
|
61
|
+
# certifi ships the Mozilla bundle, so this works without the manual
|
|
62
|
+
# "Install Certificates.command" step.
|
|
63
|
+
ssl_ctx = ssl_module.create_default_context()
|
|
64
|
+
try:
|
|
65
|
+
import certifi
|
|
66
|
+
ssl_ctx.load_verify_locations(certifi.where())
|
|
67
|
+
except Exception:
|
|
68
|
+
pass # fall back to the default trust store
|
|
69
|
+
|
|
58
70
|
last_exc = None
|
|
59
71
|
for attempt in range(MAX_RETRIES):
|
|
60
72
|
try:
|
|
61
73
|
async with websockets.connect(
|
|
62
|
-
ws_url, open_timeout=20,
|
|
74
|
+
ws_url, ssl=ssl_ctx, open_timeout=20,
|
|
63
75
|
ping_interval=30, ping_timeout=10,
|
|
64
76
|
) as websocket:
|
|
65
77
|
# Set up stdin/stdout for SSH
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# `gpu-dev submit` — guide & footguns
|
|
2
|
+
|
|
3
|
+
`gpu-dev submit` reserves a box, (optionally) rsyncs a local dir up, runs your
|
|
4
|
+
command over SSH, syncs results back, and auto-cancels. It's the non-interactive
|
|
5
|
+
sibling of `gpu-dev reserve` — good for CI-style validation, one-shot test runs,
|
|
6
|
+
and scripted repros.
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
# run a script in a local dir on 1x H100, sync results back, auto-cancel
|
|
10
|
+
gpu-dev submit --runtime ./ --gpu-type h100 -- bash run.sh
|
|
11
|
+
|
|
12
|
+
# validate a PyTorch PR's tests on H100 (stages + builds the PR for you)
|
|
13
|
+
gpu-dev submit --gpu-type h100 --no-persistent-disk --ref pr/186015 -- \
|
|
14
|
+
python test/test_foo.py -k some_test
|
|
15
|
+
|
|
16
|
+
# keep the box after the job (debug a failure interactively)
|
|
17
|
+
gpu-dev submit --keep-alive --gpu-type h100 -- pytest test/test_x.py
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Exit code = your command's exit code (so it composes in scripts/CI); the one exception is a failed `--ref` rebuild, which exits 90 before your command runs.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Footguns (read before your first `--ref` run)
|
|
25
|
+
|
|
26
|
+
### 1. `--ref` stages PyTorch in the background — `submit` now waits for it
|
|
27
|
+
With `--ref`, the in-pod startup checks out your ref into `/home/dev/pytorch`
|
|
28
|
+
**in the background** and only chowns the tree to `dev` + finishes the checkout
|
|
29
|
+
at the very end. Historically `submit` could SSH in and run your command before
|
|
30
|
+
that finished, so you'd hit:
|
|
31
|
+
- a **root-owned** `/home/dev/pytorch` (git: *"detected dubious ownership"*), and
|
|
32
|
+
- a **source/installed-torch mismatch** → `import torch` fails (the ref source is
|
|
33
|
+
checked out but the importable `.so` is still the stale prebuilt base).
|
|
34
|
+
|
|
35
|
+
`submit` now **waits for staging to complete**, marks the tree a git
|
|
36
|
+
`safe.directory`, and (by default) **rebuilds incrementally** so the installed
|
|
37
|
+
torch matches the checked-out ref before your command runs. You don't need the
|
|
38
|
+
`sudo chown` / `safe.directory` workaround anymore.
|
|
39
|
+
|
|
40
|
+
### 2. `--ref` rebuilds torch by default — use `--no-build` to skip
|
|
41
|
+
The dropped-in `build/` + `.so` come from the **base** tree, not your ref. To make
|
|
42
|
+
`import torch` reflect your ref's compiled (C++/CUDA) changes, `submit --ref`
|
|
43
|
+
runs `pip install -e . --no-build-isolation` (incremental, warm `build/` →
|
|
44
|
+
typically tens of seconds; a cold/cross-arch build is much longer).
|
|
45
|
+
|
|
46
|
+
- Pass **`--no-build`** for Python-only PRs or quick checks — skips the rebuild
|
|
47
|
+
(import still works; it just won't include compiled changes).
|
|
48
|
+
- A rebuild failure exits **90** *before* your command runs (so a broken build
|
|
49
|
+
doesn't masquerade as a test failure).
|
|
50
|
+
|
|
51
|
+
### 3. Prebuilt fast path is **prod-arch only** (H100 / B200)
|
|
52
|
+
The by-SHA / viable-strict prebuilt trees are compiled for `sm_90;sm_100`
|
|
53
|
+
(H100/B200). On other GPU types (t4, a100, l4, …) or staging there's no matching
|
|
54
|
+
prebuilt, so `--ref` falls back to a **full from-scratch build** — slow. Validate
|
|
55
|
+
ref-based jobs on `--gpu-type h100` (or `b200`).
|
|
56
|
+
|
|
57
|
+
### 4. `--ref` is ignored with `--disk`
|
|
58
|
+
A persistent disk brings its own `/home/dev/pytorch`; `--ref` does **not** stage
|
|
59
|
+
onto a `--disk` reservation (and `submit` won't rebuild it). Use
|
|
60
|
+
`--no-persistent-disk` (or omit `--disk`) when you want a ref staged.
|
|
61
|
+
|
|
62
|
+
### 5. `--preserve-entrypoint` needs SSH
|
|
63
|
+
`submit` runs your command over SSH, so a custom image with
|
|
64
|
+
`--preserve-entrypoint` must still expose the SSH harness or `submit` can't reach
|
|
65
|
+
it. For pure entrypoint containers, use `reserve`, not `submit`.
|
|
66
|
+
|
|
67
|
+
### 6. Results sync-back is best-effort
|
|
68
|
+
With `--runtime`, output is rsync'd back to your local dir when the job exits
|
|
69
|
+
(unless `--no-pull`). If the box dies mid-job (spot reclaim, expiry) the sync-back
|
|
70
|
+
may be partial — you'll see a warning. For long jobs prefer `--keep-alive` and
|
|
71
|
+
pull manually, or write important artifacts to `/shared-personal` (persists
|
|
72
|
+
across reservations).
|
|
73
|
+
|
|
74
|
+
### 7. `--hours` is a ceiling, not the runtime
|
|
75
|
+
It's the reservation lifetime cap; the job auto-cancels as soon as your command
|
|
76
|
+
exits (unless `--keep-alive`). Set it high enough that queueing + build + run fit.
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## Finding footguns early
|
|
81
|
+
|
|
82
|
+
- `gpu-dev submit --keep-alive … -- true` then `gpu-dev connect <id>` — get a
|
|
83
|
+
box in the exact submit state and poke around before committing a real run.
|
|
84
|
+
- With `--ref`, watch staging directly: `tail -f /home/dev/.pytorch-staging.log`
|
|
85
|
+
in the pod; `.pytorch-ready` (HEAD sha) is written when staging is done.
|
|
86
|
+
- `python -c "import torch; print(torch.__file__, torch.version.git_version)"`
|
|
87
|
+
confirms which torch you're actually importing vs. the ref you asked for.
|
|
88
|
+
|
|
89
|
+
Found a new one? Add it here and ping `oncall:pytorch_release_engineering`.
|
|
@@ -24,6 +24,7 @@ cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py
|
|
|
24
24
|
cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py
|
|
25
25
|
cli-tools/scripts/clear_stale_disk_locks.py
|
|
26
26
|
docs/FAST_REPRO_DESIGN.md
|
|
27
|
+
docs/GPU_DEV_SUBMIT.md
|
|
27
28
|
docs/SDK_REPRO.md
|
|
28
29
|
docs/USER_GUIDE.md
|
|
29
30
|
docs/devgpu-features.html
|
|
@@ -212,6 +213,7 @@ tests/unit/lambda_fn/__init__.py
|
|
|
212
213
|
tests/unit/lambda_fn/test_availability.py
|
|
213
214
|
tests/unit/lambda_fn/test_cancellation.py
|
|
214
215
|
tests/unit/lambda_fn/test_claim.py
|
|
216
|
+
tests/unit/lambda_fn/test_finalize_no_ssh.py
|
|
215
217
|
tests/unit/lambda_fn/test_mig_gpu_config.py
|
|
216
218
|
tests/unit/lambda_fn/test_pod_resources.py
|
|
217
219
|
tests/unit/lambda_fn/test_ref_staging.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.7.10"
|
|
7
|
+
version = "0.7.12"
|
|
8
8
|
description = "CLI + Python SDK for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -148,12 +148,13 @@ COPY ssh_config /etc/ssh/sshd_config
|
|
|
148
148
|
# Bump CLAUDE_CODE_BUILD to bust the layer cache and re-fetch the latest Claude Code
|
|
149
149
|
# (the installer always grabs latest; without a bump Docker reuses the cached layer).
|
|
150
150
|
USER root
|
|
151
|
-
ARG CLAUDE_CODE_BUILD=2026-
|
|
151
|
+
ARG CLAUDE_CODE_BUILD=2026-06-09
|
|
152
152
|
RUN echo "Claude Code build marker: $CLAUDE_CODE_BUILD" && \
|
|
153
153
|
curl -fsSL https://claude.ai/install.sh | HOME=/opt/claude bash || echo "Claude native install failed (non-fatal at build time)"
|
|
154
154
|
RUN if [ -e /opt/claude/.local/bin/claude ]; then \
|
|
155
155
|
ln -sf /opt/claude/.local/bin/claude /usr/local/bin/claude; \
|
|
156
156
|
chmod -R a+rX /opt/claude; \
|
|
157
|
+
echo "Installed Claude Code (native): $(/usr/local/bin/claude --version 2>/dev/null || echo unknown)"; \
|
|
157
158
|
fi
|
|
158
159
|
|
|
159
160
|
# Set up npm global directory for dev user (kept for ad-hoc dev-installed CLIs).
|
|
@@ -176,7 +177,7 @@ RUN npm install -g --prefix /usr/local @openai/codex || echo "Codex CLI install
|
|
|
176
177
|
# leaves /usr/local/bin/codex as a SYMLINK to that codex.js, so we must `rm -f` it first —
|
|
177
178
|
# writing through the symlink would clobber codex.js itself, making the wrapper exec itself
|
|
178
179
|
# (infinite recursion -> codex hangs on launch).
|
|
179
|
-
RUN rm -f /usr/local/bin/codex && echo '
|
|
180
|
+
RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IHdpcmVkIHRvIEdQVC01LjUgb24gQVdTIEJlZHJvY2sgKHVzLWVhc3QtMiBtYW50bGUgZW5kcG9pbnQpLgojIEF1dGggdmlhIHRoZSBwb2QgSVJTQSAtPiBzaG9ydC1saXZlZCAofjEyaCkgQmVkcm9jayBiZWFyZXIgdG9rZW47IG5vIHBlci11c2VyIGtleS4KIyBSZWFzb25pbmcgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgZW52IChkZWZhdWx0IGhpZ2gpLiBUaGUgY29uZmlnIGlzIChyZSl3cml0dGVuIG9uCiMgZXZlcnkgbGF1bmNoOiAvaG9tZS9kZXYgaXMgZXBoZW1lcmFsLCBhbmQgdGhpcyBhbHNvIHNlbGYtaGVhbHMgYSAvbW9kZWwgbWlzaGFwCiMgKHRoZSBwaWNrZXIgY2FuIGNvcnJ1cHQgdGhlIG1vZGVsIGlkOyByZXN0YXJ0aW5nIGNvZGV4IHJlc2V0cyBpdCkuCnNldCArZQpSRUFMPS91c3IvbG9jYWwvbGliL25vZGVfbW9kdWxlcy9Ab3BlbmFpL2NvZGV4L2Jpbi9jb2RleC5qcwpFRkZPUlQ9IiR7Q09ERVhfRUZGT1JUOi1oaWdofSIKbWtkaXIgLXAgIiRIT01FLy5jb2RleCIgMj4vZGV2L251bGwKY2F0ID4gIiRIT01FLy5jb2RleC9jb25maWcudG9tbCIgPDxDRkcKbW9kZWwgPSAib3BlbmFpLmdwdC01LjUiCm1vZGVsX3Byb3ZpZGVyID0gImJlZHJvY2siCndlYl9zZWFyY2ggPSAiZGlzYWJsZWQiCm1vZGVsX2NvbnRleHRfd2luZG93ID0gMjcyMDAwCm1vZGVsX21heF9vdXRwdXRfdG9rZW5zID0gMTI4MDAwCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKClttb2RlbF9wcm92aWRlcnMuYmVkcm9ja10KbmFtZSA9ICJBV1MgQmVkcm9jayAoR1BULTUuNSkiCmJhc2VfdXJsID0gImh0dHBzOi8vYmVkcm9jay1tYW50bGUudXMtZWFzdC0yLmFwaS5hd3Mvb3BlbmFpL3YxIgplbnZfa2V5ID0gIk9QRU5BSV9BUElfS0VZIgp3aXJlX2FwaSA9ICJyZXNwb25zZXMiCkNGRwpUT0s9IiQoL3Vzci9iaW4vcHl0aG9uMyAtYyAnZnJvbSBhd3NfYmVkcm9ja190b2tlbl9nZW5lcmF0b3IgaW1wb3J0IHByb3ZpZGVfdG9rZW47IHByaW50KHByb3ZpZGVfdG9rZW4ocmVnaW9uPSJ1cy1lYXN0LTIiKSknIDI+L2Rldi9udWxsKSIKWyAtbiAiJFRPSyIgXSAmJiBleHBvcnQgT1BFTkFJX0FQSV9LRVk9IiRUT0siCmV4ZWMgIiRSRUFMIiAiJEAiCg==' | base64 -d > /usr/local/bin/codex && chmod 0755 /usr/local/bin/codex
|
|
180
181
|
|
|
181
182
|
USER dev
|
|
182
183
|
|
{gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -3832,40 +3832,73 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
3832
3832
|
f"MAIN FLOW: Pod is ready, checking SSH daemon status from logs for {reservation_id}"
|
|
3833
3833
|
)
|
|
3834
3834
|
|
|
3835
|
+
# Let the user know we're past pod creation and waiting on the service.
|
|
3836
|
+
# On persistent-disk reservations the entrypoint restores the disk before
|
|
3837
|
+
# sshd binds, so this can legitimately take a few minutes.
|
|
3838
|
+
update_reservation_status(
|
|
3839
|
+
reservation_id,
|
|
3840
|
+
"preparing",
|
|
3841
|
+
"Container running — restoring your environment and starting SSH…"
|
|
3842
|
+
if use_persistent_disk
|
|
3843
|
+
else "Container running — starting SSH service…",
|
|
3844
|
+
)
|
|
3845
|
+
|
|
3835
3846
|
record_trace_event(trace_data, "ssh_ready_check_start")
|
|
3836
3847
|
ssh_ready = False
|
|
3837
3848
|
try:
|
|
3838
3849
|
v1 = client.CoreV1Api(k8s_client)
|
|
3839
3850
|
|
|
3840
|
-
# Poll
|
|
3841
|
-
#
|
|
3842
|
-
#
|
|
3843
|
-
|
|
3851
|
+
# Poll pod logs for the sshd-ready marker. Fast (100ms) for the first
|
|
3852
|
+
# 8s to catch the common fast path instantly, then back off to 5s.
|
|
3853
|
+
# Slow-disk startups restore the disk *before* sshd binds, so allow up
|
|
3854
|
+
# to ~150s. If the marker never appears we finalize anyway below —
|
|
3855
|
+
# routing is already in place and the SSH proxy retries until sshd binds.
|
|
3856
|
+
deadline = time.time() + 150.0
|
|
3844
3857
|
elapsed = 0.0
|
|
3845
|
-
|
|
3846
|
-
|
|
3858
|
+
attempt = 0
|
|
3859
|
+
logs = ""
|
|
3860
|
+
while time.time() < deadline:
|
|
3847
3861
|
logs = v1.read_namespaced_pod_log(
|
|
3848
|
-
name=pod_name, namespace="gpu-dev", container="gpu-dev", tail_lines=
|
|
3862
|
+
name=pod_name, namespace="gpu-dev", container="gpu-dev", tail_lines=200
|
|
3849
3863
|
)
|
|
3850
3864
|
if "SSH daemon starting on port 22" in logs or "Server listening on" in logs:
|
|
3851
3865
|
logger.info(
|
|
3852
3866
|
f"SSH daemon confirmed running in pod logs for {pod_name} (attempt {attempt + 1}, {elapsed:.1f}s elapsed)")
|
|
3853
3867
|
ssh_ready = True
|
|
3854
3868
|
break
|
|
3855
|
-
else
|
|
3856
|
-
|
|
3857
|
-
|
|
3858
|
-
|
|
3859
|
-
|
|
3860
|
-
|
|
3861
|
-
|
|
3862
|
-
f"SSH daemon not detected after {max_attempts} attempts, logs preview: {logs[-200:]}")
|
|
3869
|
+
delay = 0.1 if elapsed < 8.0 else min(1.0 + (elapsed - 8.0) * 0.3, 5.0)
|
|
3870
|
+
time.sleep(delay)
|
|
3871
|
+
elapsed += delay
|
|
3872
|
+
attempt += 1
|
|
3873
|
+
if not ssh_ready:
|
|
3874
|
+
logger.warning(
|
|
3875
|
+
f"SSH daemon marker not seen for {pod_name} after {elapsed:.1f}s, logs preview: {logs[-200:]}")
|
|
3863
3876
|
except Exception as e:
|
|
3864
3877
|
logger.warning(f"Could not check SSH daemon logs: {e}")
|
|
3865
3878
|
# Assume ready if pod is running (NLB will handle routing)
|
|
3866
3879
|
ssh_ready = True
|
|
3867
3880
|
record_trace_event(trace_data, "ssh_ready_check_end")
|
|
3868
3881
|
|
|
3882
|
+
# If the sshd marker never showed, don't orphan the reservation in
|
|
3883
|
+
# 'preparing'. Only a genuinely broken pod should fail here; otherwise the
|
|
3884
|
+
# pod is just slow to bind sshd (disk restore) — routing is already stored,
|
|
3885
|
+
# so we finalize anyway and let the SSH proxy retry until sshd is up.
|
|
3886
|
+
if not ssh_ready:
|
|
3887
|
+
logger.warning(
|
|
3888
|
+
f"MAIN FLOW: SSH daemon not confirmed for reservation {reservation_id}, checking pod status for errors")
|
|
3889
|
+
pod_info = update_pod_status_and_events(k8s_client, pod_name, reservation_id)
|
|
3890
|
+
if not should_finalize_without_ssh_marker(pod_info):
|
|
3891
|
+
update_reservation_status(
|
|
3892
|
+
reservation_id,
|
|
3893
|
+
"failed",
|
|
3894
|
+
f"Pod failed to start properly: {pod_info['display_message']}",
|
|
3895
|
+
)
|
|
3896
|
+
raise RuntimeError(f"Pod failed: {pod_info['display_message']}")
|
|
3897
|
+
logger.warning(
|
|
3898
|
+
f"SSH daemon not confirmed for {pod_name}, but pod is healthy — "
|
|
3899
|
+
f"finalizing connection anyway (SSH proxy retries until sshd binds)")
|
|
3900
|
+
ssh_ready = True
|
|
3901
|
+
|
|
3869
3902
|
if ssh_ready:
|
|
3870
3903
|
# Update status: Finalizing connection
|
|
3871
3904
|
update_reservation_status(
|
|
@@ -3985,28 +4018,6 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
3985
4018
|
f"Failed to trigger availability update: {update_error}")
|
|
3986
4019
|
# Don't fail the reservation for this
|
|
3987
4020
|
|
|
3988
|
-
else:
|
|
3989
|
-
logger.warning(
|
|
3990
|
-
f"MAIN FLOW: SSH connectivity test FAILED for reservation {reservation_id}, checking pod status for errors")
|
|
3991
|
-
# Check pod status using our consolidated monitoring function
|
|
3992
|
-
pod_info = update_pod_status_and_events(
|
|
3993
|
-
k8s_client, pod_name, reservation_id)
|
|
3994
|
-
if pod_info["has_errors"]:
|
|
3995
|
-
update_reservation_status(
|
|
3996
|
-
reservation_id,
|
|
3997
|
-
"failed",
|
|
3998
|
-
f"Pod failed to start properly: {pod_info['display_message']}",
|
|
3999
|
-
)
|
|
4000
|
-
raise RuntimeError(
|
|
4001
|
-
f"Pod failed: {pod_info['display_message']}")
|
|
4002
|
-
else:
|
|
4003
|
-
# Pod is running but SSH not ready yet - keep as preparing
|
|
4004
|
-
# Status message already updated by update_pod_status_and_events
|
|
4005
|
-
pass
|
|
4006
|
-
logger.warning(
|
|
4007
|
-
f"SSH not ready yet for {pod_name}, keeping reservation in preparing state"
|
|
4008
|
-
)
|
|
4009
|
-
|
|
4010
4021
|
# GPU allocation handled automatically by K8s scheduler
|
|
4011
4022
|
|
|
4012
4023
|
# Store trace data in DynamoDB if tracing is enabled
|
|
@@ -4057,6 +4068,18 @@ def delete_sqs_message(record: dict[str, Any]) -> None:
|
|
|
4057
4068
|
logger.error(f"Error deleting SQS message: {str(e)}")
|
|
4058
4069
|
|
|
4059
4070
|
|
|
4071
|
+
def should_finalize_without_ssh_marker(pod_info: dict) -> bool:
|
|
4072
|
+
"""Decide what to do when the sshd-ready log marker never appeared.
|
|
4073
|
+
|
|
4074
|
+
The pod's routing (domain mapping) is stored before the readiness poll, so a
|
|
4075
|
+
slow sshd (e.g. a persistent-disk restore that runs before sshd binds) is not
|
|
4076
|
+
a failure — finalizing anyway lets the CLI's SSH proxy retry until sshd is up,
|
|
4077
|
+
instead of orphaning the reservation in 'preparing' forever. Only a pod that
|
|
4078
|
+
actually reports errors should fail.
|
|
4079
|
+
"""
|
|
4080
|
+
return not pod_info.get("has_errors", False)
|
|
4081
|
+
|
|
4082
|
+
|
|
4060
4083
|
def update_reservation_status(reservation_id: str, status: str, detailed_status: str = None, failure_reason: str = None) -> None:
|
|
4061
4084
|
"""
|
|
4062
4085
|
Update reservation status with unified status tracking.
|
|
@@ -6328,7 +6351,7 @@ EOF
|
|
|
6328
6351
|
# Only start Jupyter if enabled at creation time
|
|
6329
6352
|
if [ "$JUPYTER_ENABLED" = "true" ]; then
|
|
6330
6353
|
echo "[STARTUP] Starting Jupyter Lab in background..."
|
|
6331
|
-
nohup su - dev -c "cd /workspace && /
|
|
6354
|
+
nohup su - dev -c "cd /workspace && $(command -v jupyter-lab || echo /usr/local/bin/jupyter-lab) --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
|
|
6332
6355
|
echo "[STARTUP] Jupyter Lab started (check /tmp/jupyter.log for details)"
|
|
6333
6356
|
else
|
|
6334
6357
|
echo "[STARTUP] Jupyter Lab configured but not started (use 'gpu-dev edit --enable-jupyter' to enable)"
|
|
@@ -8487,7 +8510,7 @@ def update_pod_status_and_events(k8s_client, pod_name: str, reservation_id: str)
|
|
|
8487
8510
|
if pod_phase == "Pending":
|
|
8488
8511
|
display_message = "⏳ Pod pending"
|
|
8489
8512
|
elif pod_phase == "Running":
|
|
8490
|
-
display_message = "🚀 Container running"
|
|
8513
|
+
display_message = "🚀 Container running — starting SSH service…"
|
|
8491
8514
|
else:
|
|
8492
8515
|
display_message = f"Pod phase: {pod_phase}"
|
|
8493
8516
|
|
|
@@ -9296,7 +9319,7 @@ def enable_jupyter_in_pod(
|
|
|
9296
9319
|
|
|
9297
9320
|
# Start Jupyter as dev user in background (config already exists)
|
|
9298
9321
|
echo "Starting Jupyter Lab with existing config..."
|
|
9299
|
-
nohup su - dev -c "cd /workspace && /
|
|
9322
|
+
nohup su - dev -c "cd /workspace && $(command -v jupyter-lab || echo /usr/local/bin/jupyter-lab) --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
|
|
9300
9323
|
|
|
9301
9324
|
# Wait for startup
|
|
9302
9325
|
sleep 3
|
|
@@ -19,12 +19,58 @@ from unittest.mock import MagicMock, patch
|
|
|
19
19
|
|
|
20
20
|
import pytest
|
|
21
21
|
|
|
22
|
-
from gpu_dev_cli.cli import main
|
|
22
|
+
from gpu_dev_cli.cli import main, _build_submit_remote_script
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
USER_INFO = {"user_id": "u-123", "github_user": "octocat"}
|
|
26
26
|
|
|
27
27
|
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# _build_submit_remote_script — the --ref staging-gate + rebuild preamble
|
|
30
|
+
# (regression for Driss's footguns: root-owned tree + source/installed mismatch)
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
def test_remote_script_no_ref_is_plain_cd_run():
|
|
33
|
+
s = _build_submit_remote_script("/workspace/x", "python a.py", ref=None, no_build=False)
|
|
34
|
+
assert s == "cd /workspace/x && python a.py"
|
|
35
|
+
assert "pytorch-staging" not in s
|
|
36
|
+
assert "no-build-isolation" not in s
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_remote_script_with_ref_waits_and_rebuilds():
|
|
40
|
+
s = _build_submit_remote_script("/home/dev", "pytest q.py", ref="pr/123", no_build=False)
|
|
41
|
+
# waits for the background staging marker
|
|
42
|
+
assert "/home/dev/.pytorch-staging" in s
|
|
43
|
+
# only acts once staging actually completed
|
|
44
|
+
assert "/home/dev/.pytorch-ready" in s
|
|
45
|
+
# marks safe.directory for the dev user (fixes git "dubious ownership")
|
|
46
|
+
assert "safe.directory /home/dev/pytorch" in s
|
|
47
|
+
# rebuilds so installed torch matches the checked-out ref
|
|
48
|
+
assert "pip install -e . --no-build-isolation" in s
|
|
49
|
+
# user command still runs last, in the workdir
|
|
50
|
+
assert s.rstrip().endswith("cd /home/dev && pytest q.py")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_remote_script_ref_no_build_skips_rebuild():
|
|
54
|
+
s = _build_submit_remote_script("/home/dev", "pytest q.py", ref="pr/123", no_build=True)
|
|
55
|
+
assert "/home/dev/.pytorch-staging" in s # still waits for staging
|
|
56
|
+
assert "safe.directory /home/dev/pytorch" in s # still fixes ownership
|
|
57
|
+
assert "no-build-isolation" not in s # but no rebuild
|
|
58
|
+
assert s.rstrip().endswith("cd /home/dev && pytest q.py")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_remote_script_quotes_workdir():
|
|
62
|
+
s = _build_submit_remote_script("/work space/x", "echo hi", ref=None, no_build=False)
|
|
63
|
+
assert "'/work space/x'" in s
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_no_build_flag_threaded_and_defaults_false(cli_runner):
|
|
67
|
+
# --no-build is accepted; with --ref it changes the rebuild preamble. Here we
|
|
68
|
+
# just assert the flag parses (reservation returns None -> exit 2).
|
|
69
|
+
res, rm = _run(cli_runner, ["--ref", "pr/1", "--no-build", "--", "x"])
|
|
70
|
+
assert res.exit_code == 2
|
|
71
|
+
rm.create_reservation.assert_called_once()
|
|
72
|
+
|
|
73
|
+
|
|
28
74
|
# ---------------------------------------------------------------------------
|
|
29
75
|
# patch harness
|
|
30
76
|
# ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Unit tests for the slow-sshd finalize decision.
|
|
2
|
+
|
|
3
|
+
Regression for the orphaned-`preparing` bug: a persistent-disk reservation
|
|
4
|
+
restores its disk *before* sshd binds, so the readiness poll's log marker never
|
|
5
|
+
shows within the window. The main flow used to leave such reservations stuck in
|
|
6
|
+
`preparing` forever. It now finalizes anyway (routing is already stored, the SSH
|
|
7
|
+
proxy retries) and only fails when the pod itself reports errors.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_finalize_when_pod_healthy_but_no_ssh_marker(lambda_index):
|
|
12
|
+
# Running pod, no errors, sshd marker not seen -> finalize anyway.
|
|
13
|
+
info = {"has_errors": False, "display_message": "🚀 Container running — starting SSH service…"}
|
|
14
|
+
assert lambda_index.should_finalize_without_ssh_marker(info) is True
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_do_not_finalize_when_pod_has_errors(lambda_index):
|
|
18
|
+
info = {"has_errors": True, "display_message": "❌ ImagePullBackOff"}
|
|
19
|
+
assert lambda_index.should_finalize_without_ssh_marker(info) is False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_missing_has_errors_key_defaults_to_finalize(lambda_index):
|
|
23
|
+
# Defensive: a partial pod_info dict shouldn't strand the reservation.
|
|
24
|
+
assert lambda_index.should_finalize_without_ssh_marker({}) is True
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|