gpu-dev 0.7.5__tar.gz → 0.7.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/PKG-INFO +1 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +53 -11
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/docs/SDK_REPRO.md +47 -4
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/pyproject.toml +1 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/__init__.py +1 -1
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/reservation_processor/index.py +10 -8
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/.gitignore +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/CLAUDE.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/admin/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/admin/generate_stats.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/admin/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/architecture.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/cli-demo.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/devgpu-features.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/docker-mark-blue.svg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/feedback.png +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/gpu-fleet.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/index.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/k8s-under-the-hood.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/multinode.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/osdc-future-plans.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/problem.png +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/sandbox.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/sdk-demo.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/thesis.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/title-vid.mp4 +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/weneedgpus.png +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/presentation/wow.html +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/examples/batch_multi_gpu.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/examples/interactive_debug.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/examples/parallel_experiments.ipynb +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/examples/quickstart.ipynb +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/examples/run_tests.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/examples/submit_job.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/_backend/aws.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/common/config.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/common/enums.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/common/errors.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/common/models.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/src/gpu_dev/py.typed +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/tests/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/sdk/python/tests/test_models.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/setup.cfg +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-deck/backend.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-deck/main.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-deck/terraform.tfvars.example +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/ami-baker.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/build-node.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/check_b200.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/cmd_proxy.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/list_b200.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/pytorch-prebuild.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/warm-pool.tf +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/tests/submit/README.md +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.7.5 → gpu_dev-0.7.6}/tests/submit/success/run.sh +0 -0
|
@@ -1523,12 +1523,19 @@ def reserve(
|
|
|
1523
1523
|
@click.option("--gpu-type", default="b200", show_default=True, help="GPU type for the repro box.")
|
|
1524
1524
|
@click.option("--gpus", type=int, default=1, show_default=True)
|
|
1525
1525
|
@click.option("--hours", type=float, default=3.0, show_default=True,
|
|
1526
|
-
help="Lifetime ceiling
|
|
1526
|
+
help="Lifetime ceiling for the box.")
|
|
1527
|
+
@click.option("--no-connect", is_flag=True, default=False,
|
|
1528
|
+
help="CI mode: run the test, auto-cancel, exit code = test result. Default (on a TTY) drops you into the box to iterate.")
|
|
1527
1529
|
@click.option("--keep", is_flag=True, default=False,
|
|
1528
|
-
help="
|
|
1530
|
+
help="Never cancel the box (skip the cancel prompt / auto-cancel).")
|
|
1529
1531
|
@click.pass_context
|
|
1530
|
-
def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
|
|
1531
|
-
"""Reserve a GPU, check out a PR/commit, run a test, then
|
|
1532
|
+
def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
|
|
1533
|
+
"""Reserve a GPU, check out a PR/commit, run a test, then drop you into the box.
|
|
1534
|
+
|
|
1535
|
+
By default (in a terminal) repro runs the test and then **connects you into the
|
|
1536
|
+
box** at ~/pytorch — the ref is checked out, so you can fix and re-run. The box
|
|
1537
|
+
stays alive until you cancel it (you're prompted on exit). Use --no-connect for
|
|
1538
|
+
CI/scripts (run the test, auto-cancel, process exit code = the test result).
|
|
1532
1539
|
|
|
1533
1540
|
REF: pr/<N>, #<N>, a bare PR number, a branch, or a commit sha. PRs use
|
|
1534
1541
|
pull/<N>/merge (what CI tests), falling back to /head.
|
|
@@ -1539,6 +1546,7 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
|
|
|
1539
1546
|
"""
|
|
1540
1547
|
import shlex
|
|
1541
1548
|
import subprocess
|
|
1549
|
+
import sys
|
|
1542
1550
|
config = load_config()
|
|
1543
1551
|
reservation_mgr = ReservationManager(config)
|
|
1544
1552
|
try:
|
|
@@ -1602,21 +1610,55 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
|
|
|
1602
1610
|
if "StrictHostKeyChecking" not in ssh_cmd:
|
|
1603
1611
|
ssh_cmd = ssh_cmd.replace("ssh ", "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR ", 1)
|
|
1604
1612
|
rprint(f"[dim]→ {ssh_cmd}[/dim]\n")
|
|
1613
|
+
rid8 = str(rid)[:8]
|
|
1605
1614
|
rc = 1
|
|
1606
1615
|
try:
|
|
1607
1616
|
rc = subprocess.run(f"{ssh_cmd} {shlex.quote(remote)}", shell=True).returncode
|
|
1608
1617
|
except KeyboardInterrupt:
|
|
1609
|
-
rprint("\n[yellow]interrupted[/yellow]")
|
|
1610
|
-
|
|
1618
|
+
rprint("\n[yellow]interrupted[/yellow]"); rc = 130
|
|
1619
|
+
|
|
1620
|
+
verdict = "[green]✓ test passed[/green]" if rc == 0 else f"[red]✗ test failed (exit {rc})[/red]"
|
|
1621
|
+
|
|
1622
|
+
# Default (TTY): drop into the box so you can fix and re-run. --no-connect is the
|
|
1623
|
+
# CI path: auto-cancel and exit with the test's code.
|
|
1624
|
+
connect = (not no_connect) and sys.stdout.isatty()
|
|
1625
|
+
if connect:
|
|
1626
|
+
rprint(f"\n{verdict} — dropping you into the box at ~/pytorch ({ref} checked out).")
|
|
1627
|
+
rprint(f"[dim] re-run: python {testcmd}[/dim]")
|
|
1628
|
+
rprint(f"[dim] finish: gpu-dev cancel (from inside) • or exit this shell[/dim]\n")
|
|
1629
|
+
shell_cmd = f"{ssh_cmd} -t {shlex.quote('cd /home/dev/pytorch 2>/dev/null; exec ${SHELL:-bash} -l')}"
|
|
1630
|
+
try:
|
|
1631
|
+
subprocess.run(shell_cmd, shell=True)
|
|
1632
|
+
except KeyboardInterrupt:
|
|
1633
|
+
pass
|
|
1611
1634
|
if keep:
|
|
1612
|
-
rprint(f"[cyan]📌
|
|
1613
|
-
|
|
1635
|
+
rprint(f"[cyan]📌 left {rid8} running — connect: gpu-dev connect {rid8} • cancel: gpu-dev cancel {rid8}[/cyan]")
|
|
1636
|
+
return
|
|
1637
|
+
try:
|
|
1638
|
+
drop = click.confirm(f"Cancel repro box {rid8}?", default=True)
|
|
1639
|
+
except (KeyboardInterrupt, EOFError, click.Abort):
|
|
1640
|
+
drop = False
|
|
1641
|
+
if drop:
|
|
1614
1642
|
try:
|
|
1615
1643
|
reservation_mgr.cancel_reservation(rid, user_info["user_id"])
|
|
1616
|
-
rprint(f"[green]🧹 cancelled
|
|
1644
|
+
rprint(f"[green]🧹 cancelled {rid8}[/green]")
|
|
1617
1645
|
except Exception as e:
|
|
1618
|
-
rprint(f"[yellow]
|
|
1619
|
-
|
|
1646
|
+
rprint(f"[yellow]cancel failed for {rid8}: {e}[/yellow]")
|
|
1647
|
+
else:
|
|
1648
|
+
rprint(f"[cyan]📌 left {rid8} running — connect: gpu-dev connect {rid8} • cancel: gpu-dev cancel {rid8}[/cyan]")
|
|
1649
|
+
return
|
|
1650
|
+
|
|
1651
|
+
# --no-connect / non-TTY: auto-cancel unless --keep, exit code = test result.
|
|
1652
|
+
if keep:
|
|
1653
|
+
rprint(f"[cyan]📌 kept {rid8} — gpu-dev connect {rid8} • gpu-dev cancel {rid8}[/cyan]")
|
|
1654
|
+
else:
|
|
1655
|
+
try:
|
|
1656
|
+
reservation_mgr.cancel_reservation(rid, user_info["user_id"])
|
|
1657
|
+
rprint(f"[green]🧹 cancelled repro box {rid8}[/green]")
|
|
1658
|
+
except Exception as e:
|
|
1659
|
+
rprint(f"[yellow]auto-cancel failed for {rid8}: {e}[/yellow]")
|
|
1660
|
+
rprint(f"\n[bold]repro exit code: {rc}[/bold] ({verdict})")
|
|
1661
|
+
sys.exit(rc)
|
|
1620
1662
|
|
|
1621
1663
|
|
|
1622
1664
|
_SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
@@ -2,7 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
Reserve a **warm** GPU box in ~1s, run code on it, and auto-clean up — from Python
|
|
4
4
|
or one CLI command. Backed by a pool of pre-booted pods with **PyTorch prebuilt**
|
|
5
|
-
(viable/strict), so `import torch` works instantly with no build.
|
|
5
|
+
(viable/strict), so `import torch` works instantly with no build. And when you *do*
|
|
6
|
+
have to compile — your ref moved past viable/strict, or you touch C++ — a shared
|
|
7
|
+
compiler cache (ccache) makes it an **incremental, not a cold, build** (see
|
|
8
|
+
[Builds are cached](#builds-are-cached-shared-ccache)).
|
|
6
9
|
|
|
7
10
|
> Requires `gpu-dev` ≥ 0.7.1 (CLI **and** SDK in one package): `pip install --upgrade gpu-dev`
|
|
8
11
|
|
|
@@ -32,14 +35,18 @@ with client.reserve(gpu_type="b200", gpu_count=1, hours=1) as sb:
|
|
|
32
35
|
PyTorch is pre-staged at `~/pytorch` (importable). To reproduce a failure, point at
|
|
33
36
|
the **PR or commit** and run the test.
|
|
34
37
|
|
|
35
|
-
**One CLI command** (reserve → checkout → run →
|
|
38
|
+
**One CLI command** (reserve → checkout → run → **drop you into the box to fix**):
|
|
36
39
|
```bash
|
|
37
40
|
gpu-dev repro pr/185264 test/inductor/test_flex_attention.py TestFlexAttentionCUDA.test_large_kv_int64_pointer_math_cuda
|
|
38
41
|
```
|
|
39
42
|
- `REF`: `pr/<N>`, `#<N>`, a bare PR number, a branch, or a commit sha.
|
|
40
43
|
- PRs use **`pull/<N>/merge`** (what CI actually tests — the PR merged onto current
|
|
41
44
|
trunk), falling back to `/head`. Use this, not the raw branch.
|
|
42
|
-
-
|
|
45
|
+
- By default (in a terminal) repro runs the test, prints the verdict, then **lands
|
|
46
|
+
you in the box** at `~/pytorch` with the ref checked out so you can fix and re-run;
|
|
47
|
+
it stays alive until you cancel (prompted on exit).
|
|
48
|
+
- `--no-connect` = CI mode: run, auto-cancel, process exit code = the test result.
|
|
49
|
+
- `--keep` never cancels (no prompt). Use `--gpu-type` / `--gpus` / `--hours` to size the box.
|
|
43
50
|
|
|
44
51
|
**From the SDK:**
|
|
45
52
|
```python
|
|
@@ -62,12 +69,48 @@ pip install -e . --no-build-isolation
|
|
|
62
69
|
```
|
|
63
70
|
Python-only changes need no rebuild — `PYTHONPATH=~/pytorch` already resolves.
|
|
64
71
|
|
|
72
|
+
## Builds are cached (shared ccache)
|
|
73
|
+
|
|
74
|
+
Two layers of caching mean you almost never pay for a cold, from-scratch build —
|
|
75
|
+
including the full C++/CUDA compile (gcc/nvcc):
|
|
76
|
+
|
|
77
|
+
1. **Prebuilt tree.** Every box gets PyTorch already built at viable/strict and
|
|
78
|
+
staged at `~/pytorch`, so `import torch` works with **zero build**.
|
|
79
|
+
2. **Shared compiler cache (ccache).** `CCACHE_DIR=/ccache_shared` is an EFS volume
|
|
80
|
+
mounted in **every** dev pod *and* the dedicated build node, so all the C++/CUDA
|
|
81
|
+
object compiles are cached and **shared across users and the build node**. When
|
|
82
|
+
you check out a ref past viable/strict — or edit C++ — the rebuild reuses those
|
|
83
|
+
cached objects (and the warm `build/` for ninja) instead of recompiling from
|
|
84
|
+
scratch. So even a "full" `pip install -e .` is a warm build, not a cold one.
|
|
85
|
+
|
|
86
|
+
Measured (m7i build node, 128 jobs, CUDA 13.2):
|
|
87
|
+
|
|
88
|
+
| scenario | time |
|
|
89
|
+
|---|---|
|
|
90
|
+
| `import torch` (prebuilt, no build) | ~0s |
|
|
91
|
+
| incremental (1 kernel changed + relink) | ~40s |
|
|
92
|
+
| ninja no-op (nothing changed) | ~20s |
|
|
93
|
+
| from-scratch `build/` with warm ccache (~86% hit) | ~21 min |
|
|
94
|
+
|
|
95
|
+
(A true cold build from an empty ccache takes far longer.) The cache stays warm on its
|
|
96
|
+
own: an hourly build-node job compiles each viable/strict bump into `/ccache_shared`,
|
|
97
|
+
so the objects you need are usually already there by the time you build — and your
|
|
98
|
+
own compiles populate it for the next person too.
|
|
99
|
+
|
|
65
100
|
## Gotchas
|
|
66
101
|
- **`/merge` vs `/head`**: `/head` is the PR author's raw branch and often lacks
|
|
67
102
|
trunk-added tests; `/merge` is what CI ran. `gpu-dev repro` / `--ref` use `/merge`.
|
|
68
103
|
- **The prebuilt is viable/strict.** If your ref moved past it and a test needs new
|
|
69
|
-
C++, do the one incremental `pip install -e . --no-build-isolation
|
|
104
|
+
C++, do the one incremental `pip install -e . --no-build-isolation` — it's fast
|
|
105
|
+
(warm shared ccache), not a cold build. See [Builds are cached](#builds-are-cached-shared-ccache).
|
|
70
106
|
- **Ephemeral by design.** Repro boxes have no persistent disk; bring code via
|
|
71
107
|
`--ref`, `sb.upload`, or git.
|
|
108
|
+
- **Reproducing a reverted PR / an OOM.** `pr/N` uses `/merge` = the PR re-applied
|
|
109
|
+
onto *current* trunk — so if the PR was reverted, `/merge` effectively un-reverts
|
|
110
|
+
it and you test the **fixed** tree (it'll pass). To repro the failing trunk state,
|
|
111
|
+
check out the **exact land commit** instead (`gpu-dev repro <sha> …`). And match the
|
|
112
|
+
CI runner's GPU: an **OOM** only reproduces on a GPU as small as the runner's — the
|
|
113
|
+
default `b200` has far more memory, so a memory-bound failure won't show there
|
|
114
|
+
(pass `--gpu-type h100`/`a100`/… to match).
|
|
72
115
|
|
|
73
116
|
See also: `sdk/python/README.md` and `sdk/python/examples/`.
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.7.
|
|
7
|
+
version = "0.7.6"
|
|
8
8
|
description = "CLI + Python SDK for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -5283,10 +5283,11 @@ EOF_PROFILE
|
|
|
5283
5283
|
|
|
5284
5284
|
# User identification
|
|
5285
5285
|
export GPU_DEV_USER_ID="{user_id or 'dev'}"
|
|
5286
|
-
# Reservation id
|
|
5287
|
-
#
|
|
5288
|
-
#
|
|
5289
|
-
|
|
5286
|
+
# Reservation id from the pod hostname; warm claims overwrite it with the full id,
|
|
5287
|
+
# cold pods keep the 8-char prefix. Used by gpu-dev cancel (no args) inside the pod.
|
|
5288
|
+
# NOTE: escape the dollar so this is evaluated when the shell sources the file, NOT
|
|
5289
|
+
# command-substituted while this (unquoted) heredoc is written at pod startup.
|
|
5290
|
+
export GPU_DEV_RESERVATION_ID="\$(hostname | sed -e 's/^gpu-dev-//')"
|
|
5290
5291
|
|
|
5291
5292
|
# Multinode peer info — inlined from container env at pod startup. sshd strips
|
|
5292
5293
|
# container env vars from login shells, so we materialize the values into rc files.
|
|
@@ -5353,10 +5354,11 @@ EOF_BASHRC_EXT
|
|
|
5353
5354
|
|
|
5354
5355
|
# User identification
|
|
5355
5356
|
export GPU_DEV_USER_ID="{user_id or 'dev'}"
|
|
5356
|
-
# Reservation id
|
|
5357
|
-
#
|
|
5358
|
-
#
|
|
5359
|
-
|
|
5357
|
+
# Reservation id from the pod hostname; warm claims overwrite it with the full id,
|
|
5358
|
+
# cold pods keep the 8-char prefix. Used by gpu-dev cancel (no args) inside the pod.
|
|
5359
|
+
# NOTE: escape the dollar so this is evaluated when the shell sources the file, NOT
|
|
5360
|
+
# command-substituted while this (unquoted) heredoc is written at pod startup.
|
|
5361
|
+
export GPU_DEV_RESERVATION_ID="\$(hostname | sed -e 's/^gpu-dev-//')"
|
|
5360
5362
|
|
|
5361
5363
|
# Multinode peer info — inlined from container env at pod startup. sshd strips
|
|
5362
5364
|
# container env vars from login shells, so we materialize the values into rc files.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.7.5 → gpu_dev-0.7.6}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|