gpu-dev 0.5.17__tar.gz → 0.5.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/PKG-INFO +1 -1
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +5 -1
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +248 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/pyproject.toml +1 -1
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/index.py +147 -16
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_client.py +6 -1
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda.tf +1 -1
- gpu_dev-0.5.19/tests/submit/README.md +63 -0
- gpu_dev-0.5.19/tests/submit/fail/run.sh +20 -0
- gpu_dev-0.5.19/tests/submit/multinode/run.sh +65 -0
- gpu_dev-0.5.19/tests/submit/success/run.sh +23 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/.gitignore +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/CLAUDE.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/PROGRESS.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/TODO.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/admin/README.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/post.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/setup.cfg +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -112,4 +112,8 @@ terraform-gpu-devservers/ssh-proxy/requirements.txt
|
|
|
112
112
|
terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
|
|
113
113
|
terraform-gpu-devservers/templates/al2023-user-data.sh
|
|
114
114
|
terraform-gpu-devservers/templates/user-data-self-managed.sh
|
|
115
|
-
terraform-gpu-devservers/templates/user-data.sh
|
|
115
|
+
terraform-gpu-devservers/templates/user-data.sh
|
|
116
|
+
tests/submit/README.md
|
|
117
|
+
tests/submit/fail/run.sh
|
|
118
|
+
tests/submit/multinode/run.sh
|
|
119
|
+
tests/submit/success/run.sh
|
|
@@ -1349,6 +1349,254 @@ def reserve(
|
|
|
1349
1349
|
rprint(f"[red]❌ Error: {str(e)}[/red]")
|
|
1350
1350
|
|
|
1351
1351
|
|
|
1352
|
+
_SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1353
|
+
"h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
|
|
1354
|
+
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
|
|
1355
|
+
|
|
1356
|
+
|
|
1357
|
+
@main.command(context_settings={"ignore_unknown_options": True})
|
|
1358
|
+
@click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
|
|
1359
|
+
@click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
|
|
1360
|
+
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling — job auto-cancels well before this if it finishes.")
|
|
1361
|
+
@click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
|
|
1362
|
+
@click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
|
|
1363
|
+
@click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
|
|
1364
|
+
help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
|
|
1365
|
+
@click.option("--dockerimage", type=str, default=None,
|
|
1366
|
+
help="Pre-built container image reference (e.g. ghcr.io/me/img:tag) to run instead of the default.")
|
|
1367
|
+
@click.option("--preserve-entrypoint", is_flag=True,
|
|
1368
|
+
help="Keep the custom image's ENTRYPOINT/CMD instead of letting gpu-dev wrap with the SSH harness. Note: submit needs SSH to work.")
|
|
1369
|
+
@click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
|
|
1370
|
+
help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
|
|
1371
|
+
@click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
|
|
1372
|
+
@click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
|
|
1373
|
+
@click.option("--name", type=str, default=None, help="Reservation name.")
|
|
1374
|
+
@click.option("--timeout", type=int, default=24 * 60, show_default=True,
|
|
1375
|
+
help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
|
|
1376
|
+
@click.argument("command", nargs=-1, required=True)
|
|
1377
|
+
@click.pass_context
|
|
1378
|
+
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
|
|
1379
|
+
runtime, no_pull, keep_alive, name, timeout, command):
|
|
1380
|
+
"""Submit a job: reserve, sync code, run, sync results back, auto-cancel.
|
|
1381
|
+
|
|
1382
|
+
\b
|
|
1383
|
+
Examples:
|
|
1384
|
+
gpu-dev submit --runtime ./ -- python train.py
|
|
1385
|
+
gpu-dev submit --gpus 16 --gpu-type h100 --runtime . -- bash run.sh
|
|
1386
|
+
gpu-dev submit --keep-alive -- nvidia-smi
|
|
1387
|
+
|
|
1388
|
+
The job runs on rank 0 (master pod). For multinode jobs, MULTINODE_HOSTS / RANK /
|
|
1389
|
+
SIZE / MASTER_ADDR / MASTER_PORT are exported on every pod so torchrun and friends
|
|
1390
|
+
work without manual wiring. Exit code mirrors the remote command's exit code.
|
|
1391
|
+
"""
|
|
1392
|
+
import subprocess
|
|
1393
|
+
import shlex
|
|
1394
|
+
import sys
|
|
1395
|
+
from pathlib import Path
|
|
1396
|
+
|
|
1397
|
+
if not command:
|
|
1398
|
+
rprint("[red]❌ Provide a command after --, e.g. gpu-dev submit --runtime ./ -- python train.py[/red]")
|
|
1399
|
+
sys.exit(2)
|
|
1400
|
+
|
|
1401
|
+
gt = gpu_type.lower()
|
|
1402
|
+
# Per-type max GPUs (mirrors gpu_configs in reserve flow)
|
|
1403
|
+
max_per_node = {
|
|
1404
|
+
"t4": 4, "l4": 4, "a10g": 4, "rtxpro6000": 4, "t4-small": 1,
|
|
1405
|
+
"a100": 8, "h100": 8, "h200": 8, "b200": 8,
|
|
1406
|
+
"h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8,
|
|
1407
|
+
"b200-mig-1g": 4, "b200-mig-2g": 2, "b200-mig-3g": 2,
|
|
1408
|
+
"cpu-arm": 0, "cpu-x86": 0,
|
|
1409
|
+
}.get(gt)
|
|
1410
|
+
if max_per_node is None:
|
|
1411
|
+
rprint(f"[red]❌ Unknown gpu-type '{gpu_type}'[/red]")
|
|
1412
|
+
sys.exit(2)
|
|
1413
|
+
|
|
1414
|
+
is_multinode = gt not in ("cpu-arm", "cpu-x86") and gpus > max_per_node
|
|
1415
|
+
if is_multinode and gpus % max_per_node != 0:
|
|
1416
|
+
rprint(f"[red]❌ For multinode {gt}, --gpus must be a multiple of {max_per_node}[/red]")
|
|
1417
|
+
sys.exit(2)
|
|
1418
|
+
|
|
1419
|
+
config = load_config()
|
|
1420
|
+
try:
|
|
1421
|
+
user_info = authenticate_user(config)
|
|
1422
|
+
except RuntimeError as e:
|
|
1423
|
+
rprint(f"[red]❌ {str(e)}[/red]")
|
|
1424
|
+
sys.exit(2)
|
|
1425
|
+
|
|
1426
|
+
rm = ReservationManager(config)
|
|
1427
|
+
|
|
1428
|
+
# Determine effective disk handling. Multinode: only master gets persistent disk; we always
|
|
1429
|
+
# SSH into rank 0, so passing --disk is fine.
|
|
1430
|
+
disk_name = None if no_persistent_disk else disk
|
|
1431
|
+
|
|
1432
|
+
# Build dockerfile context if provided (mirrors the reserve-flow logic)
|
|
1433
|
+
dockerfile_payload = None
|
|
1434
|
+
if dockerfile:
|
|
1435
|
+
import os, tarfile, tempfile, base64
|
|
1436
|
+
if os.path.getsize(dockerfile) > 512 * 1024:
|
|
1437
|
+
rprint("[red]❌ Dockerfile too large (max 512KB)[/red]")
|
|
1438
|
+
sys.exit(2)
|
|
1439
|
+
ctx_dir = os.path.dirname(os.path.abspath(dockerfile))
|
|
1440
|
+
rprint(f"[cyan]📦 Building tar.gz context from {ctx_dir}[/cyan]")
|
|
1441
|
+
with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
|
|
1442
|
+
with tarfile.open(tmp.name, "w:gz") as tar:
|
|
1443
|
+
for root, _, files in os.walk(ctx_dir):
|
|
1444
|
+
for f in files:
|
|
1445
|
+
full = os.path.join(root, f)
|
|
1446
|
+
tar.add(full, arcname=os.path.relpath(full, ctx_dir))
|
|
1447
|
+
if os.path.basename(dockerfile).lower() != "dockerfile":
|
|
1448
|
+
tar.add(dockerfile, arcname="Dockerfile")
|
|
1449
|
+
tar_size = os.path.getsize(tmp.name)
|
|
1450
|
+
if tar_size > 700 * 1024:
|
|
1451
|
+
os.unlink(tmp.name)
|
|
1452
|
+
rprint(f"[red]❌ Build context too large: {tar_size}B (max ~700KB compressed)[/red]")
|
|
1453
|
+
sys.exit(2)
|
|
1454
|
+
with open(tmp.name, "rb") as fh:
|
|
1455
|
+
dockerfile_payload = base64.b64encode(fh.read()).decode("utf-8")
|
|
1456
|
+
os.unlink(tmp.name)
|
|
1457
|
+
rprint(f"[green]✅ Dockerfile context: {tar_size}B compressed[/green]")
|
|
1458
|
+
|
|
1459
|
+
if dockerimage and not preserve_entrypoint:
|
|
1460
|
+
rprint("[dim]Note: passing --dockerimage without --preserve-entrypoint, so gpu-dev wraps the image with the SSH harness.[/dim]")
|
|
1461
|
+
if preserve_entrypoint and not (dockerfile or dockerimage):
|
|
1462
|
+
rprint("[red]❌ --preserve-entrypoint requires --dockerfile or --dockerimage[/red]")
|
|
1463
|
+
sys.exit(2)
|
|
1464
|
+
|
|
1465
|
+
rprint(f"[cyan]🎫 Reserving {gpus}x {gpu_type.upper()} for up to {hours}h...[/cyan]")
|
|
1466
|
+
if is_multinode:
|
|
1467
|
+
reservation_ids = rm.create_multinode_reservation(
|
|
1468
|
+
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1469
|
+
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1470
|
+
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1471
|
+
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1472
|
+
preserve_entrypoint=preserve_entrypoint)
|
|
1473
|
+
if not reservation_ids:
|
|
1474
|
+
rprint("[red]❌ Failed to create multinode reservation[/red]")
|
|
1475
|
+
sys.exit(2)
|
|
1476
|
+
primary_id = reservation_ids[0]
|
|
1477
|
+
else:
|
|
1478
|
+
primary_id = rm.create_reservation(
|
|
1479
|
+
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1480
|
+
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1481
|
+
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1482
|
+
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1483
|
+
preserve_entrypoint=preserve_entrypoint)
|
|
1484
|
+
if not primary_id:
|
|
1485
|
+
rprint("[red]❌ Failed to create reservation[/red]")
|
|
1486
|
+
sys.exit(2)
|
|
1487
|
+
reservation_ids = [primary_id]
|
|
1488
|
+
|
|
1489
|
+
short_id = primary_id[:8]
|
|
1490
|
+
cancelled = {"done": False}
|
|
1491
|
+
|
|
1492
|
+
def maybe_cancel(reason: str):
|
|
1493
|
+
if cancelled["done"] or keep_alive:
|
|
1494
|
+
return
|
|
1495
|
+
cancelled["done"] = True
|
|
1496
|
+
rprint(f"[yellow]🛑 Cancelling reservation {short_id} ({reason})[/yellow]")
|
|
1497
|
+
for rid in reservation_ids:
|
|
1498
|
+
try:
|
|
1499
|
+
rm.cancel_reservation(rid, user_info["user_id"])
|
|
1500
|
+
except Exception as ce:
|
|
1501
|
+
rprint(f"[dim] cancel {rid[:8]} failed: {ce}[/dim]")
|
|
1502
|
+
|
|
1503
|
+
try:
|
|
1504
|
+
if timeout >= 60:
|
|
1505
|
+
wait_str = f"up to {timeout//60}h{(" " + str(timeout%60) + "m") if timeout%60 else ""}"
|
|
1506
|
+
else:
|
|
1507
|
+
wait_str = f"up to {timeout}m"
|
|
1508
|
+
rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active ({wait_str}; can queue when cluster is full)...[/cyan]")
|
|
1509
|
+
if is_multinode:
|
|
1510
|
+
results = rm.wait_for_multinode_reservation_completion(reservation_ids, timeout_minutes=timeout)
|
|
1511
|
+
else:
|
|
1512
|
+
single = rm.wait_for_reservation_completion(primary_id, timeout_minutes=timeout)
|
|
1513
|
+
results = [single] if single else None
|
|
1514
|
+
if not results:
|
|
1515
|
+
rprint("[red]❌ Reservation never became active[/red]")
|
|
1516
|
+
maybe_cancel("activation timeout")
|
|
1517
|
+
sys.exit(1)
|
|
1518
|
+
|
|
1519
|
+
# Resolve master pod (rank 0)
|
|
1520
|
+
conn = rm.get_connection_info(primary_id, user_info["user_id"])
|
|
1521
|
+
if not conn:
|
|
1522
|
+
rprint("[red]❌ Could not fetch connection info[/red]")
|
|
1523
|
+
maybe_cancel("no connection info")
|
|
1524
|
+
sys.exit(1)
|
|
1525
|
+
if conn.get("is_multinode"):
|
|
1526
|
+
nodes = sorted(conn["nodes"], key=lambda n: n.get("node_index", 0))
|
|
1527
|
+
master = nodes[0]
|
|
1528
|
+
master_id, master_pod, master_fqdn, master_name = (
|
|
1529
|
+
master["reservation_id"], master["pod_name"],
|
|
1530
|
+
master.get("fqdn"), master.get("name"))
|
|
1531
|
+
else:
|
|
1532
|
+
master_id, master_pod, master_fqdn, master_name = (
|
|
1533
|
+
primary_id, conn["pod_name"], conn.get("fqdn"), conn.get("name"))
|
|
1534
|
+
|
|
1535
|
+
# Ensure SSH config exists
|
|
1536
|
+
gpu_dev_dir = Path.home() / ".gpu-dev"
|
|
1537
|
+
config_file = gpu_dev_dir / f"{master_id[:8]}-sshconfig"
|
|
1538
|
+
if not config_file.exists():
|
|
1539
|
+
if not (master_fqdn and master_pod):
|
|
1540
|
+
rprint("[red]❌ Master pod has no FQDN yet — can't SSH[/red]")
|
|
1541
|
+
maybe_cancel("no fqdn")
|
|
1542
|
+
sys.exit(1)
|
|
1543
|
+
create_ssh_config_for_reservation(master_fqdn, master_pod, master_id, master_name)
|
|
1544
|
+
|
|
1545
|
+
ssh_alias = master_pod
|
|
1546
|
+
ssh_base = ["ssh", "-F", str(config_file), "-o", "StrictHostKeyChecking=accept-new"]
|
|
1547
|
+
rsync_e = " ".join(shlex.quote(x) for x in ssh_base)
|
|
1548
|
+
|
|
1549
|
+
# Working directory and rsync up
|
|
1550
|
+
if runtime:
|
|
1551
|
+
workdir = f"/workspace/submit-{master_id[:8]}"
|
|
1552
|
+
rprint(f"[cyan]📦 Syncing {runtime} → {ssh_alias}:{workdir}[/cyan]")
|
|
1553
|
+
r = subprocess.run(ssh_base + [ssh_alias, f"mkdir -p {shlex.quote(workdir)}"])
|
|
1554
|
+
if r.returncode != 0:
|
|
1555
|
+
rprint("[red]❌ Failed to create remote workspace[/red]")
|
|
1556
|
+
maybe_cancel("mkdir failed"); sys.exit(2)
|
|
1557
|
+
r = subprocess.run([
|
|
1558
|
+
"rsync", "-az", "--delete", "-e", rsync_e,
|
|
1559
|
+
f"{runtime.rstrip('/')}/", f"{ssh_alias}:{workdir}/",
|
|
1560
|
+
])
|
|
1561
|
+
if r.returncode != 0:
|
|
1562
|
+
rprint("[red]❌ Upload rsync failed[/red]")
|
|
1563
|
+
maybe_cancel("upload failed"); sys.exit(2)
|
|
1564
|
+
else:
|
|
1565
|
+
workdir = "/home/dev"
|
|
1566
|
+
|
|
1567
|
+
# Run remote command via login shell so MULTINODE_* etc. are loaded
|
|
1568
|
+
remote_cmd = " ".join(shlex.quote(c) for c in command)
|
|
1569
|
+
rprint(f"[cyan]🚀 Running on {ssh_alias}: {remote_cmd}[/cyan]\n")
|
|
1570
|
+
ssh_run = ssh_base + [ssh_alias,
|
|
1571
|
+
f"cd {shlex.quote(workdir)} && bash -lc {shlex.quote(remote_cmd)}"]
|
|
1572
|
+
rc = subprocess.call(ssh_run)
|
|
1573
|
+
rprint(f"\n[dim]Job exited with code {rc}[/dim]")
|
|
1574
|
+
|
|
1575
|
+
# Sync back results before cancelling
|
|
1576
|
+
if runtime and not no_pull:
|
|
1577
|
+
rprint(f"[cyan]📥 Syncing {ssh_alias}:{workdir}/ → {runtime}[/cyan]")
|
|
1578
|
+
pull = subprocess.run([
|
|
1579
|
+
"rsync", "-az", "-e", rsync_e,
|
|
1580
|
+
f"{ssh_alias}:{workdir}/", f"{runtime.rstrip('/')}/",
|
|
1581
|
+
])
|
|
1582
|
+
if pull.returncode != 0:
|
|
1583
|
+
rprint(f"[yellow]⚠️ Result rsync exited with {pull.returncode} — your output may be incomplete[/yellow]")
|
|
1584
|
+
|
|
1585
|
+
maybe_cancel("job complete")
|
|
1586
|
+
sys.exit(rc)
|
|
1587
|
+
|
|
1588
|
+
except KeyboardInterrupt:
|
|
1589
|
+
rprint("\n[yellow]Interrupted — cancelling[/yellow]")
|
|
1590
|
+
maybe_cancel("user interrupt")
|
|
1591
|
+
sys.exit(130)
|
|
1592
|
+
except SystemExit:
|
|
1593
|
+
raise
|
|
1594
|
+
except Exception as e:
|
|
1595
|
+
rprint(f"[red]❌ Submit error: {e}[/red]")
|
|
1596
|
+
maybe_cancel("submit error")
|
|
1597
|
+
sys.exit(2)
|
|
1598
|
+
|
|
1599
|
+
|
|
1352
1600
|
@main.command()
|
|
1353
1601
|
@click.option(
|
|
1354
1602
|
"--user",
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.17"
|
|
7
|
+
version = "0.5.19"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -1423,6 +1423,11 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
|
|
|
1423
1423
|
logger.info(
|
|
1424
1424
|
f"Starting parallel processing for {total_nodes} nodes")
|
|
1425
1425
|
|
|
1426
|
+
# Deterministic peer pod names by node_index so MULTINODE_RANK aligns with the
|
|
1427
|
+
# position of this pod in MULTINODE_HOSTS across all replicas.
|
|
1428
|
+
nodes_sorted = sorted(nodes, key=lambda n: int(n.get("node_index", 0)))
|
|
1429
|
+
peer_pod_names = [f"gpu-dev-{n['reservation_id'][:8]}" for n in nodes_sorted]
|
|
1430
|
+
|
|
1426
1431
|
def process_single_node(node_data):
|
|
1427
1432
|
"""Process a single node - to be run in parallel"""
|
|
1428
1433
|
i, node = node_data
|
|
@@ -1435,7 +1440,8 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
|
|
|
1435
1440
|
'action': 'process_multinode_individual',
|
|
1436
1441
|
'node_index': int(node_index),
|
|
1437
1442
|
'total_nodes': int(total_nodes),
|
|
1438
|
-
'master_reservation_id': str(master_reservation_id)
|
|
1443
|
+
'master_reservation_id': str(master_reservation_id),
|
|
1444
|
+
'multinode_peer_pods': peer_pod_names,
|
|
1439
1445
|
}
|
|
1440
1446
|
|
|
1441
1447
|
logger.info(
|
|
@@ -1541,6 +1547,12 @@ def process_multinode_individual_node(message_body: dict) -> bool:
|
|
|
1541
1547
|
|
|
1542
1548
|
node_data = response["Item"]
|
|
1543
1549
|
|
|
1550
|
+
# Forward peer pod list from coordinator into request dict so create_pod can
|
|
1551
|
+
# bake MULTINODE_HOSTS / MASTER_ADDR / MULTINODE_RANK env vars into the pod.
|
|
1552
|
+
peer_pods = message_body.get("multinode_peer_pods")
|
|
1553
|
+
if peer_pods:
|
|
1554
|
+
node_data["multinode_peer_pods"] = peer_pods
|
|
1555
|
+
|
|
1544
1556
|
# Update status to preparing pod
|
|
1545
1557
|
update_multinode_pod_status(
|
|
1546
1558
|
reservation_id, "preparing pod", node_index, total_nodes)
|
|
@@ -2888,6 +2900,8 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2888
2900
|
dockerimage=dockerimage,
|
|
2889
2901
|
target_az=target_az,
|
|
2890
2902
|
target_node=target_node,
|
|
2903
|
+
multinode_peer_pods=request.get("multinode_peer_pods"),
|
|
2904
|
+
multinode_rank=int(request.get("node_index", 0)) if is_multinode else 0,
|
|
2891
2905
|
preserve_entrypoint=preserve_entrypoint,
|
|
2892
2906
|
node_labels=node_labels,
|
|
2893
2907
|
trace_data=trace_data,
|
|
@@ -3429,6 +3443,8 @@ def create_kubernetes_resources(
|
|
|
3429
3443
|
efs_filesystem_id: str = None,
|
|
3430
3444
|
is_multinode: bool = False,
|
|
3431
3445
|
target_node: str = None,
|
|
3446
|
+
multinode_peer_pods: list = None,
|
|
3447
|
+
multinode_rank: int = 0,
|
|
3432
3448
|
dockerfile_base64_data: str = None,
|
|
3433
3449
|
dockerimage: str = None,
|
|
3434
3450
|
target_az: str = None,
|
|
@@ -3533,6 +3549,8 @@ def create_kubernetes_resources(
|
|
|
3533
3549
|
dockerimage=dockerimage,
|
|
3534
3550
|
target_az=target_az,
|
|
3535
3551
|
target_node=target_node,
|
|
3552
|
+
multinode_peer_pods=multinode_peer_pods,
|
|
3553
|
+
multinode_rank=multinode_rank,
|
|
3536
3554
|
preserve_entrypoint=preserve_entrypoint,
|
|
3537
3555
|
node_labels=node_labels,
|
|
3538
3556
|
trace_data=trace_data,
|
|
@@ -3620,6 +3638,8 @@ def create_kubernetes_resources(
|
|
|
3620
3638
|
dockerimage=dockerimage,
|
|
3621
3639
|
target_az=target_az,
|
|
3622
3640
|
target_node=target_node,
|
|
3641
|
+
multinode_peer_pods=multinode_peer_pods,
|
|
3642
|
+
multinode_rank=multinode_rank,
|
|
3623
3643
|
preserve_entrypoint=preserve_entrypoint,
|
|
3624
3644
|
node_labels=node_labels,
|
|
3625
3645
|
trace_data=trace_data,
|
|
@@ -3722,6 +3742,30 @@ def find_available_node_port(k8s_client) -> int:
|
|
|
3722
3742
|
return random.randint(30000, 32767)
|
|
3723
3743
|
|
|
3724
3744
|
|
|
3745
|
+
def _mig_slice_fraction(gpu_type: str) -> float:
|
|
3746
|
+
"""For MIG SKUs return slice fraction of a single GPU (1g=1/7, 2g=2/7, ..., 7g=1).
|
|
3747
|
+
|
|
3748
|
+
Slice naming counts GPCs (compute slices). H100 and B200 both have 7 GPCs per GPU
|
|
3749
|
+
in the typical all-balanced profile, so a 1g slice is 1/7 of a GPU regardless of
|
|
3750
|
+
family. Used to size CPU/memory requests proportional to the GPU fraction the pod
|
|
3751
|
+
actually consumes — the older `gpu_count/max_gpus` ratio over-claimed node resources
|
|
3752
|
+
(a 1g slice would claim 1/4 or 1/16 of the host instead of 1/56).
|
|
3753
|
+
"""
|
|
3754
|
+
if "mig" not in gpu_type:
|
|
3755
|
+
return 1.0
|
|
3756
|
+
try:
|
|
3757
|
+
slices = int(gpu_type.split("-mig-")[1].rstrip("g"))
|
|
3758
|
+
except (IndexError, ValueError):
|
|
3759
|
+
return 1.0
|
|
3760
|
+
return slices / 7.0
|
|
3761
|
+
|
|
3762
|
+
|
|
3763
|
+
# Number of full GPUs on the underlying instance — used to convert the slice fraction
|
|
3764
|
+
# into a fraction of the host's CPU/memory. Both p5.48xlarge (H100) and p6-b200.48xlarge
|
|
3765
|
+
# (B200) have 8 GPUs, which matches every MIG-capable instance type we currently run.
|
|
3766
|
+
_FULL_GPUS_PER_MIG_NODE = 8
|
|
3767
|
+
|
|
3768
|
+
|
|
3725
3769
|
def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool = False) -> dict:
|
|
3726
3770
|
"""Get resource limits for pod based on GPU type and deployment mode"""
|
|
3727
3771
|
gpu_count = int(gpu_count)
|
|
@@ -3741,13 +3785,19 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
|
|
|
3741
3785
|
resource_name = config.get("k8s_resource", "nvidia.com/gpu")
|
|
3742
3786
|
limits[resource_name] = str(gpu_count)
|
|
3743
3787
|
|
|
3744
|
-
|
|
3745
|
-
|
|
3746
|
-
|
|
3747
|
-
|
|
3748
|
-
|
|
3749
|
-
|
|
3750
|
-
|
|
3788
|
+
if "mig" in gpu_type:
|
|
3789
|
+
# Scale by GPC fraction (slice of one GPU), not slice count over max slices.
|
|
3790
|
+
slice_fraction = _mig_slice_fraction(gpu_type)
|
|
3791
|
+
cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
|
|
3792
|
+
mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
|
|
3793
|
+
fractional_cpu = cpu_per_full_gpu * slice_fraction * gpu_count
|
|
3794
|
+
proportional_cpu_limit = max(1, min(config["cpus"], int(fractional_cpu * 1.5)))
|
|
3795
|
+
proportional_memory_limit = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count))
|
|
3796
|
+
else:
|
|
3797
|
+
gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
|
|
3798
|
+
fractional_cpu = config["cpus"] * gpu_ratio
|
|
3799
|
+
proportional_cpu_limit = min(config["cpus"], int(fractional_cpu * 1.5))
|
|
3800
|
+
proportional_memory_limit = int(config["memory_gb"] * gpu_ratio)
|
|
3751
3801
|
|
|
3752
3802
|
limits.update({
|
|
3753
3803
|
"cpu": str(proportional_cpu_limit),
|
|
@@ -3787,13 +3837,16 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
|
|
|
3787
3837
|
if gpu_count > 0:
|
|
3788
3838
|
resource_name = config.get("k8s_resource", "nvidia.com/gpu")
|
|
3789
3839
|
requests[resource_name] = str(gpu_count)
|
|
3790
|
-
|
|
3791
|
-
|
|
3792
|
-
|
|
3793
|
-
|
|
3794
|
-
|
|
3795
|
-
|
|
3796
|
-
|
|
3840
|
+
if "mig" in gpu_type:
|
|
3841
|
+
slice_fraction = _mig_slice_fraction(gpu_type)
|
|
3842
|
+
cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
|
|
3843
|
+
mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
|
|
3844
|
+
proportional_cpu_request = max(1, int(cpu_per_full_gpu * slice_fraction * gpu_count * 0.9))
|
|
3845
|
+
proportional_memory_request = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count * 0.9))
|
|
3846
|
+
else:
|
|
3847
|
+
gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
|
|
3848
|
+
proportional_cpu_request = int(config["cpus"] * gpu_ratio * 0.9)
|
|
3849
|
+
proportional_memory_request = int(config["memory_gb"] * gpu_ratio * 0.9)
|
|
3797
3850
|
|
|
3798
3851
|
requests.update({
|
|
3799
3852
|
"cpu": str(proportional_cpu_request),
|
|
@@ -3896,6 +3949,30 @@ def get_nccl_env_vars(gpu_type: str) -> list:
|
|
|
3896
3949
|
return env_vars
|
|
3897
3950
|
|
|
3898
3951
|
|
|
3952
|
+
def _get_multinode_env_vars(peer_pods: list, rank: int) -> list:
|
|
3953
|
+
"""Build env vars exposing peer hostnames/rank/master to the pod.
|
|
3954
|
+
|
|
3955
|
+
Hostnames use the per-pod headless service we already create elsewhere, so they
|
|
3956
|
+
resolve to the current pod IP via cluster DNS even if a pod is recreated. We
|
|
3957
|
+
don\'t inject IPs at pod-creation time (they aren\'t known until kube schedules
|
|
3958
|
+
everyone) — the bashrc/zshrc helper resolves and exports MULTINODE_IPS at shell
|
|
3959
|
+
start, and a /usr/local/bin/multinode-ips helper is available for non-interactive
|
|
3960
|
+
callers.
|
|
3961
|
+
"""
|
|
3962
|
+
if not peer_pods or len(peer_pods) <= 1:
|
|
3963
|
+
return []
|
|
3964
|
+
namespace = "gpu-dev"
|
|
3965
|
+
hosts = [f"{p}-headless.{namespace}.svc.cluster.local" for p in peer_pods]
|
|
3966
|
+
return [
|
|
3967
|
+
client.V1EnvVar(name="MULTINODE_HOSTS", value=",".join(hosts)),
|
|
3968
|
+
client.V1EnvVar(name="MULTINODE_PEER_PODS", value=",".join(peer_pods)),
|
|
3969
|
+
client.V1EnvVar(name="MULTINODE_RANK", value=str(rank)),
|
|
3970
|
+
client.V1EnvVar(name="MULTINODE_SIZE", value=str(len(peer_pods))),
|
|
3971
|
+
client.V1EnvVar(name="MASTER_ADDR", value=hosts[0]),
|
|
3972
|
+
client.V1EnvVar(name="MASTER_PORT", value="29500"),
|
|
3973
|
+
]
|
|
3974
|
+
|
|
3975
|
+
|
|
3899
3976
|
def create_pod(
|
|
3900
3977
|
k8s_client,
|
|
3901
3978
|
pod_name: str,
|
|
@@ -3913,6 +3990,8 @@ def create_pod(
|
|
|
3913
3990
|
dockerimage: str = None,
|
|
3914
3991
|
target_az: str = None,
|
|
3915
3992
|
target_node: str = None,
|
|
3993
|
+
multinode_peer_pods: list = None,
|
|
3994
|
+
multinode_rank: int = 0,
|
|
3916
3995
|
preserve_entrypoint: bool = False,
|
|
3917
3996
|
node_labels: dict = None,
|
|
3918
3997
|
trace_data: dict = None,
|
|
@@ -4397,6 +4476,16 @@ EOF_PROFILE
|
|
|
4397
4476
|
# User identification
|
|
4398
4477
|
export GPU_DEV_USER_ID="{user_id or 'dev'}"
|
|
4399
4478
|
|
|
4479
|
+
# Multinode peer info — inlined from container env at pod startup. sshd strips
|
|
4480
|
+
# container env vars from login shells, so we materialize the values into rc files.
|
|
4481
|
+
# Skipped (empty exports) for single-node reservations where MULTINODE_* aren't set.
|
|
4482
|
+
export MULTINODE_HOSTS="$MULTINODE_HOSTS"
|
|
4483
|
+
export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
|
|
4484
|
+
export MULTINODE_RANK="$MULTINODE_RANK"
|
|
4485
|
+
export MULTINODE_SIZE="$MULTINODE_SIZE"
|
|
4486
|
+
export MASTER_ADDR="$MASTER_ADDR"
|
|
4487
|
+
export MASTER_PORT="$MASTER_PORT"
|
|
4488
|
+
|
|
4400
4489
|
# Function to check for GPU reservation expiry warnings and startup script status
|
|
4401
4490
|
check_warnings() {{
|
|
4402
4491
|
# Check for startup script still running
|
|
@@ -4415,6 +4504,22 @@ check_warnings() {{
|
|
|
4415
4504
|
|
|
4416
4505
|
# Run warning check before every command prompt
|
|
4417
4506
|
PROMPT_COMMAND="check_warnings; \$PROMPT_COMMAND"
|
|
4507
|
+
|
|
4508
|
+
# Multinode peer IP resolution: MULTINODE_HOSTS is baked at pod creation, but per-pod
|
|
4509
|
+
# IPs are only known once kube schedules them. Resolve at shell start so users can do
|
|
4510
|
+
# torchrun --master_addr=\$MASTER_ADDR or mpirun -H "\$MULTINODE_IPS" without extra steps.
|
|
4511
|
+
if [ -n "\$MULTINODE_HOSTS" ]; then
|
|
4512
|
+
_MULTINODE_IPS=""
|
|
4513
|
+
for _h in \$(echo "\$MULTINODE_HOSTS" | tr ',' ' '); do
|
|
4514
|
+
_ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
|
|
4515
|
+
if [ -n "\$_ip" ]; then
|
|
4516
|
+
_MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
|
|
4517
|
+
fi
|
|
4518
|
+
done
|
|
4519
|
+
export MULTINODE_IPS="\$_MULTINODE_IPS"
|
|
4520
|
+
[ -n "\$MULTINODE_IPS" ] && export MASTER_IP=\$(echo "\$MULTINODE_IPS" | cut -d, -f1)
|
|
4521
|
+
unset _MULTINODE_IPS _h _ip
|
|
4522
|
+
fi
|
|
4418
4523
|
EOF_BASHRC_EXT
|
|
4419
4524
|
|
|
4420
4525
|
cat > /home/dev/.zshrc_ext << EOF_ZSHRC_EXT
|
|
@@ -4425,6 +4530,15 @@ EOF_BASHRC_EXT
|
|
|
4425
4530
|
# User identification
|
|
4426
4531
|
export GPU_DEV_USER_ID="{user_id or 'dev'}"
|
|
4427
4532
|
|
|
4533
|
+
# Multinode peer info — inlined from container env at pod startup. sshd strips
|
|
4534
|
+
# container env vars from login shells, so we materialize the values into rc files.
|
|
4535
|
+
export MULTINODE_HOSTS="$MULTINODE_HOSTS"
|
|
4536
|
+
export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
|
|
4537
|
+
export MULTINODE_RANK="$MULTINODE_RANK"
|
|
4538
|
+
export MULTINODE_SIZE="$MULTINODE_SIZE"
|
|
4539
|
+
export MASTER_ADDR="$MASTER_ADDR"
|
|
4540
|
+
export MASTER_PORT="$MASTER_PORT"
|
|
4541
|
+
|
|
4428
4542
|
# Function to check for GPU reservation expiry warnings and startup script status
|
|
4429
4543
|
check_warnings() {{
|
|
4430
4544
|
# Check for startup script still running
|
|
@@ -4444,6 +4558,20 @@ check_warnings() {{
|
|
|
4444
4558
|
|
|
4445
4559
|
# Run warning check before every command prompt (zsh hook)
|
|
4446
4560
|
precmd() {{ check_warnings }}
|
|
4561
|
+
|
|
4562
|
+
# Multinode peer IP resolution (see .bashrc_ext for rationale)
|
|
4563
|
+
if [[ -n "\$MULTINODE_HOSTS" ]]; then
|
|
4564
|
+
_MULTINODE_IPS=""
|
|
4565
|
+
for _h in \${{(s:,:)MULTINODE_HOSTS}}; do
|
|
4566
|
+
_ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
|
|
4567
|
+
if [[ -n "\$_ip" ]]; then
|
|
4568
|
+
_MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
|
|
4569
|
+
fi
|
|
4570
|
+
done
|
|
4571
|
+
export MULTINODE_IPS="\$_MULTINODE_IPS"
|
|
4572
|
+
[[ -n "\$MULTINODE_IPS" ]] && export MASTER_IP="\${{MULTINODE_IPS%%,*}}"
|
|
4573
|
+
unset _MULTINODE_IPS _h _ip
|
|
4574
|
+
fi
|
|
4447
4575
|
EOF_ZSHRC_EXT
|
|
4448
4576
|
|
|
4449
4577
|
chown 1081:1081 /home/dev/.bashrc_ext /home/dev/.zshrc_ext
|
|
@@ -5174,7 +5302,7 @@ EOF
|
|
|
5174
5302
|
client.V1EnvVar(
|
|
5175
5303
|
name="NVIDIA_DRIVER_CAPABILITIES", value="compute,utility"
|
|
5176
5304
|
)
|
|
5177
|
-
] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type),
|
|
5305
|
+
] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
|
|
5178
5306
|
resources=client.V1ResourceRequirements(
|
|
5179
5307
|
limits=get_pod_resource_limits(
|
|
5180
5308
|
gpu_count, gpu_type, is_multinode),
|
|
@@ -6319,6 +6447,9 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
|
|
|
6319
6447
|
"nvidia.com/mig-3g.40gb": "h100-mig-3g",
|
|
6320
6448
|
"nvidia.com/mig-4g.40gb": "h100-mig-4g",
|
|
6321
6449
|
"nvidia.com/mig-7g.80gb": "h100-mig-7g",
|
|
6450
|
+
"nvidia.com/mig-1g.23gb": "b200-mig-1g",
|
|
6451
|
+
"nvidia.com/mig-2g.45gb": "b200-mig-2g",
|
|
6452
|
+
"nvidia.com/mig-3g.90gb": "b200-mig-3g",
|
|
6322
6453
|
}
|
|
6323
6454
|
if pod.spec.containers:
|
|
6324
6455
|
for c in pod.spec.containers:
|
|
@@ -31,9 +31,14 @@ def get_bearer_token() -> str:
|
|
|
31
31
|
"""
|
|
32
32
|
Create a k8s-aws-v1 bearer token by presigning STS:GetCallerIdentity.
|
|
33
33
|
IMPORTANT: base64url-encode the FULL presigned URL, then strip padding.
|
|
34
|
+
|
|
35
|
+
expires_in must match _EFFECTIVE_TOKEN_TTL: previously this was 60s while the cache
|
|
36
|
+
held the token for 14 min, so warm Lambda containers handed EKS expired URLs and got
|
|
37
|
+
401s for ~13 min until the next refresh. 900s is the typical EKS get-token default
|
|
38
|
+
and the max for IAM-role-derived presigned URLs.
|
|
34
39
|
"""
|
|
35
40
|
logger.info("Starting bearer token generation")
|
|
36
|
-
STS_TOKEN_EXPIRES_IN = 60
|
|
41
|
+
STS_TOKEN_EXPIRES_IN = 900
|
|
37
42
|
session = boto3.session.Session(region_name=REGION)
|
|
38
43
|
logger.info(f"Created boto3 session for region {REGION}")
|
|
39
44
|
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.22"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.16"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# `gpu-dev submit` smoke tests
|
|
2
|
+
|
|
3
|
+
Three tests that exercise `gpu-dev submit` end-to-end. Each test lives in its
|
|
4
|
+
own folder so you can `--runtime` it directly. Output files written by the
|
|
5
|
+
script are pulled back into the same folder via the post-run rsync.
|
|
6
|
+
|
|
7
|
+
> Requires `gpu-dev >= 0.5.19`. No Lambda update needed.
|
|
8
|
+
|
|
9
|
+
## 1. success — single T4 GPU, exit 0
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
cd tests/submit/success
|
|
13
|
+
gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
|
|
14
|
+
echo $? # 0
|
|
15
|
+
ls # nvidia-info.txt, compute.txt, status.txt all created
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## 2. fail — single T4 GPU, exit 7
|
|
19
|
+
|
|
20
|
+
Writes a partial file before exploding so you can confirm rsync still pulls
|
|
21
|
+
output on failure and the local exit code is the remote's.
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
cd tests/submit/fail
|
|
25
|
+
gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
|
|
26
|
+
echo $? # 7
|
|
27
|
+
ls # step1.txt, step2.txt, gpus-before-fail.txt — but no step3.txt
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## 3. multinode — 2x H100 nodes, exit 0
|
|
31
|
+
|
|
32
|
+
Reserves 16 H100s (= 2 nodes), verifies env vars + peer ssh + NCCL all_reduce
|
|
33
|
+
across the whole cluster via mpirun (orchestrated entirely from rank 0).
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
cd tests/submit/multinode
|
|
37
|
+
gpu-dev submit --gpu-type h100 --gpus 16 --runtime ./ -- bash run.sh
|
|
38
|
+
echo $? # 0
|
|
39
|
+
cat multinode-env.txt resolved-ips.txt peer-ssh.txt nccl-all_reduce.log
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## What each test proves
|
|
43
|
+
|
|
44
|
+
| Test | Proves |
|
|
45
|
+
|------------|-------------------------------------------------------------------------------|
|
|
46
|
+
| success | reserve → rsync up → exec → rsync back → cancel → exit 0 |
|
|
47
|
+
| fail | exit code propagation; rsync-back still runs on non-zero exit; cancel fires |
|
|
48
|
+
| multinode | MULTINODE_* env vars; peer DNS / passwordless ssh; cross-node NCCL via mpirun |
|
|
49
|
+
|
|
50
|
+
After every run, `gpu-dev list` should show no leftover reservation — each one is auto-cancelled.
|
|
51
|
+
Use `--keep-alive` on any of them if you want to debug interactively afterward.
|
|
52
|
+
|
|
53
|
+
## Other submit flags (forwarded to `reserve`)
|
|
54
|
+
|
|
55
|
+
- `--hours N` — reservation lifetime ceiling (default 1.0)
|
|
56
|
+
- `--disk NAME` — attach a persistent disk to the master node
|
|
57
|
+
- `--no-persistent-disk` — skip persistent disk
|
|
58
|
+
- `--dockerfile PATH` — build a custom image from this Dockerfile
|
|
59
|
+
- `--dockerimage REF` — use a pre-built container image
|
|
60
|
+
- `--preserve-entrypoint` — keep the custom image's ENTRYPOINT (you must run sshd yourself for submit to work)
|
|
61
|
+
- `--timeout MINUTES` — wait-for-active timeout (default 1440 = 24h, since reservations may queue)
|
|
62
|
+
- `--no-pull` — skip the post-run sync-back
|
|
63
|
+
- `--keep-alive` — skip auto-cancel
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Failure test for `gpu-dev submit`: writes a partial output, then exits 7.
# Verifies the post-run rsync still pulls the partial files even on failure,
# the auto-cancel runs on non-zero exit, and the local exit code is preserved.
set -euo pipefail

# Write the artifacts next to this script regardless of the caller's cwd, so
# they always land in the folder that gets synced back.
cd "$(dirname "$0")"

echo "=== host ==="
hostname
date -u

# Write a partial file so we can verify it was synced back
echo "step1 done at $(date -u)" > step1.txt
nvidia-smi -L > gpus-before-fail.txt

# Now error out: errexit turns python's exit status 7 into the script's own.
echo "About to fail..." > step2.txt
python3 -c "import sys; sys.exit(7)"

# Should not reach here
echo "should-not-appear" > step3.txt
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Multinode test for `gpu-dev submit`: runs on rank 0 only and orchestrates the
# whole cluster via mpirun (uses passwordless ssh + the headless service DNS we
# already set up). Verifies env vars, peer connectivity, and an actual NCCL
# all_reduce across all nodes.
set -euo pipefail
cd "$(dirname "$0")"

echo "=== rank 0 host: $(hostname) at $(date -u) ==="

echo "=== multinode env ==="
# Every variable is expanded with ${VAR:-}: under `set -u` a bare $VAR would
# abort with "unbound variable" on a single-node reservation, before the
# explicit MULTINODE_HOSTS check below gets a chance to print its error.
{
  echo "MULTINODE_HOSTS=${MULTINODE_HOSTS:-}"
  echo "MULTINODE_PEER_PODS=${MULTINODE_PEER_PODS:-}"
  echo "MULTINODE_RANK=${MULTINODE_RANK:-}"
  echo "MULTINODE_SIZE=${MULTINODE_SIZE:-}"
  echo "MASTER_ADDR=${MASTER_ADDR:-}"
  echo "MASTER_PORT=${MASTER_PORT:-}"
  echo "MULTINODE_IPS=${MULTINODE_IPS:-(not set)}"
} | tee multinode-env.txt

if [[ -z "${MULTINODE_HOSTS:-}" ]]; then
  echo "ERROR: MULTINODE_HOSTS empty — submit with --gpus >= 16 on h100" >&2
  exit 2
fi

# Resolve IPs even if the bashrc helper didn't run (defensive)
IPS=""
for h in $(echo "$MULTINODE_HOSTS" | tr ',' ' '); do
  # `|| true`: with pipefail a failed getent lookup makes the whole pipeline
  # non-zero, and errexit would kill the script on this assignment.
  ip=$(getent hosts "$h" | awk '{print $1}' | head -1) || true
  if [[ -n "$ip" ]]; then
    IPS="${IPS:+$IPS,}$ip"
  else
    echo "WARNING: could not resolve $h" >&2
  fi
done
echo "Resolved IPS=$IPS" | tee resolved-ips.txt

# Fail with a clear message rather than handing mpirun an empty --host list.
if [[ -z "$IPS" ]]; then
  echo "ERROR: none of the MULTINODE_HOSTS entries resolved to an IP" >&2
  exit 3
fi

echo "=== peer ssh check (port 2222 inside cluster) ==="
peer_host=$(echo "$MULTINODE_HOSTS" | cut -d, -f2)
ssh -o StrictHostKeyChecking=no -p 2222 "$peer_host" 'hostname; nvidia-smi -L | wc -l' \
  | tee peer-ssh.txt

GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
echo "GPUS_PER_NODE=$GPUS_PER_NODE" | tee gpus-per-node.txt

# Build --host arg: ip1:N,ip2:N,...
HOST_ARG=$(echo "$IPS" | awk -v g="$GPUS_PER_NODE" -F, '{out=""; for(i=1;i<=NF;i++){out=out ($i ":" g) (i<NF?",":"")}; print out}')
echo "HOST_ARG=$HOST_ARG"

echo "=== NCCL all_reduce_perf via mpirun ==="
# Note: -g 1 = 1 GPU per process, -n 20 iterations. Sweep 1M..1G in factor-of-2 steps.
mpirun --host "$HOST_ARG" \
  --mca plm_rsh_args "-p 2222 -o StrictHostKeyChecking=no" \
  -x PATH -x LD_LIBRARY_PATH \
  -x FI_PROVIDER -x FI_EFA_USE_DEVICE_RDMA \
  -x NCCL_NET_GDR_LEVEL -x NCCL_ALGO \
  -x NCCL_SOCKET_IFNAME -x NCCL_DEBUG -x NCCL_IB_HCA \
  /opt/nccl-tests/build/all_reduce_perf -b 1M -e 1G -f 2 -g 1 -n 20 \
  2>&1 | tee nccl-all_reduce.log

echo "=== summary ==="
{
  echo "rank=${MULTINODE_RANK:-} size=${MULTINODE_SIZE:-}"
  echo "host_arg=$HOST_ARG"
  echo "completed at $(date -u)"
} | tee summary.txt

echo "DONE"
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Smoke test for `gpu-dev submit`: runs on a single GPU, expected exit 0.
set -euo pipefail

# Write the artifacts next to this script regardless of the caller's cwd, so
# they always land in the folder that gets synced back.
cd "$(dirname "$0")"

echo "=== host ==="
hostname
date -u

echo "=== nvidia-smi ==="
nvidia-smi | tee nvidia-info.txt

echo "=== compute ==="
# pipefail makes a failing python (e.g. the CUDA assertion) fail the script
# even though tee itself succeeds.
python3 - <<'PY' | tee compute.txt
import torch
assert torch.cuda.is_available(), "CUDA not available"
n = torch.cuda.device_count()
x = torch.arange(1_000_000, device="cuda", dtype=torch.float32)
s = x.sum().item()
print(f"devices={n} sum(0..999_999)={s}")
PY

echo "ok at $(date -u)" > status.txt
echo "DONE"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|