gpu-dev 0.5.18__tar.gz → 0.5.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/PKG-INFO +1 -1
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +5 -1
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +64 -6
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/pyproject.toml +1 -1
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/Dockerfile +2 -1
- gpu_dev-0.5.20/tests/submit/README.md +63 -0
- gpu_dev-0.5.20/tests/submit/fail/run.sh +20 -0
- gpu_dev-0.5.20/tests/submit/multinode/run.sh +65 -0
- gpu_dev-0.5.20/tests/submit/success/run.sh +23 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/.gitignore +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/CLAUDE.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/PROGRESS.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/TODO.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/admin/README.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/post.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/setup.cfg +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -112,4 +112,8 @@ terraform-gpu-devservers/ssh-proxy/requirements.txt
|
|
|
112
112
|
terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
|
|
113
113
|
terraform-gpu-devservers/templates/al2023-user-data.sh
|
|
114
114
|
terraform-gpu-devservers/templates/user-data-self-managed.sh
|
|
115
|
-
terraform-gpu-devservers/templates/user-data.sh
|
|
115
|
+
terraform-gpu-devservers/templates/user-data.sh
|
|
116
|
+
tests/submit/README.md
|
|
117
|
+
tests/submit/fail/run.sh
|
|
118
|
+
tests/submit/multinode/run.sh
|
|
119
|
+
tests/submit/success/run.sh
|
|
@@ -1357,18 +1357,26 @@ _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200"
|
|
|
1357
1357
|
@main.command(context_settings={"ignore_unknown_options": True})
|
|
1358
1358
|
@click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
|
|
1359
1359
|
@click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
|
|
1360
|
-
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation
|
|
1360
|
+
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling — job auto-cancels well before this if it finishes.")
|
|
1361
1361
|
@click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
|
|
1362
1362
|
@click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
|
|
1363
|
+
@click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
|
|
1364
|
+
help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
|
|
1365
|
+
@click.option("--dockerimage", type=str, default=None,
|
|
1366
|
+
help="Pre-built container image reference (e.g. ghcr.io/me/img:tag) to run instead of the default.")
|
|
1367
|
+
@click.option("--preserve-entrypoint", is_flag=True,
|
|
1368
|
+
help="Keep the custom image's ENTRYPOINT/CMD instead of letting gpu-dev wrap with the SSH harness. Note: submit needs SSH to work.")
|
|
1363
1369
|
@click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
|
|
1364
1370
|
help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
|
|
1365
1371
|
@click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
|
|
1366
1372
|
@click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
|
|
1367
1373
|
@click.option("--name", type=str, default=None, help="Reservation name.")
|
|
1368
|
-
@click.option("--timeout", type=int, default=
|
|
1374
|
+
@click.option("--timeout", type=int, default=24 * 60, show_default=True,
|
|
1375
|
+
help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
|
|
1369
1376
|
@click.argument("command", nargs=-1, required=True)
|
|
1370
1377
|
@click.pass_context
|
|
1371
|
-
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk,
|
|
1378
|
+
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
|
|
1379
|
+
runtime, no_pull, keep_alive, name, timeout, command):
|
|
1372
1380
|
"""Submit a job: reserve, sync code, run, sync results back, auto-cancel.
|
|
1373
1381
|
|
|
1374
1382
|
\b
|
|
@@ -1390,6 +1398,15 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
|
|
|
1390
1398
|
rprint("[red]❌ Provide a command after --, e.g. gpu-dev submit --runtime ./ -- python train.py[/red]")
|
|
1391
1399
|
sys.exit(2)
|
|
1392
1400
|
|
|
1401
|
+
# rsync is on macOS by default and on virtually every Linux distro; bail early with a
|
|
1402
|
+
# readable message if the user has somehow uninstalled it locally rather than failing
|
|
1403
|
+
# mid-flight after the reservation has already been created.
|
|
1404
|
+
if runtime:
|
|
1405
|
+
import shutil
|
|
1406
|
+
if not shutil.which("rsync"):
|
|
1407
|
+
rprint("[red]❌ rsync not found on PATH locally. Install it (Mac: 'brew install rsync', Debian/Ubuntu: 'sudo apt install rsync') and retry.[/red]")
|
|
1408
|
+
sys.exit(2)
|
|
1409
|
+
|
|
1393
1410
|
gt = gpu_type.lower()
|
|
1394
1411
|
# Per-type max GPUs (mirrors gpu_configs in reserve flow)
|
|
1395
1412
|
max_per_node = {
|
|
@@ -1421,12 +1438,47 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
|
|
|
1421
1438
|
# SSH into rank 0, so passing --disk is fine.
|
|
1422
1439
|
disk_name = None if no_persistent_disk else disk
|
|
1423
1440
|
|
|
1441
|
+
# Build dockerfile context if provided (mirrors the reserve-flow logic)
|
|
1442
|
+
dockerfile_payload = None
|
|
1443
|
+
if dockerfile:
|
|
1444
|
+
import os, tarfile, tempfile, base64
|
|
1445
|
+
if os.path.getsize(dockerfile) > 512 * 1024:
|
|
1446
|
+
rprint("[red]❌ Dockerfile too large (max 512KB)[/red]")
|
|
1447
|
+
sys.exit(2)
|
|
1448
|
+
ctx_dir = os.path.dirname(os.path.abspath(dockerfile))
|
|
1449
|
+
rprint(f"[cyan]📦 Building tar.gz context from {ctx_dir}[/cyan]")
|
|
1450
|
+
with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
|
|
1451
|
+
with tarfile.open(tmp.name, "w:gz") as tar:
|
|
1452
|
+
for root, _, files in os.walk(ctx_dir):
|
|
1453
|
+
for f in files:
|
|
1454
|
+
full = os.path.join(root, f)
|
|
1455
|
+
tar.add(full, arcname=os.path.relpath(full, ctx_dir))
|
|
1456
|
+
if os.path.basename(dockerfile).lower() != "dockerfile":
|
|
1457
|
+
tar.add(dockerfile, arcname="Dockerfile")
|
|
1458
|
+
tar_size = os.path.getsize(tmp.name)
|
|
1459
|
+
if tar_size > 700 * 1024:
|
|
1460
|
+
os.unlink(tmp.name)
|
|
1461
|
+
rprint(f"[red]❌ Build context too large: {tar_size}B (max ~700KB compressed)[/red]")
|
|
1462
|
+
sys.exit(2)
|
|
1463
|
+
with open(tmp.name, "rb") as fh:
|
|
1464
|
+
dockerfile_payload = base64.b64encode(fh.read()).decode("utf-8")
|
|
1465
|
+
os.unlink(tmp.name)
|
|
1466
|
+
rprint(f"[green]✅ Dockerfile context: {tar_size}B compressed[/green]")
|
|
1467
|
+
|
|
1468
|
+
if dockerimage and not preserve_entrypoint:
|
|
1469
|
+
rprint("[dim]Note: passing --dockerimage without --preserve-entrypoint, so gpu-dev wraps the image with the SSH harness.[/dim]")
|
|
1470
|
+
if preserve_entrypoint and not (dockerfile or dockerimage):
|
|
1471
|
+
rprint("[red]❌ --preserve-entrypoint requires --dockerfile or --dockerimage[/red]")
|
|
1472
|
+
sys.exit(2)
|
|
1473
|
+
|
|
1424
1474
|
rprint(f"[cyan]🎫 Reserving {gpus}x {gpu_type.upper()} for up to {hours}h...[/cyan]")
|
|
1425
1475
|
if is_multinode:
|
|
1426
1476
|
reservation_ids = rm.create_multinode_reservation(
|
|
1427
1477
|
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1428
1478
|
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1429
|
-
no_persistent_disk=no_persistent_disk, disk_name=disk_name
|
|
1479
|
+
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1480
|
+
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1481
|
+
preserve_entrypoint=preserve_entrypoint)
|
|
1430
1482
|
if not reservation_ids:
|
|
1431
1483
|
rprint("[red]❌ Failed to create multinode reservation[/red]")
|
|
1432
1484
|
sys.exit(2)
|
|
@@ -1435,7 +1487,9 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
|
|
|
1435
1487
|
primary_id = rm.create_reservation(
|
|
1436
1488
|
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1437
1489
|
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1438
|
-
no_persistent_disk=no_persistent_disk, disk_name=disk_name
|
|
1490
|
+
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1491
|
+
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1492
|
+
preserve_entrypoint=preserve_entrypoint)
|
|
1439
1493
|
if not primary_id:
|
|
1440
1494
|
rprint("[red]❌ Failed to create reservation[/red]")
|
|
1441
1495
|
sys.exit(2)
|
|
@@ -1456,7 +1510,11 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
|
|
|
1456
1510
|
rprint(f"[dim] cancel {rid[:8]} failed: {ce}[/dim]")
|
|
1457
1511
|
|
|
1458
1512
|
try:
|
|
1459
|
-
|
|
1513
|
+
if timeout >= 60:
|
|
1514
|
+
wait_str = f"up to {timeout//60}h{(" " + str(timeout%60) + "m") if timeout%60 else ""}"
|
|
1515
|
+
else:
|
|
1516
|
+
wait_str = f"up to {timeout}m"
|
|
1517
|
+
rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active ({wait_str}; can queue when cluster is full)...[/cyan]")
|
|
1460
1518
|
if is_multinode:
|
|
1461
1519
|
results = rm.wait_for_multinode_reservation_completion(reservation_ids, timeout_minutes=timeout)
|
|
1462
1520
|
else:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.
|
|
7
|
+
version = "0.5.20"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -36,7 +36,8 @@ RUN apt-get install -y --no-install-recommends \
|
|
|
36
36
|
unzip \
|
|
37
37
|
ccache \
|
|
38
38
|
htop \
|
|
39
|
-
tree
|
|
39
|
+
tree \
|
|
40
|
+
rsync
|
|
40
41
|
# Install Node.js 20 from NodeSource (required for Claude CLI)
|
|
41
42
|
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
|
|
42
43
|
apt-get install -y nodejs
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# `gpu-dev submit` smoke tests
|
|
2
|
+
|
|
3
|
+
Three tests that exercise `gpu-dev submit` end-to-end. Each test lives in its
|
|
4
|
+
own folder so you can `--runtime` it directly. Output files written by the
|
|
5
|
+
script are pulled back into the same folder via the post-run rsync.
|
|
6
|
+
|
|
7
|
+
> Requires `gpu-dev >= 0.5.19`. No Lambda update needed.
|
|
8
|
+
|
|
9
|
+
## 1. success — single T4 GPU, exit 0
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
cd tests/submit/success
|
|
13
|
+
gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
|
|
14
|
+
echo $? # 0
|
|
15
|
+
ls # nvidia-info.txt, compute.txt, status.txt all created
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## 2. fail — single T4 GPU, exit 7
|
|
19
|
+
|
|
20
|
+
Writes a partial file before exploding so you can confirm rsync still pulls
|
|
21
|
+
output on failure and the local exit code is the remote's.
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
cd tests/submit/fail
|
|
25
|
+
gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
|
|
26
|
+
echo $? # 7
|
|
27
|
+
ls # step1.txt, step2.txt, gpus-before-fail.txt — but no step3.txt
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## 3. multinode — 2x H100 nodes, exit 0
|
|
31
|
+
|
|
32
|
+
Reserves 16 H100s (= 2 nodes), verifies env vars + peer ssh + NCCL all_reduce
|
|
33
|
+
across the whole cluster via mpirun (orchestrated entirely from rank 0).
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
cd tests/submit/multinode
|
|
37
|
+
gpu-dev submit --gpu-type h100 --gpus 16 --runtime ./ -- bash run.sh
|
|
38
|
+
echo $? # 0
|
|
39
|
+
cat multinode-env.txt resolved-ips.txt peer-ssh.txt nccl-all_reduce.log
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## What each test proves
|
|
43
|
+
|
|
44
|
+
| Test | Proves |
|
|
45
|
+
|------------|-------------------------------------------------------------------------------|
|
|
46
|
+
| success | reserve → rsync up → exec → rsync back → cancel → exit 0 |
|
|
47
|
+
| fail | exit code propagation; rsync-back still runs on non-zero exit; cancel fires |
|
|
48
|
+
| multinode | MULTINODE_* env vars; peer DNS / passwordless ssh; cross-node NCCL via mpirun |
|
|
49
|
+
|
|
50
|
+
After every run, `gpu-dev list` should show no leftover reservations from the test — each one is auto-cancelled when the job exits.
|
|
51
|
+
Use `--keep-alive` on any of them if you want to debug interactively afterward.
|
|
52
|
+
|
|
53
|
+
## Other submit flags (reservation options are forwarded to `reserve`; the rest control submit itself)
|
|
54
|
+
|
|
55
|
+
- `--hours N` — reservation lifetime ceiling (default 1.0)
|
|
56
|
+
- `--disk NAME` — attach a persistent disk to the master node
|
|
57
|
+
- `--no-persistent-disk` — skip persistent disk
|
|
58
|
+
- `--dockerfile PATH` — build a custom image from this Dockerfile
|
|
59
|
+
- `--dockerimage REF` — use a pre-built container image
|
|
60
|
+
- `--preserve-entrypoint` — keep the custom image's ENTRYPOINT (you must run sshd yourself for submit to work)
|
|
61
|
+
- `--timeout MINUTES` — wait-for-active timeout (default 1440 = 24h, since reservations may queue)
|
|
62
|
+
- `--no-pull` — skip the post-run sync-back
|
|
63
|
+
- `--keep-alive` — skip auto-cancel
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Failure test for `gpu-dev submit`: writes a partial output, then exits 7.
|
|
3
|
+
# Verifies the post-run rsync still pulls the partial files even on failure,
|
|
4
|
+
# the auto-cancel runs on non-zero exit, and the local exit code is preserved.
|
|
5
|
+
set -e
|
|
6
|
+
|
|
7
|
+
echo "=== host ==="
|
|
8
|
+
hostname
|
|
9
|
+
date -u
|
|
10
|
+
|
|
11
|
+
# Write a partial file so we can verify it was synced back
|
|
12
|
+
echo "step1 done at $(date -u)" > step1.txt
|
|
13
|
+
nvidia-smi -L > gpus-before-fail.txt
|
|
14
|
+
|
|
15
|
+
# Now error out
|
|
16
|
+
echo "About to fail..." > step2.txt
|
|
17
|
+
python3 -c "import sys; sys.exit(7)"
|
|
18
|
+
|
|
19
|
+
# Should not reach here
|
|
20
|
+
echo "should-not-appear" > step3.txt
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Multinode test for `gpu-dev submit`: runs on rank 0 only and orchestrates the
|
|
3
|
+
# whole cluster via mpirun (uses passwordless ssh + the headless service DNS we
|
|
4
|
+
# already set up). Verifies env vars, peer connectivity, and an actual NCCL
|
|
5
|
+
# all_reduce across all nodes.
|
|
6
|
+
set -euo pipefail
|
|
7
|
+
cd "$(dirname "$0")"
|
|
8
|
+
|
|
9
|
+
echo "=== rank 0 host: $(hostname) at $(date -u) ==="
|
|
10
|
+
|
|
11
|
+
echo "=== multinode env ==="
|
|
12
|
+
{
|
|
13
|
+
echo "MULTINODE_HOSTS=$MULTINODE_HOSTS"
|
|
14
|
+
echo "MULTINODE_PEER_PODS=$MULTINODE_PEER_PODS"
|
|
15
|
+
echo "MULTINODE_RANK=$MULTINODE_RANK"
|
|
16
|
+
echo "MULTINODE_SIZE=$MULTINODE_SIZE"
|
|
17
|
+
echo "MASTER_ADDR=$MASTER_ADDR"
|
|
18
|
+
echo "MASTER_PORT=$MASTER_PORT"
|
|
19
|
+
echo "MULTINODE_IPS=${MULTINODE_IPS:-(not set)}"
|
|
20
|
+
} | tee multinode-env.txt
|
|
21
|
+
|
|
22
|
+
if [[ -z "${MULTINODE_HOSTS:-}" ]]; then
|
|
23
|
+
echo "ERROR: MULTINODE_HOSTS empty — submit with --gpus >= 16 on h100" >&2
|
|
24
|
+
exit 2
|
|
25
|
+
fi
|
|
26
|
+
|
|
27
|
+
# Resolve IPs even if the bashrc helper didn't run (defensive)
|
|
28
|
+
IPS=""
|
|
29
|
+
for h in $(echo "$MULTINODE_HOSTS" | tr ',' ' '); do
|
|
30
|
+
ip=$(getent hosts "$h" | awk '{print $1}' | head -1)
|
|
31
|
+
[[ -n "$ip" ]] && IPS="${IPS:+$IPS,}$ip"
|
|
32
|
+
done
|
|
33
|
+
echo "Resolved IPS=$IPS" | tee resolved-ips.txt
|
|
34
|
+
|
|
35
|
+
echo "=== peer ssh check (port 2222 inside cluster) ==="
|
|
36
|
+
peer_host=$(echo "$MULTINODE_HOSTS" | cut -d, -f2)
|
|
37
|
+
ssh -o StrictHostKeyChecking=no -p 2222 "$peer_host" 'hostname; nvidia-smi -L | wc -l' \
|
|
38
|
+
| tee peer-ssh.txt
|
|
39
|
+
|
|
40
|
+
GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
|
|
41
|
+
echo "GPUS_PER_NODE=$GPUS_PER_NODE" | tee gpus-per-node.txt
|
|
42
|
+
|
|
43
|
+
# Build --host arg: ip1:N,ip2:N,...
|
|
44
|
+
HOST_ARG=$(echo "$IPS" | awk -v g="$GPUS_PER_NODE" -F, '{out=""; for(i=1;i<=NF;i++){out=out ($i ":" g) (i<NF?",":"")}; print out}')
|
|
45
|
+
echo "HOST_ARG=$HOST_ARG"
|
|
46
|
+
|
|
47
|
+
echo "=== NCCL all_reduce_perf via mpirun ==="
|
|
48
|
+
# Note: -g 1 = 1 GPU per process, -n 20 iterations. Sweep 1M..1G in factor-of-2 steps.
|
|
49
|
+
mpirun --host "$HOST_ARG" \
|
|
50
|
+
--mca plm_rsh_args "-p 2222 -o StrictHostKeyChecking=no" \
|
|
51
|
+
-x PATH -x LD_LIBRARY_PATH \
|
|
52
|
+
-x FI_PROVIDER -x FI_EFA_USE_DEVICE_RDMA \
|
|
53
|
+
-x NCCL_NET_GDR_LEVEL -x NCCL_ALGO \
|
|
54
|
+
-x NCCL_SOCKET_IFNAME -x NCCL_DEBUG -x NCCL_IB_HCA \
|
|
55
|
+
/opt/nccl-tests/build/all_reduce_perf -b 1M -e 1G -f 2 -g 1 -n 20 \
|
|
56
|
+
2>&1 | tee nccl-all_reduce.log
|
|
57
|
+
|
|
58
|
+
echo "=== summary ==="
|
|
59
|
+
{
|
|
60
|
+
echo "rank=$MULTINODE_RANK size=$MULTINODE_SIZE"
|
|
61
|
+
echo "host_arg=$HOST_ARG"
|
|
62
|
+
echo "completed at $(date -u)"
|
|
63
|
+
} | tee summary.txt
|
|
64
|
+
|
|
65
|
+
echo "DONE"
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Smoke test for `gpu-dev submit`: runs on a single GPU, expected exit 0.
|
|
3
|
+
set -euo pipefail
|
|
4
|
+
|
|
5
|
+
echo "=== host ==="
|
|
6
|
+
hostname
|
|
7
|
+
date -u
|
|
8
|
+
|
|
9
|
+
echo "=== nvidia-smi ==="
|
|
10
|
+
nvidia-smi | tee nvidia-info.txt
|
|
11
|
+
|
|
12
|
+
echo "=== compute ==="
|
|
13
|
+
python3 - <<'PY' | tee compute.txt
|
|
14
|
+
import torch
|
|
15
|
+
assert torch.cuda.is_available(), "CUDA not available"
|
|
16
|
+
n = torch.cuda.device_count()
|
|
17
|
+
x = torch.arange(1_000_000, device="cuda", dtype=torch.float32)
|
|
18
|
+
s = x.sum().item()
|
|
19
|
+
print(f"devices={n} sum(0..999_999)={s}")
|
|
20
|
+
PY
|
|
21
|
+
|
|
22
|
+
echo "ok at $(date -u)" > status.txt
|
|
23
|
+
echo "DONE"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.20}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|