gpu-dev 0.5.18__tar.gz → 0.5.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/PKG-INFO +1 -1
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +5 -1
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +55 -6
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/pyproject.toml +1 -1
- gpu_dev-0.5.19/tests/submit/README.md +63 -0
- gpu_dev-0.5.19/tests/submit/fail/run.sh +20 -0
- gpu_dev-0.5.19/tests/submit/multinode/run.sh +65 -0
- gpu_dev-0.5.19/tests/submit/success/run.sh +23 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/.gitignore +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/CLAUDE.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/PROGRESS.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/TODO.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/admin/README.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/post.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/setup.cfg +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -112,4 +112,8 @@ terraform-gpu-devservers/ssh-proxy/requirements.txt
|
|
|
112
112
|
terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
|
|
113
113
|
terraform-gpu-devservers/templates/al2023-user-data.sh
|
|
114
114
|
terraform-gpu-devservers/templates/user-data-self-managed.sh
|
|
115
|
-
terraform-gpu-devservers/templates/user-data.sh
|
|
115
|
+
terraform-gpu-devservers/templates/user-data.sh
|
|
116
|
+
tests/submit/README.md
|
|
117
|
+
tests/submit/fail/run.sh
|
|
118
|
+
tests/submit/multinode/run.sh
|
|
119
|
+
tests/submit/success/run.sh
|
|
@@ -1357,18 +1357,26 @@ _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200"
|
|
|
1357
1357
|
@main.command(context_settings={"ignore_unknown_options": True})
|
|
1358
1358
|
@click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
|
|
1359
1359
|
@click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
|
|
1360
|
-
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation
|
|
1360
|
+
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling — job auto-cancels well before this if it finishes.")
|
|
1361
1361
|
@click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
|
|
1362
1362
|
@click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
|
|
1363
|
+
@click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
|
|
1364
|
+
help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
|
|
1365
|
+
@click.option("--dockerimage", type=str, default=None,
|
|
1366
|
+
help="Pre-built container image reference (e.g. ghcr.io/me/img:tag) to run instead of the default.")
|
|
1367
|
+
@click.option("--preserve-entrypoint", is_flag=True,
|
|
1368
|
+
help="Keep the custom image's ENTRYPOINT/CMD instead of letting gpu-dev wrap with the SSH harness. Note: submit needs SSH to work.")
|
|
1363
1369
|
@click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
|
|
1364
1370
|
help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
|
|
1365
1371
|
@click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
|
|
1366
1372
|
@click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
|
|
1367
1373
|
@click.option("--name", type=str, default=None, help="Reservation name.")
|
|
1368
|
-
@click.option("--timeout", type=int, default=
|
|
1374
|
+
@click.option("--timeout", type=int, default=24 * 60, show_default=True,
|
|
1375
|
+
help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
|
|
1369
1376
|
@click.argument("command", nargs=-1, required=True)
|
|
1370
1377
|
@click.pass_context
|
|
1371
|
-
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pull, keep_alive, name, timeout, command):
|
|
1378
|
+
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
|
|
1379
|
+
runtime, no_pull, keep_alive, name, timeout, command):
|
|
1372
1380
|
"""Submit a job: reserve, sync code, run, sync results back, auto-cancel.
|
|
1373
1381
|
|
|
1374
1382
|
\b
|
|
@@ -1421,12 +1429,47 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
|
|
|
1421
1429
|
# SSH into rank 0, so passing --disk is fine.
|
|
1422
1430
|
disk_name = None if no_persistent_disk else disk
|
|
1423
1431
|
|
|
1432
|
+
# Build dockerfile context if provided (mirrors the reserve-flow logic)
|
|
1433
|
+
dockerfile_payload = None
|
|
1434
|
+
if dockerfile:
|
|
1435
|
+
import os, tarfile, tempfile, base64
|
|
1436
|
+
if os.path.getsize(dockerfile) > 512 * 1024:
|
|
1437
|
+
rprint("[red]❌ Dockerfile too large (max 512KB)[/red]")
|
|
1438
|
+
sys.exit(2)
|
|
1439
|
+
ctx_dir = os.path.dirname(os.path.abspath(dockerfile))
|
|
1440
|
+
rprint(f"[cyan]📦 Building tar.gz context from {ctx_dir}[/cyan]")
|
|
1441
|
+
with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
|
|
1442
|
+
with tarfile.open(tmp.name, "w:gz") as tar:
|
|
1443
|
+
for root, _, files in os.walk(ctx_dir):
|
|
1444
|
+
for f in files:
|
|
1445
|
+
full = os.path.join(root, f)
|
|
1446
|
+
tar.add(full, arcname=os.path.relpath(full, ctx_dir))
|
|
1447
|
+
if os.path.basename(dockerfile).lower() != "dockerfile":
|
|
1448
|
+
tar.add(dockerfile, arcname="Dockerfile")
|
|
1449
|
+
tar_size = os.path.getsize(tmp.name)
|
|
1450
|
+
if tar_size > 700 * 1024:
|
|
1451
|
+
os.unlink(tmp.name)
|
|
1452
|
+
rprint(f"[red]❌ Build context too large: {tar_size}B (max ~700KB compressed)[/red]")
|
|
1453
|
+
sys.exit(2)
|
|
1454
|
+
with open(tmp.name, "rb") as fh:
|
|
1455
|
+
dockerfile_payload = base64.b64encode(fh.read()).decode("utf-8")
|
|
1456
|
+
os.unlink(tmp.name)
|
|
1457
|
+
rprint(f"[green]✅ Dockerfile context: {tar_size}B compressed[/green]")
|
|
1458
|
+
|
|
1459
|
+
if dockerimage and not preserve_entrypoint:
|
|
1460
|
+
rprint("[dim]Note: passing --dockerimage without --preserve-entrypoint, so gpu-dev wraps the image with the SSH harness.[/dim]")
|
|
1461
|
+
if preserve_entrypoint and not (dockerfile or dockerimage):
|
|
1462
|
+
rprint("[red]❌ --preserve-entrypoint requires --dockerfile or --dockerimage[/red]")
|
|
1463
|
+
sys.exit(2)
|
|
1464
|
+
|
|
1424
1465
|
rprint(f"[cyan]🎫 Reserving {gpus}x {gpu_type.upper()} for up to {hours}h...[/cyan]")
|
|
1425
1466
|
if is_multinode:
|
|
1426
1467
|
reservation_ids = rm.create_multinode_reservation(
|
|
1427
1468
|
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1428
1469
|
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1429
|
-
no_persistent_disk=no_persistent_disk, disk_name=disk_name
|
|
1470
|
+
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1471
|
+
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1472
|
+
preserve_entrypoint=preserve_entrypoint)
|
|
1430
1473
|
if not reservation_ids:
|
|
1431
1474
|
rprint("[red]❌ Failed to create multinode reservation[/red]")
|
|
1432
1475
|
sys.exit(2)
|
|
@@ -1435,7 +1478,9 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
|
|
|
1435
1478
|
primary_id = rm.create_reservation(
|
|
1436
1479
|
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1437
1480
|
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1438
|
-
no_persistent_disk=no_persistent_disk, disk_name=disk_name
|
|
1481
|
+
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1482
|
+
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1483
|
+
preserve_entrypoint=preserve_entrypoint)
|
|
1439
1484
|
if not primary_id:
|
|
1440
1485
|
rprint("[red]❌ Failed to create reservation[/red]")
|
|
1441
1486
|
sys.exit(2)
|
|
@@ -1456,7 +1501,11 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, runtime, no_pul
|
|
|
1456
1501
|
rprint(f"[dim] cancel {rid[:8]} failed: {ce}[/dim]")
|
|
1457
1502
|
|
|
1458
1503
|
try:
|
|
1459
|
-
|
|
1504
|
+
if timeout >= 60:
|
|
1505
|
+
wait_str = f"up to {timeout//60}h{(" " + str(timeout%60) + "m") if timeout%60 else ""}"
|
|
1506
|
+
else:
|
|
1507
|
+
wait_str = f"up to {timeout}m"
|
|
1508
|
+
rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active ({wait_str}; can queue when cluster is full)...[/cyan]")
|
|
1460
1509
|
if is_multinode:
|
|
1461
1510
|
results = rm.wait_for_multinode_reservation_completion(reservation_ids, timeout_minutes=timeout)
|
|
1462
1511
|
else:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.18"
|
|
7
|
+
version = "0.5.19"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# `gpu-dev submit` smoke tests
|
|
2
|
+
|
|
3
|
+
Three tests that exercise `gpu-dev submit` end-to-end. Each test lives in its
|
|
4
|
+
own folder so you can `--runtime` it directly. Output files written by the
|
|
5
|
+
script are pulled back into the same folder via the post-run rsync.
|
|
6
|
+
|
|
7
|
+
> Requires `gpu-dev >= 0.5.19`. No Lambda update needed.
|
|
8
|
+
|
|
9
|
+
## 1. success — single T4 GPU, exit 0
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
cd tests/submit/success
|
|
13
|
+
gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
|
|
14
|
+
echo $? # 0
|
|
15
|
+
ls # nvidia-info.txt, compute.txt, status.txt all created
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## 2. fail — single T4 GPU, exit 7
|
|
19
|
+
|
|
20
|
+
Writes a partial file before exploding so you can confirm rsync still pulls
|
|
21
|
+
output on failure and the local exit code is the remote's.
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
cd tests/submit/fail
|
|
25
|
+
gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
|
|
26
|
+
echo $? # 7
|
|
27
|
+
ls # step1.txt, step2.txt, gpus-before-fail.txt — but no step3.txt
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## 3. multinode — 2x H100 nodes, exit 0
|
|
31
|
+
|
|
32
|
+
Reserves 16 H100s (= 2 nodes), verifies env vars + peer ssh + NCCL all_reduce
|
|
33
|
+
across the whole cluster via mpirun (orchestrated entirely from rank 0).
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
cd tests/submit/multinode
|
|
37
|
+
gpu-dev submit --gpu-type h100 --gpus 16 --runtime ./ -- bash run.sh
|
|
38
|
+
echo $? # 0
|
|
39
|
+
cat multinode-env.txt resolved-ips.txt peer-ssh.txt nccl-all_reduce.log
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## What each test proves
|
|
43
|
+
|
|
44
|
+
| Test | Proves |
|
|
45
|
+
|------------|-------------------------------------------------------------------------------|
|
|
46
|
+
| success | reserve → rsync up → exec → rsync back → cancel → exit 0 |
|
|
47
|
+
| fail | exit code propagation; rsync-back still runs on non-zero exit; cancel fires |
|
|
48
|
+
| multinode | MULTINODE_* env vars; peer DNS / passwordless ssh; cross-node NCCL via mpirun |
|
|
49
|
+
|
|
50
|
+
After every run, `gpu-dev list` should show no leftover reservations — each one is auto-cancelled when its job exits.
|
|
51
|
+
Use `--keep-alive` on any of them if you want to debug interactively afterward.
|
|
52
|
+
|
|
53
|
+
## Other submit flags (forwarded to `reserve`)
|
|
54
|
+
|
|
55
|
+
- `--hours N` — reservation lifetime ceiling (default 1.0)
|
|
56
|
+
- `--disk NAME` — attach a persistent disk to the master node
|
|
57
|
+
- `--no-persistent-disk` — skip persistent disk
|
|
58
|
+
- `--dockerfile PATH` — build a custom image from this Dockerfile
|
|
59
|
+
- `--dockerimage REF` — use a pre-built container image
|
|
60
|
+
- `--preserve-entrypoint` — keep the custom image's ENTRYPOINT (you must run sshd yourself for submit to work)
|
|
61
|
+
- `--timeout MINUTES` — wait-for-active timeout (default 1440 = 24h, since reservations may queue)
|
|
62
|
+
- `--no-pull` — skip the post-run sync-back
|
|
63
|
+
- `--keep-alive` — skip auto-cancel
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Failure test for `gpu-dev submit`: writes a partial output, then exits 7.
|
|
3
|
+
# Verifies the post-run rsync still pulls the partial files even on failure,
|
|
4
|
+
# the auto-cancel runs on non-zero exit, and the local exit code is preserved.
|
|
5
|
+
set -e
|
|
6
|
+
|
|
7
|
+
echo "=== host ==="
|
|
8
|
+
hostname
|
|
9
|
+
date -u
|
|
10
|
+
|
|
11
|
+
# Write a partial file so we can verify it was synced back
|
|
12
|
+
echo "step1 done at $(date -u)" > step1.txt
|
|
13
|
+
nvidia-smi -L > gpus-before-fail.txt
|
|
14
|
+
|
|
15
|
+
# Now error out
|
|
16
|
+
echo "About to fail..." > step2.txt
|
|
17
|
+
python3 -c "import sys; sys.exit(7)"
|
|
18
|
+
|
|
19
|
+
# Should not reach here
|
|
20
|
+
echo "should-not-appear" > step3.txt
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Multinode test for `gpu-dev submit`: runs on rank 0 only and orchestrates the
|
|
3
|
+
# whole cluster via mpirun (uses passwordless ssh + the headless service DNS we
|
|
4
|
+
# already set up). Verifies env vars, peer connectivity, and an actual NCCL
|
|
5
|
+
# all_reduce across all nodes.
|
|
6
|
+
set -euo pipefail
|
|
7
|
+
cd "$(dirname "$0")"
|
|
8
|
+
|
|
9
|
+
echo "=== rank 0 host: $(hostname) at $(date -u) ==="
|
|
10
|
+
|
|
11
|
+
echo "=== multinode env ==="
|
|
12
|
+
{
|
|
13
|
+
echo "MULTINODE_HOSTS=$MULTINODE_HOSTS"
|
|
14
|
+
echo "MULTINODE_PEER_PODS=$MULTINODE_PEER_PODS"
|
|
15
|
+
echo "MULTINODE_RANK=$MULTINODE_RANK"
|
|
16
|
+
echo "MULTINODE_SIZE=$MULTINODE_SIZE"
|
|
17
|
+
echo "MASTER_ADDR=$MASTER_ADDR"
|
|
18
|
+
echo "MASTER_PORT=$MASTER_PORT"
|
|
19
|
+
echo "MULTINODE_IPS=${MULTINODE_IPS:-(not set)}"
|
|
20
|
+
} | tee multinode-env.txt
|
|
21
|
+
|
|
22
|
+
if [[ -z "${MULTINODE_HOSTS:-}" ]]; then
|
|
23
|
+
echo "ERROR: MULTINODE_HOSTS empty — submit with --gpus >= 16 on h100" >&2
|
|
24
|
+
exit 2
|
|
25
|
+
fi
|
|
26
|
+
|
|
27
|
+
# Resolve IPs even if the bashrc helper didn't run (defensive)
|
|
28
|
+
IPS=""
|
|
29
|
+
for h in $(echo "$MULTINODE_HOSTS" | tr ',' ' '); do
|
|
30
|
+
ip=$(getent hosts "$h" | awk '{print $1}' | head -1)
|
|
31
|
+
[[ -n "$ip" ]] && IPS="${IPS:+$IPS,}$ip"
|
|
32
|
+
done
|
|
33
|
+
echo "Resolved IPS=$IPS" | tee resolved-ips.txt
|
|
34
|
+
|
|
35
|
+
echo "=== peer ssh check (port 2222 inside cluster) ==="
|
|
36
|
+
peer_host=$(echo "$MULTINODE_HOSTS" | cut -d, -f2)
|
|
37
|
+
ssh -o StrictHostKeyChecking=no -p 2222 "$peer_host" 'hostname; nvidia-smi -L | wc -l' \
|
|
38
|
+
| tee peer-ssh.txt
|
|
39
|
+
|
|
40
|
+
GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
|
|
41
|
+
echo "GPUS_PER_NODE=$GPUS_PER_NODE" | tee gpus-per-node.txt
|
|
42
|
+
|
|
43
|
+
# Build --host arg: ip1:N,ip2:N,...
|
|
44
|
+
HOST_ARG=$(echo "$IPS" | awk -v g="$GPUS_PER_NODE" -F, '{out=""; for(i=1;i<=NF;i++){out=out ($i ":" g) (i<NF?",":"")}; print out}')
|
|
45
|
+
echo "HOST_ARG=$HOST_ARG"
|
|
46
|
+
|
|
47
|
+
echo "=== NCCL all_reduce_perf via mpirun ==="
|
|
48
|
+
# Note: -g 1 = 1 GPU per process, -n 20 iterations. Sweep 1M..1G in factor-of-2 steps.
|
|
49
|
+
mpirun --host "$HOST_ARG" \
|
|
50
|
+
--mca plm_rsh_args "-p 2222 -o StrictHostKeyChecking=no" \
|
|
51
|
+
-x PATH -x LD_LIBRARY_PATH \
|
|
52
|
+
-x FI_PROVIDER -x FI_EFA_USE_DEVICE_RDMA \
|
|
53
|
+
-x NCCL_NET_GDR_LEVEL -x NCCL_ALGO \
|
|
54
|
+
-x NCCL_SOCKET_IFNAME -x NCCL_DEBUG -x NCCL_IB_HCA \
|
|
55
|
+
/opt/nccl-tests/build/all_reduce_perf -b 1M -e 1G -f 2 -g 1 -n 20 \
|
|
56
|
+
2>&1 | tee nccl-all_reduce.log
|
|
57
|
+
|
|
58
|
+
echo "=== summary ==="
|
|
59
|
+
{
|
|
60
|
+
echo "rank=$MULTINODE_RANK size=$MULTINODE_SIZE"
|
|
61
|
+
echo "host_arg=$HOST_ARG"
|
|
62
|
+
echo "completed at $(date -u)"
|
|
63
|
+
} | tee summary.txt
|
|
64
|
+
|
|
65
|
+
echo "DONE"
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Smoke test for `gpu-dev submit`: runs on a single GPU, expected exit 0.
|
|
3
|
+
set -euo pipefail
|
|
4
|
+
|
|
5
|
+
echo "=== host ==="
|
|
6
|
+
hostname
|
|
7
|
+
date -u
|
|
8
|
+
|
|
9
|
+
echo "=== nvidia-smi ==="
|
|
10
|
+
nvidia-smi | tee nvidia-info.txt
|
|
11
|
+
|
|
12
|
+
echo "=== compute ==="
|
|
13
|
+
python3 - <<'PY' | tee compute.txt
|
|
14
|
+
import torch
|
|
15
|
+
assert torch.cuda.is_available(), "CUDA not available"
|
|
16
|
+
n = torch.cuda.device_count()
|
|
17
|
+
x = torch.arange(1_000_000, device="cuda", dtype=torch.float32)
|
|
18
|
+
s = x.sum().item()
|
|
19
|
+
print(f"devices={n} sum(0..999_999)={s}")
|
|
20
|
+
PY
|
|
21
|
+
|
|
22
|
+
echo "ok at $(date -u)" > status.txt
|
|
23
|
+
echo "DONE"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.18 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|