gpu-dev 0.5.26__tar.gz → 0.5.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/PKG-INFO +1 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +14 -5
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +7 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/pyproject.toml +1 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/eks.tf +1 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_processor/index.py +18 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda.tf +6 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/main.tf +79 -2
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/node-termination-handler.tf +2 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/route53.tf +13 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/.gitignore +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/CLAUDE.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/PROGRESS.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/TODO.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/admin/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/post.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/setup.cfg +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.28}/tests/submit/success/run.sh +0 -0
|
@@ -496,7 +496,7 @@ def main(ctx: click.Context) -> None:
|
|
|
496
496
|
"--gpu-type",
|
|
497
497
|
"-t",
|
|
498
498
|
type=click.Choice(
|
|
499
|
-
["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
499
|
+
["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
500
500
|
),
|
|
501
501
|
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
|
|
502
502
|
)
|
|
@@ -576,6 +576,8 @@ def main(ctx: click.Context) -> None:
|
|
|
576
576
|
multiple=True,
|
|
577
577
|
help="Request nodes with specific label (format: key=value). Example: --node-label nsight=true for Nsight profiling nodes",
|
|
578
578
|
)
|
|
579
|
+
@click.option("--spot", is_flag=True, default=False,
|
|
580
|
+
help="Acknowledge spot instance (~1/3 cost, may be preempted with 2-min notice). Required for spot-only types.")
|
|
579
581
|
@click.pass_context
|
|
580
582
|
def reserve(
|
|
581
583
|
ctx: click.Context,
|
|
@@ -662,6 +664,7 @@ def reserve(
|
|
|
662
664
|
"b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
|
|
663
665
|
"h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
|
|
664
666
|
"b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
|
|
667
|
+
"b300": {"max_gpus": 8, "instance_type": "p6-b300.48xlarge"},
|
|
665
668
|
"cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
|
|
666
669
|
"cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
|
|
667
670
|
}
|
|
@@ -1270,6 +1273,7 @@ def reserve(
|
|
|
1270
1273
|
no_persistent_disk=no_persistent_disk,
|
|
1271
1274
|
preserve_entrypoint=preserve_entrypoint,
|
|
1272
1275
|
disk_name=disk,
|
|
1276
|
+
spot=spot,
|
|
1273
1277
|
node_labels=node_labels if node_labels else None,
|
|
1274
1278
|
)
|
|
1275
1279
|
else:
|
|
@@ -1288,6 +1292,7 @@ def reserve(
|
|
|
1288
1292
|
no_persistent_disk=no_persistent_disk,
|
|
1289
1293
|
preserve_entrypoint=preserve_entrypoint,
|
|
1290
1294
|
disk_name=disk,
|
|
1295
|
+
spot=spot,
|
|
1291
1296
|
node_labels=node_labels if node_labels else None,
|
|
1292
1297
|
trace=trace,
|
|
1293
1298
|
)
|
|
@@ -1350,7 +1355,7 @@ def reserve(
|
|
|
1350
1355
|
rprint(f"[red]❌ Error: {str(e)}[/red]")
|
|
1351
1356
|
|
|
1352
1357
|
|
|
1353
|
-
_SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1358
|
+
_SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1354
1359
|
"h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
|
|
1355
1360
|
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
|
|
1356
1361
|
|
|
@@ -1361,6 +1366,8 @@ _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200"
|
|
|
1361
1366
|
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling — job auto-cancels well before this if it finishes.")
|
|
1362
1367
|
@click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
|
|
1363
1368
|
@click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
|
|
1369
|
+
@click.option("--spot", is_flag=True, default=False,
|
|
1370
|
+
help="Acknowledge spot instance (~1/3 cost, may be preempted). Required for spot-only types.")
|
|
1364
1371
|
@click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
|
|
1365
1372
|
help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
|
|
1366
1373
|
@click.option("--dockerimage", type=str, default=None,
|
|
@@ -1376,7 +1383,7 @@ _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200"
|
|
|
1376
1383
|
help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
|
|
1377
1384
|
@click.argument("command", nargs=-1, required=True)
|
|
1378
1385
|
@click.pass_context
|
|
1379
|
-
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
|
|
1386
|
+
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, spot, dockerfile, dockerimage, preserve_entrypoint,
|
|
1380
1387
|
runtime, no_pull, keep_alive, name, timeout, command):
|
|
1381
1388
|
"""Submit a job: reserve, sync code, run, sync results back, auto-cancel.
|
|
1382
1389
|
|
|
@@ -1490,7 +1497,7 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
|
|
|
1490
1497
|
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1491
1498
|
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1492
1499
|
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1493
|
-
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1500
|
+
spot=spot, dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1494
1501
|
preserve_entrypoint=preserve_entrypoint)
|
|
1495
1502
|
if not reservation_ids:
|
|
1496
1503
|
rprint("[red]❌ Failed to create multinode reservation[/red]")
|
|
@@ -1501,7 +1508,7 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
|
|
|
1501
1508
|
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1502
1509
|
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1503
1510
|
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1504
|
-
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1511
|
+
spot=spot, dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1505
1512
|
preserve_entrypoint=preserve_entrypoint)
|
|
1506
1513
|
if not primary_id:
|
|
1507
1514
|
rprint("[red]❌ Failed to create reservation[/red]")
|
|
@@ -2719,6 +2726,7 @@ def _show_availability() -> None:
|
|
|
2719
2726
|
# GPU architecture mapping (for display)
|
|
2720
2727
|
gpu_architectures = {
|
|
2721
2728
|
"b200": "Blackwell (sm100)",
|
|
2729
|
+
"b300": "Blackwell (sm100)",
|
|
2722
2730
|
"h200": "Hopper (sm90)",
|
|
2723
2731
|
"h100": "Hopper (sm90)",
|
|
2724
2732
|
"a100": "Ampere (sm80)",
|
|
@@ -2880,6 +2888,7 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2880
2888
|
# GPU architecture mapping (for display)
|
|
2881
2889
|
gpu_architectures = {
|
|
2882
2890
|
"b200": "Blackwell (sm100)",
|
|
2891
|
+
"b300": "Blackwell (sm100)",
|
|
2883
2892
|
"h200": "Hopper (sm90)",
|
|
2884
2893
|
"h100": "Hopper (sm90)",
|
|
2885
2894
|
"a100": "Ampere (sm80)",
|
|
@@ -421,6 +421,7 @@ class ReservationManager:
|
|
|
421
421
|
disk_name: Optional[str] = None,
|
|
422
422
|
node_labels: Optional[Dict[str, str]] = None,
|
|
423
423
|
trace: bool = False,
|
|
424
|
+
spot: bool = False,
|
|
424
425
|
) -> Optional[str]:
|
|
425
426
|
"""Create a new GPU reservation"""
|
|
426
427
|
try:
|
|
@@ -500,6 +501,9 @@ class ReservationManager:
|
|
|
500
501
|
if node_labels:
|
|
501
502
|
message["node_labels"] = node_labels
|
|
502
503
|
|
|
504
|
+
if spot:
|
|
505
|
+
message["spot"] = True
|
|
506
|
+
|
|
503
507
|
# Add trace flag and CLI start timestamp
|
|
504
508
|
if trace:
|
|
505
509
|
message["trace"] = True
|
|
@@ -536,6 +540,7 @@ class ReservationManager:
|
|
|
536
540
|
preserve_entrypoint: bool = False,
|
|
537
541
|
disk_name: Optional[str] = None,
|
|
538
542
|
node_labels: Optional[Dict[str, str]] = None,
|
|
543
|
+
spot: bool = False,
|
|
539
544
|
) -> Optional[List[str]]:
|
|
540
545
|
"""Create multiple GPU reservations for multinode setup"""
|
|
541
546
|
try:
|
|
@@ -557,6 +562,7 @@ class ReservationManager:
|
|
|
557
562
|
"b200-mig-3g": {"max_gpus": 2},
|
|
558
563
|
"h200": {"max_gpus": 8},
|
|
559
564
|
"b200": {"max_gpus": 8},
|
|
565
|
+
"b300": {"max_gpus": 8},
|
|
560
566
|
}
|
|
561
567
|
|
|
562
568
|
max_gpus_per_node = gpu_configs[gpu_type]["max_gpus"]
|
|
@@ -601,6 +607,7 @@ class ReservationManager:
|
|
|
601
607
|
"recreate_env": recreate_env,
|
|
602
608
|
"is_multinode": True,
|
|
603
609
|
"no_persistent_disk": no_persistent_disk,
|
|
610
|
+
"spot": spot,
|
|
604
611
|
}
|
|
605
612
|
|
|
606
613
|
if github_user:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.26"
|
|
7
|
+
version = "0.5.28"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -59,6 +59,7 @@ ECR_REPOSITORY_URL = os.environ.get("ECR_REPOSITORY_URL")
|
|
|
59
59
|
# Version validation - injected via Terraform
|
|
60
60
|
LAMBDA_VERSION = os.environ.get("LAMBDA_VERSION", "0.3.9")
|
|
61
61
|
MIN_CLI_VERSION = os.environ.get("MIN_CLI_VERSION", "0.3.9")
|
|
62
|
+
SPOT_GPU_TYPES = os.environ.get("SPOT_GPU_TYPES", "")
|
|
62
63
|
OPERATIONS_TABLE = os.environ.get("OPERATIONS_TABLE", "pytorch-gpu-dev-operations")
|
|
63
64
|
|
|
64
65
|
# GPU Configuration - single source of truth for all GPU type mappings
|
|
@@ -81,6 +82,7 @@ GPU_CONFIG = {
|
|
|
81
82
|
"h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
82
83
|
"h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
83
84
|
"b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
85
|
+
"b300": {"instance_type": "p6-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
|
|
84
86
|
"cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
85
87
|
"cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
86
88
|
}
|
|
@@ -2188,7 +2190,7 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
|
|
|
2188
2190
|
# Validate GPU type
|
|
2189
2191
|
valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
|
|
2190
2192
|
"h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
|
|
2191
|
-
"h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
|
|
2193
|
+
"h200", "b200", "b300", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
|
|
2192
2194
|
"cpu-arm", "cpu-x86"]
|
|
2193
2195
|
if gpu_type not in valid_gpu_types:
|
|
2194
2196
|
error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
|
|
@@ -2205,6 +2207,19 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
|
|
|
2205
2207
|
logger.warning(f"User {user_id} blocked from {gpu_type}: maintenance mode")
|
|
2206
2208
|
return False, error_msg
|
|
2207
2209
|
|
|
2210
|
+
# Spot acknowledgment: if this workspace marks the GPU type as spot-only and
|
|
2211
|
+
# the user didn't pass --spot, reject with a clear message.
|
|
2212
|
+
if SPOT_GPU_TYPES and not request.get("spot", False):
|
|
2213
|
+
is_spot = SPOT_GPU_TYPES.strip() == "all" or gpu_type in [t.strip() for t in SPOT_GPU_TYPES.split(",")]
|
|
2214
|
+
if is_spot:
|
|
2215
|
+
error_msg = (
|
|
2216
|
+
f"{gpu_type.upper()} is only available as a spot instance in this environment. "
|
|
2217
|
+
f"Spot instances are ~1/3 the cost but can be reclaimed by AWS with 2-min notice. "
|
|
2218
|
+
f"Pass --spot to confirm: gpu-dev reserve --gpu-type {gpu_type} --spot"
|
|
2219
|
+
)
|
|
2220
|
+
logger.warning(f"Reservation: spot acknowledgment missing for {gpu_type}")
|
|
2221
|
+
return False, error_msg
|
|
2222
|
+
|
|
2208
2223
|
# Validate GPU count based on type
|
|
2209
2224
|
if gpu_type.startswith("cpu-") and gpu_count == 0:
|
|
2210
2225
|
pass # Valid CPU-only instance
|
|
@@ -2435,6 +2450,7 @@ def update_gpu_availability_table(
|
|
|
2435
2450
|
"b200-mig-3g": {"gpus_per_instance": 2},
|
|
2436
2451
|
"h200": {"gpus_per_instance": 8},
|
|
2437
2452
|
"b200": {"gpus_per_instance": 8},
|
|
2453
|
+
"b300": {"gpus_per_instance": 8},
|
|
2438
2454
|
}
|
|
2439
2455
|
|
|
2440
2456
|
gpu_config = gpu_type_configs.get(gpu_type, {"gpus_per_instance": 8})
|
|
@@ -6529,6 +6545,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
|
|
|
6529
6545
|
"p5e.48xlarge": "H200",
|
|
6530
6546
|
"p5en.48xlarge": "H200",
|
|
6531
6547
|
"p6-b200.48xlarge": "B200",
|
|
6548
|
+
"p6-b300.48xlarge": "B300",
|
|
6532
6549
|
}
|
|
6533
6550
|
|
|
6534
6551
|
gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
|
|
@@ -180,8 +180,13 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.28"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.16"
|
|
185
|
+
# Comma-separated GPU types that require --spot flag, or "all" for every type.
|
|
186
|
+
# Empty = no spot types (on-demand / reserved). Set per-workspace.
|
|
187
|
+
SPOT_GPU_TYPES = lookup({
|
|
188
|
+
"prod-east1" = "all"
|
|
189
|
+
}, terraform.workspace, "")
|
|
185
190
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
191
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
192
|
}, local.alb_env_vars)
|
|
@@ -334,6 +334,60 @@ locals {
|
|
|
334
334
|
use_self_managed_nodes = true
|
|
335
335
|
instance_type = "g4dn.12xlarge"
|
|
336
336
|
supported_gpu_types = {
|
|
337
|
+
# 8-GPU spot instances. instance_count=1 means the ASG tries to maintain 1
|
|
338
|
+
# spot instance per type — if AWS can't grant it (capacity / quota), the ASG
|
|
339
|
+
# sits at 0 and gpu-dev reservations queue. Bump counts once we see what
|
|
340
|
+
# actually gets fulfilled in us-east-1.
|
|
341
|
+
"b300" = {
|
|
342
|
+
instance_type = "p6-b300.48xlarge"
|
|
343
|
+
instance_types = null
|
|
344
|
+
instance_count = 1
|
|
345
|
+
gpus_per_instance = 8
|
|
346
|
+
use_placement_group = false
|
|
347
|
+
architecture = "x86_64"
|
|
348
|
+
efa_network_cards = 8
|
|
349
|
+
use_spot = true
|
|
350
|
+
}
|
|
351
|
+
"b200" = {
|
|
352
|
+
instance_type = "p6-b200.48xlarge"
|
|
353
|
+
instance_types = null
|
|
354
|
+
instance_count = 1
|
|
355
|
+
gpus_per_instance = 8
|
|
356
|
+
use_placement_group = false
|
|
357
|
+
architecture = "x86_64"
|
|
358
|
+
efa_network_cards = 8
|
|
359
|
+
use_spot = true
|
|
360
|
+
}
|
|
361
|
+
"h200" = {
|
|
362
|
+
instance_type = "p5e.48xlarge"
|
|
363
|
+
instance_types = null
|
|
364
|
+
instance_count = 1
|
|
365
|
+
gpus_per_instance = 8
|
|
366
|
+
use_placement_group = false
|
|
367
|
+
architecture = "x86_64"
|
|
368
|
+
efa_network_cards = 16
|
|
369
|
+
use_spot = true
|
|
370
|
+
}
|
|
371
|
+
"h100" = {
|
|
372
|
+
instance_type = "p5.48xlarge"
|
|
373
|
+
instance_types = null
|
|
374
|
+
instance_count = 1
|
|
375
|
+
gpus_per_instance = 8
|
|
376
|
+
use_placement_group = false
|
|
377
|
+
architecture = "x86_64"
|
|
378
|
+
efa_network_cards = 32
|
|
379
|
+
use_spot = true
|
|
380
|
+
}
|
|
381
|
+
"a100" = {
|
|
382
|
+
instance_type = "p4d.24xlarge"
|
|
383
|
+
instance_types = null
|
|
384
|
+
instance_count = 1
|
|
385
|
+
gpus_per_instance = 8
|
|
386
|
+
use_placement_group = false
|
|
387
|
+
architecture = "x86_64"
|
|
388
|
+
efa_network_cards = 4
|
|
389
|
+
use_spot = true
|
|
390
|
+
}
|
|
337
391
|
"t4" = {
|
|
338
392
|
instance_type = "g4dn.12xlarge"
|
|
339
393
|
instance_types = null
|
|
@@ -421,8 +475,15 @@ locals {
|
|
|
421
475
|
# Workspace-specific GPU type to subnet mappings
|
|
422
476
|
gpu_subnet_assignments = {
|
|
423
477
|
"prod-east1" = {
|
|
424
|
-
# All node types land in the primary subnet (us-east-1a).
|
|
425
|
-
#
|
|
478
|
+
# All node types land in the primary subnet (us-east-1a). Multi-EFA types
|
|
479
|
+
# (efa_network_cards > 1) automatically use the private subnet in the same AZ.
|
|
480
|
+
# Specific instance types may not have capacity in us-east-1a — those ASGs will
|
|
481
|
+
# sit at 0 until we widen to other AZs, that's expected for beta.
|
|
482
|
+
b300 = "primary"
|
|
483
|
+
b200 = "primary"
|
|
484
|
+
h200 = "primary"
|
|
485
|
+
h100 = "primary"
|
|
486
|
+
a100 = "primary"
|
|
426
487
|
t4 = "primary"
|
|
427
488
|
l4 = "primary"
|
|
428
489
|
"cpu-x86" = "primary"
|
|
@@ -451,6 +512,22 @@ locals {
|
|
|
451
512
|
}
|
|
452
513
|
}
|
|
453
514
|
|
|
515
|
+
# Subdomain NS delegations to create in *this* workspace's parent zone. Lets
|
|
516
|
+
# prod (which owns devservers.io) auto-publish NS records pointing at child zones
|
|
517
|
+
# in other workspaces (prod-east1, future regions) without manual -var flags.
|
|
518
|
+
# The NS values come from `tofu output devservers_name_servers` in the child
|
|
519
|
+
# workspace once its hosted zone has been created.
|
|
520
|
+
prod_subdomain_delegations = {
|
|
521
|
+
prod = {
|
|
522
|
+
"east1.devservers.io" = [
|
|
523
|
+
"ns-1079.awsdns-06.org",
|
|
524
|
+
"ns-1999.awsdns-57.co.uk",
|
|
525
|
+
"ns-341.awsdns-42.com",
|
|
526
|
+
"ns-624.awsdns-14.net",
|
|
527
|
+
]
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
|
|
454
531
|
# Per-capacity-reservation AZ mappings (overrides gpu_subnet_assignments when CR is used)
|
|
455
532
|
capacity_reservation_azs = {
|
|
456
533
|
"prod-east1" = {
|
|
@@ -12,7 +12,8 @@ resource "helm_release" "aws_node_termination_handler" {
|
|
|
12
12
|
repository = "https://aws.github.io/eks-charts"
|
|
13
13
|
chart = "aws-node-termination-handler"
|
|
14
14
|
namespace = "kube-system"
|
|
15
|
-
version
|
|
15
|
+
# No version pin — chart versions advance frequently and my first guess (0.27.1)
|
|
16
|
+
# didn't exist. helm picks current latest stable. Add a pin once we hit a regression.
|
|
16
17
|
cleanup_on_fail = true
|
|
17
18
|
|
|
18
19
|
values = [yamlencode({
|
|
@@ -51,6 +51,19 @@ resource "aws_route53_record" "manual_subdomain_delegation" {
|
|
|
51
51
|
records = var.subdomain_ns_records
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
+
# Auto-published NS delegations for child workspaces. Iterates prod_subdomain_delegations
|
|
55
|
+
# (defined in main.tf) for the current workspace and creates an NS record per entry in
|
|
56
|
+
# the parent zone — so `tofu apply` in prod automatically wires up east1.devservers.io
|
|
57
|
+
# (and any future region) without -var flags.
|
|
58
|
+
resource "aws_route53_record" "workspace_subdomain_delegations" {
|
|
59
|
+
for_each = local.effective_domain_name != "" && !local.is_subdomain ? try(local.prod_subdomain_delegations[terraform.workspace], {}) : {}
|
|
60
|
+
zone_id = data.aws_route53_zone.parent[0].zone_id
|
|
61
|
+
name = each.key
|
|
62
|
+
type = "NS"
|
|
63
|
+
ttl = 300
|
|
64
|
+
records = each.value
|
|
65
|
+
}
|
|
66
|
+
|
|
54
67
|
# Use appropriate hosted zone (subdomain if created, otherwise parent)
|
|
55
68
|
locals {
|
|
56
69
|
hosted_zone_id = local.is_subdomain ? aws_route53_zone.subdomain[0].zone_id : (local.effective_domain_name != "" ? data.aws_route53_zone.parent[0].zone_id : "")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|