gpu-dev 0.5.27__tar.gz → 0.5.29__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/PKG-INFO +1 -1
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +19 -12
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +6 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/pyproject.toml +1 -1
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_processor/index.py +16 -2
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda.tf +6 -1
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/main.tf +1 -1
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/.gitignore +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/CLAUDE.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/PROGRESS.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/README.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/TODO.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/admin/README.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/post.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/setup.cfg +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.27 → gpu_dev-0.5.29}/tests/submit/success/run.sh +0 -0
|
@@ -576,6 +576,8 @@ def main(ctx: click.Context) -> None:
|
|
|
576
576
|
multiple=True,
|
|
577
577
|
help="Request nodes with specific label (format: key=value). Example: --node-label nsight=true for Nsight profiling nodes",
|
|
578
578
|
)
|
|
579
|
+
@click.option("--spot", is_flag=True, default=False,
|
|
580
|
+
help="Acknowledge spot instance (~1/3 cost, may be preempted with 2-min notice). Required for spot-only types.")
|
|
579
581
|
@click.pass_context
|
|
580
582
|
def reserve(
|
|
581
583
|
ctx: click.Context,
|
|
@@ -662,7 +664,7 @@ def reserve(
|
|
|
662
664
|
"b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
|
|
663
665
|
"h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
|
|
664
666
|
"b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
|
|
665
|
-
"b300": {"max_gpus": 8, "instance_type": "
|
|
667
|
+
"b300": {"max_gpus": 8, "instance_type": "p6-b300.48xlarge"},
|
|
666
668
|
"cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
|
|
667
669
|
"cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
|
|
668
670
|
}
|
|
@@ -1271,6 +1273,7 @@ def reserve(
|
|
|
1271
1273
|
no_persistent_disk=no_persistent_disk,
|
|
1272
1274
|
preserve_entrypoint=preserve_entrypoint,
|
|
1273
1275
|
disk_name=disk,
|
|
1276
|
+
spot=spot,
|
|
1274
1277
|
node_labels=node_labels if node_labels else None,
|
|
1275
1278
|
)
|
|
1276
1279
|
else:
|
|
@@ -1289,6 +1292,7 @@ def reserve(
|
|
|
1289
1292
|
no_persistent_disk=no_persistent_disk,
|
|
1290
1293
|
preserve_entrypoint=preserve_entrypoint,
|
|
1291
1294
|
disk_name=disk,
|
|
1295
|
+
spot=spot,
|
|
1292
1296
|
node_labels=node_labels if node_labels else None,
|
|
1293
1297
|
trace=trace,
|
|
1294
1298
|
)
|
|
@@ -1362,6 +1366,8 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
|
|
|
1362
1366
|
@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling — job auto-cancels well before this if it finishes.")
|
|
1363
1367
|
@click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
|
|
1364
1368
|
@click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
|
|
1369
|
+
@click.option("--spot", is_flag=True, default=False,
|
|
1370
|
+
help="Acknowledge spot instance (~1/3 cost, may be preempted). Required for spot-only types.")
|
|
1365
1371
|
@click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
|
|
1366
1372
|
help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
|
|
1367
1373
|
@click.option("--dockerimage", type=str, default=None,
|
|
@@ -1377,7 +1383,7 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
|
|
|
1377
1383
|
help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
|
|
1378
1384
|
@click.argument("command", nargs=-1, required=True)
|
|
1379
1385
|
@click.pass_context
|
|
1380
|
-
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
|
|
1386
|
+
def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, spot, dockerfile, dockerimage, preserve_entrypoint,
|
|
1381
1387
|
runtime, no_pull, keep_alive, name, timeout, command):
|
|
1382
1388
|
"""Submit a job: reserve, sync code, run, sync results back, auto-cancel.
|
|
1383
1389
|
|
|
@@ -1491,7 +1497,7 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
|
|
|
1491
1497
|
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1492
1498
|
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1493
1499
|
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1494
|
-
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1500
|
+
spot=spot, dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1495
1501
|
preserve_entrypoint=preserve_entrypoint)
|
|
1496
1502
|
if not reservation_ids:
|
|
1497
1503
|
rprint("[red]❌ Failed to create multinode reservation[/red]")
|
|
@@ -1502,7 +1508,7 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
|
|
|
1502
1508
|
user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
|
|
1503
1509
|
duration_hours=hours, name=name, github_user=user_info["github_user"],
|
|
1504
1510
|
no_persistent_disk=no_persistent_disk, disk_name=disk_name,
|
|
1505
|
-
dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1511
|
+
spot=spot, dockerfile=dockerfile_payload, dockerimage=dockerimage,
|
|
1506
1512
|
preserve_entrypoint=preserve_entrypoint)
|
|
1507
1513
|
if not primary_id:
|
|
1508
1514
|
rprint("[red]❌ Failed to create reservation[/red]")
|
|
@@ -3658,24 +3664,25 @@ def set(key: str, value: str) -> None:
|
|
|
3658
3664
|
|
|
3659
3665
|
|
|
3660
3666
|
@config.command()
|
|
3661
|
-
@click.argument("env_name", type=click.Choice(
|
|
3667
|
+
@click.argument("env_name", type=click.Choice(list(Config.ENVIRONMENTS.keys())))
|
|
3662
3668
|
def environment(env_name: str) -> None:
|
|
3663
|
-
"""Set the environment
|
|
3669
|
+
"""Set the environment
|
|
3664
3670
|
|
|
3665
3671
|
Sets the AWS region and Terraform workspace for the specified environment.
|
|
3666
|
-
This configuration is used by the switch-to.sh script.
|
|
3667
3672
|
|
|
3668
3673
|
Arguments:
|
|
3669
|
-
ENV_NAME: Environment name
|
|
3674
|
+
ENV_NAME: Environment name
|
|
3670
3675
|
|
|
3671
3676
|
\b
|
|
3672
3677
|
Examples:
|
|
3673
|
-
gpu-dev config environment
|
|
3674
|
-
gpu-dev config environment prod
|
|
3678
|
+
gpu-dev config environment prod # Production (us-east-2)
|
|
3679
|
+
gpu-dev config environment prod-east1 # Spot-only us-east-1
|
|
3680
|
+
gpu-dev config environment test # Test (us-west-1)
|
|
3675
3681
|
|
|
3676
3682
|
Environment configurations:
|
|
3677
|
-
test:
|
|
3678
|
-
prod:
|
|
3683
|
+
test: us-west-1, Terraform workspace 'default'
|
|
3684
|
+
prod: us-east-2, Terraform workspace 'prod'
|
|
3685
|
+
prod-east1: us-east-1, Terraform workspace 'prod-east1' (spot-only)
|
|
3679
3686
|
"""
|
|
3680
3687
|
from .config import Config
|
|
3681
3688
|
|
|
@@ -421,6 +421,7 @@ class ReservationManager:
|
|
|
421
421
|
disk_name: Optional[str] = None,
|
|
422
422
|
node_labels: Optional[Dict[str, str]] = None,
|
|
423
423
|
trace: bool = False,
|
|
424
|
+
spot: bool = False,
|
|
424
425
|
) -> Optional[str]:
|
|
425
426
|
"""Create a new GPU reservation"""
|
|
426
427
|
try:
|
|
@@ -500,6 +501,9 @@ class ReservationManager:
|
|
|
500
501
|
if node_labels:
|
|
501
502
|
message["node_labels"] = node_labels
|
|
502
503
|
|
|
504
|
+
if spot:
|
|
505
|
+
message["spot"] = True
|
|
506
|
+
|
|
503
507
|
# Add trace flag and CLI start timestamp
|
|
504
508
|
if trace:
|
|
505
509
|
message["trace"] = True
|
|
@@ -536,6 +540,7 @@ class ReservationManager:
|
|
|
536
540
|
preserve_entrypoint: bool = False,
|
|
537
541
|
disk_name: Optional[str] = None,
|
|
538
542
|
node_labels: Optional[Dict[str, str]] = None,
|
|
543
|
+
spot: bool = False,
|
|
539
544
|
) -> Optional[List[str]]:
|
|
540
545
|
"""Create multiple GPU reservations for multinode setup"""
|
|
541
546
|
try:
|
|
@@ -602,6 +607,7 @@ class ReservationManager:
|
|
|
602
607
|
"recreate_env": recreate_env,
|
|
603
608
|
"is_multinode": True,
|
|
604
609
|
"no_persistent_disk": no_persistent_disk,
|
|
610
|
+
"spot": spot,
|
|
605
611
|
}
|
|
606
612
|
|
|
607
613
|
if github_user:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.27"
|
|
7
|
+
version = "0.5.29"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -59,6 +59,7 @@ ECR_REPOSITORY_URL = os.environ.get("ECR_REPOSITORY_URL")
|
|
|
59
59
|
# Version validation - injected via Terraform
|
|
60
60
|
LAMBDA_VERSION = os.environ.get("LAMBDA_VERSION", "0.3.9")
|
|
61
61
|
MIN_CLI_VERSION = os.environ.get("MIN_CLI_VERSION", "0.3.9")
|
|
62
|
+
SPOT_GPU_TYPES = os.environ.get("SPOT_GPU_TYPES", "")
|
|
62
63
|
OPERATIONS_TABLE = os.environ.get("OPERATIONS_TABLE", "pytorch-gpu-dev-operations")
|
|
63
64
|
|
|
64
65
|
# GPU Configuration - single source of truth for all GPU type mappings
|
|
@@ -81,7 +82,7 @@ GPU_CONFIG = {
|
|
|
81
82
|
"h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
82
83
|
"h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
83
84
|
"b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
84
|
-
"b300": {"instance_type": "
|
|
85
|
+
"b300": {"instance_type": "p6-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
|
|
85
86
|
"cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
86
87
|
"cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
87
88
|
}
|
|
@@ -2206,6 +2207,19 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
|
|
|
2206
2207
|
logger.warning(f"User {user_id} blocked from {gpu_type}: maintenance mode")
|
|
2207
2208
|
return False, error_msg
|
|
2208
2209
|
|
|
2210
|
+
# Spot acknowledgment: if this workspace marks the GPU type as spot-only and
|
|
2211
|
+
# the user didn't pass --spot, reject with a clear message.
|
|
2212
|
+
if SPOT_GPU_TYPES and not request.get("spot", False):
|
|
2213
|
+
is_spot = SPOT_GPU_TYPES.strip() == "all" or gpu_type in [t.strip() for t in SPOT_GPU_TYPES.split(",")]
|
|
2214
|
+
if is_spot:
|
|
2215
|
+
error_msg = (
|
|
2216
|
+
f"{gpu_type.upper()} is only available as a spot instance in this environment. "
|
|
2217
|
+
f"Spot instances are ~1/3 the cost but can be reclaimed by AWS with 2-min notice. "
|
|
2218
|
+
f"Pass --spot to confirm: gpu-dev reserve --gpu-type {gpu_type} --spot"
|
|
2219
|
+
)
|
|
2220
|
+
logger.warning(f"Reservation: spot acknowledgment missing for {gpu_type}")
|
|
2221
|
+
return False, error_msg
|
|
2222
|
+
|
|
2209
2223
|
# Validate GPU count based on type
|
|
2210
2224
|
if gpu_type.startswith("cpu-") and gpu_count == 0:
|
|
2211
2225
|
pass # Valid CPU-only instance
|
|
@@ -6531,7 +6545,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
|
|
|
6531
6545
|
"p5e.48xlarge": "H200",
|
|
6532
6546
|
"p5en.48xlarge": "H200",
|
|
6533
6547
|
"p6-b200.48xlarge": "B200",
|
|
6534
|
-
"
|
|
6548
|
+
"p6-b300.48xlarge": "B300",
|
|
6535
6549
|
}
|
|
6536
6550
|
|
|
6537
6551
|
gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
|
|
@@ -180,8 +180,13 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.28"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.16"
|
|
185
|
+
# Comma-separated GPU types that require --spot flag, or "all" for every type.
|
|
186
|
+
# Empty = no spot types (on-demand / reserved). Set per-workspace.
|
|
187
|
+
SPOT_GPU_TYPES = lookup({
|
|
188
|
+
"prod-east1" = "all"
|
|
189
|
+
}, terraform.workspace, "")
|
|
185
190
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
191
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
192
|
}, local.alb_env_vars)
|
|
@@ -339,7 +339,7 @@ locals {
|
|
|
339
339
|
# sits at 0 and gpu-dev reservations queue. Bump counts once we see what
|
|
340
340
|
# actually gets fulfilled in us-east-1.
|
|
341
341
|
"b300" = {
|
|
342
|
-
instance_type = "
|
|
342
|
+
instance_type = "p6-b300.48xlarge"
|
|
343
343
|
instance_types = null
|
|
344
344
|
instance_count = 1
|
|
345
345
|
gpus_per_instance = 8
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|