gpu-dev 0.5.25__tar.gz → 0.5.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/PKG-INFO +1 -1
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +19 -2
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +5 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +1 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/pyproject.toml +1 -1
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/eks.tf +7 -3
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/index.py +4 -1
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/main.tf +141 -0
- gpu_dev-0.5.27/terraform-gpu-devservers/node-termination-handler.tf +37 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/route53.tf +13 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/.gitignore +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/CLAUDE.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/PROGRESS.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/README.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/TODO.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/admin/README.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/post.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/setup.cfg +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.25 → gpu_dev-0.5.27}/tests/submit/success/run.sh +0 -0
|
@@ -52,6 +52,7 @@ terraform-gpu-devservers/main.tf
|
|
|
52
52
|
terraform-gpu-devservers/mig-config.tf
|
|
53
53
|
terraform-gpu-devservers/mig-parted-config.yaml
|
|
54
54
|
terraform-gpu-devservers/monitoring.tf
|
|
55
|
+
terraform-gpu-devservers/node-termination-handler.tf
|
|
55
56
|
terraform-gpu-devservers/outputs.tf
|
|
56
57
|
terraform-gpu-devservers/pyproject.toml
|
|
57
58
|
terraform-gpu-devservers/queue.tf
|
|
@@ -496,7 +496,7 @@ def main(ctx: click.Context) -> None:
|
|
|
496
496
|
"--gpu-type",
|
|
497
497
|
"-t",
|
|
498
498
|
type=click.Choice(
|
|
499
|
-
["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
499
|
+
["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
500
500
|
),
|
|
501
501
|
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
|
|
502
502
|
)
|
|
@@ -662,6 +662,7 @@ def reserve(
|
|
|
662
662
|
"b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
|
|
663
663
|
"h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
|
|
664
664
|
"b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
|
|
665
|
+
"b300": {"max_gpus": 8, "instance_type": "p6e-b300.48xlarge"},
|
|
665
666
|
"cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
|
|
666
667
|
"cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
|
|
667
668
|
}
|
|
@@ -1350,7 +1351,7 @@ def reserve(
|
|
|
1350
1351
|
rprint(f"[red]❌ Error: {str(e)}[/red]")
|
|
1351
1352
|
|
|
1352
1353
|
|
|
1353
|
-
_SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1354
|
+
_SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1354
1355
|
"h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
|
|
1355
1356
|
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
|
|
1356
1357
|
|
|
@@ -2719,6 +2720,7 @@ def _show_availability() -> None:
|
|
|
2719
2720
|
# GPU architecture mapping (for display)
|
|
2720
2721
|
gpu_architectures = {
|
|
2721
2722
|
"b200": "Blackwell (sm100)",
|
|
2723
|
+
"b300": "Blackwell (sm100)",
|
|
2722
2724
|
"h200": "Hopper (sm90)",
|
|
2723
2725
|
"h100": "Hopper (sm90)",
|
|
2724
2726
|
"a100": "Ampere (sm80)",
|
|
@@ -2880,6 +2882,7 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2880
2882
|
# GPU architecture mapping (for display)
|
|
2881
2883
|
gpu_architectures = {
|
|
2882
2884
|
"b200": "Blackwell (sm100)",
|
|
2885
|
+
"b300": "Blackwell (sm100)",
|
|
2883
2886
|
"h200": "Hopper (sm90)",
|
|
2884
2887
|
"h100": "Hopper (sm90)",
|
|
2885
2888
|
"a100": "Ampere (sm80)",
|
|
@@ -3219,6 +3222,20 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3219
3222
|
if "-A" not in ssh_command and "-o ForwardAgent=yes" not in ssh_command:
|
|
3220
3223
|
ssh_command = ssh_command.replace("ssh ", "ssh -A ", 1)
|
|
3221
3224
|
|
|
3225
|
+
# Inject AddKeysToAgent so the first connect from this laptop loads the user's
|
|
3226
|
+
# IdentityFile into ssh-agent — without this the forwarded agent is empty on
|
|
3227
|
+
# subsequent pod→pod hops. UseKeychain persists the passphrase across reboots on
|
|
3228
|
+
# macOS; IgnoreUnknown lets Linux SSH ignore the macOS-only option cleanly.
|
|
3229
|
+
# The same options live in ~/.gpu-dev/<id>-sshconfig but ssh only honours them
|
|
3230
|
+
# when the command-line target matches a Host block, which this connect command
|
|
3231
|
+
# bypasses by passing the FQDN directly.
|
|
3232
|
+
if "AddKeysToAgent" not in ssh_command:
|
|
3233
|
+
ssh_command = ssh_command.replace(
|
|
3234
|
+
"ssh ",
|
|
3235
|
+
"ssh -o AddKeysToAgent=yes -o IgnoreUnknown=UseKeychain -o UseKeychain=yes ",
|
|
3236
|
+
1,
|
|
3237
|
+
)
|
|
3238
|
+
|
|
3222
3239
|
# When running from inside a gpu-dev pod (=GPU_DEV_USER_ID env var set) and the
|
|
3223
3240
|
# forwarded SSH agent is reachable but empty, the next hop is going to fail with
|
|
3224
3241
|
# 'Permission denied (publickey)'. Warn upfront so the user knows to ssh-add on
|
|
@@ -22,6 +22,11 @@ class Config:
|
|
|
22
22
|
"workspace": "prod",
|
|
23
23
|
"description": "Production environment",
|
|
24
24
|
},
|
|
25
|
+
"prod-east1": {
|
|
26
|
+
"region": "us-east-1",
|
|
27
|
+
"workspace": "prod-east1",
|
|
28
|
+
"description": "Spot-only us-east-1 environment (T4/L4/CPU)",
|
|
29
|
+
},
|
|
25
30
|
}
|
|
26
31
|
DEFAULT_ENVIRONMENT = "prod"
|
|
27
32
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.25"
|
|
7
|
+
version = "0.5.27"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -189,6 +189,7 @@ locals {
|
|
|
189
189
|
"h100" = "h100"
|
|
190
190
|
"h200" = "h200"
|
|
191
191
|
"b200" = "b200"
|
|
192
|
+
"b300" = "b300"
|
|
192
193
|
"a100" = "a100"
|
|
193
194
|
"cpu-arm" = "cpu-arm"
|
|
194
195
|
"cpu-x86" = "cpu-x86"
|
|
@@ -401,11 +402,14 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
|
|
|
401
402
|
}
|
|
402
403
|
}
|
|
403
404
|
|
|
404
|
-
#
|
|
405
|
+
# instance_market_options: capacity-block when bound to a reservation, spot when
|
|
406
|
+
# the workspace's gpu_config has use_spot=true, otherwise on-demand (no block).
|
|
407
|
+
# Spot is mutually exclusive with capacity reservations — AWS rejects launch templates
|
|
408
|
+
# carrying both, so the precedence here is CR > spot > on-demand.
|
|
405
409
|
dynamic "instance_market_options" {
|
|
406
|
-
for_each = each.value.capacity_reservation_id != null ? [1] : []
|
|
410
|
+
for_each = (each.value.capacity_reservation_id != null || try(each.value.gpu_config.use_spot, false)) ? [1] : []
|
|
407
411
|
content {
|
|
408
|
-
market_type = "capacity-block"
|
|
412
|
+
market_type = each.value.capacity_reservation_id != null ? "capacity-block" : "spot"
|
|
409
413
|
}
|
|
410
414
|
}
|
|
411
415
|
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -81,6 +81,7 @@ GPU_CONFIG = {
|
|
|
81
81
|
"h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
82
82
|
"h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
83
83
|
"b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
84
|
+
"b300": {"instance_type": "p6e-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
|
|
84
85
|
"cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
85
86
|
"cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
86
87
|
}
|
|
@@ -2188,7 +2189,7 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
|
|
|
2188
2189
|
# Validate GPU type
|
|
2189
2190
|
valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
|
|
2190
2191
|
"h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
|
|
2191
|
-
"h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
|
|
2192
|
+
"h200", "b200", "b300", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
|
|
2192
2193
|
"cpu-arm", "cpu-x86"]
|
|
2193
2194
|
if gpu_type not in valid_gpu_types:
|
|
2194
2195
|
error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
|
|
@@ -2435,6 +2436,7 @@ def update_gpu_availability_table(
|
|
|
2435
2436
|
"b200-mig-3g": {"gpus_per_instance": 2},
|
|
2436
2437
|
"h200": {"gpus_per_instance": 8},
|
|
2437
2438
|
"b200": {"gpus_per_instance": 8},
|
|
2439
|
+
"b300": {"gpus_per_instance": 8},
|
|
2438
2440
|
}
|
|
2439
2441
|
|
|
2440
2442
|
gpu_config = gpu_type_configs.get(gpu_type, {"gpus_per_instance": 8})
|
|
@@ -6529,6 +6531,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
|
|
|
6529
6531
|
"p5e.48xlarge": "H200",
|
|
6530
6532
|
"p5en.48xlarge": "H200",
|
|
6531
6533
|
"p6-b200.48xlarge": "B200",
|
|
6534
|
+
"p6e-b300.48xlarge": "B300",
|
|
6532
6535
|
}
|
|
6533
6536
|
|
|
6534
6537
|
gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.27"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.16"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
@@ -58,6 +58,13 @@ provider "helm" {
|
|
|
58
58
|
# Data sources
|
|
59
59
|
data "aws_availability_zones" "available" {
|
|
60
60
|
state = "available"
|
|
61
|
+
# Exclude Local Zones (e.g. us-east-1-dfw-2a) and Wavelength Zones — EKS control
|
|
62
|
+
# plane only supports standard AZs. us-east-2 doesn't have Local Zones so the
|
|
63
|
+
# existing prod workspace was unaffected; us-east-1 has several (dfw, bos, …).
|
|
64
|
+
filter {
|
|
65
|
+
name = "opt-in-status"
|
|
66
|
+
values = ["opt-in-not-required"]
|
|
67
|
+
}
|
|
61
68
|
}
|
|
62
69
|
|
|
63
70
|
data "aws_caller_identity" "current" {}
|
|
@@ -315,6 +322,104 @@ locals {
|
|
|
315
322
|
}
|
|
316
323
|
}
|
|
317
324
|
}
|
|
325
|
+
# us-east-1 spot-only experimental cluster.
|
|
326
|
+
# Same provisioning shape as prod (managed via the terraform.workspace switch) but
|
|
327
|
+
# backed entirely by EC2 Spot — first cheap-and-cheerful environment we can deploy
|
|
328
|
+
# new instance types into (B300 lands here once on-demand quota arrives).
|
|
329
|
+
"prod-east1" = {
|
|
330
|
+
aws_region = "us-east-1"
|
|
331
|
+
environment = "prod-east1"
|
|
332
|
+
domain_name = "east1.devservers.io"
|
|
333
|
+
gpu_instance_count = 1
|
|
334
|
+
use_self_managed_nodes = true
|
|
335
|
+
instance_type = "g4dn.12xlarge"
|
|
336
|
+
supported_gpu_types = {
|
|
337
|
+
# 8-GPU spot instances. instance_count=1 means the ASG tries to maintain 1
|
|
338
|
+
# spot instance per type — if AWS can't grant it (capacity / quota), the ASG
|
|
339
|
+
# sits at 0 and gpu-dev reservations queue. Bump counts once we see what
|
|
340
|
+
# actually gets fulfilled in us-east-1.
|
|
341
|
+
"b300" = {
|
|
342
|
+
instance_type = "p6e-b300.48xlarge"
|
|
343
|
+
instance_types = null
|
|
344
|
+
instance_count = 1
|
|
345
|
+
gpus_per_instance = 8
|
|
346
|
+
use_placement_group = false
|
|
347
|
+
architecture = "x86_64"
|
|
348
|
+
efa_network_cards = 8
|
|
349
|
+
use_spot = true
|
|
350
|
+
}
|
|
351
|
+
"b200" = {
|
|
352
|
+
instance_type = "p6-b200.48xlarge"
|
|
353
|
+
instance_types = null
|
|
354
|
+
instance_count = 1
|
|
355
|
+
gpus_per_instance = 8
|
|
356
|
+
use_placement_group = false
|
|
357
|
+
architecture = "x86_64"
|
|
358
|
+
efa_network_cards = 8
|
|
359
|
+
use_spot = true
|
|
360
|
+
}
|
|
361
|
+
"h200" = {
|
|
362
|
+
instance_type = "p5e.48xlarge"
|
|
363
|
+
instance_types = null
|
|
364
|
+
instance_count = 1
|
|
365
|
+
gpus_per_instance = 8
|
|
366
|
+
use_placement_group = false
|
|
367
|
+
architecture = "x86_64"
|
|
368
|
+
efa_network_cards = 16
|
|
369
|
+
use_spot = true
|
|
370
|
+
}
|
|
371
|
+
"h100" = {
|
|
372
|
+
instance_type = "p5.48xlarge"
|
|
373
|
+
instance_types = null
|
|
374
|
+
instance_count = 1
|
|
375
|
+
gpus_per_instance = 8
|
|
376
|
+
use_placement_group = false
|
|
377
|
+
architecture = "x86_64"
|
|
378
|
+
efa_network_cards = 32
|
|
379
|
+
use_spot = true
|
|
380
|
+
}
|
|
381
|
+
"a100" = {
|
|
382
|
+
instance_type = "p4d.24xlarge"
|
|
383
|
+
instance_types = null
|
|
384
|
+
instance_count = 1
|
|
385
|
+
gpus_per_instance = 8
|
|
386
|
+
use_placement_group = false
|
|
387
|
+
architecture = "x86_64"
|
|
388
|
+
efa_network_cards = 4
|
|
389
|
+
use_spot = true
|
|
390
|
+
}
|
|
391
|
+
"t4" = {
|
|
392
|
+
instance_type = "g4dn.12xlarge"
|
|
393
|
+
instance_types = null
|
|
394
|
+
instance_count = 1
|
|
395
|
+
gpus_per_instance = 4
|
|
396
|
+
use_placement_group = false
|
|
397
|
+
architecture = "x86_64"
|
|
398
|
+
efa_network_cards = 0
|
|
399
|
+
use_spot = true
|
|
400
|
+
}
|
|
401
|
+
"l4" = {
|
|
402
|
+
instance_type = "g6.12xlarge"
|
|
403
|
+
instance_types = null
|
|
404
|
+
instance_count = 1
|
|
405
|
+
gpus_per_instance = 4
|
|
406
|
+
use_placement_group = false
|
|
407
|
+
architecture = "x86_64"
|
|
408
|
+
efa_network_cards = 1
|
|
409
|
+
use_spot = true
|
|
410
|
+
}
|
|
411
|
+
"cpu-x86" = {
|
|
412
|
+
instance_type = "c7i.8xlarge"
|
|
413
|
+
instance_types = null
|
|
414
|
+
instance_count = 5
|
|
415
|
+
gpus_per_instance = 0
|
|
416
|
+
use_placement_group = false
|
|
417
|
+
architecture = "x86_64"
|
|
418
|
+
efa_network_cards = 0
|
|
419
|
+
use_spot = true
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
}
|
|
318
423
|
}
|
|
319
424
|
|
|
320
425
|
# Current workspace configuration
|
|
@@ -322,6 +427,9 @@ locals {
|
|
|
322
427
|
|
|
323
428
|
# Workspace-specific capacity reservations (with manual instance counts)
|
|
324
429
|
capacity_reservations = {
|
|
430
|
+
"prod-east1" = {
|
|
431
|
+
# No capacity reservations — this workspace is spot-only.
|
|
432
|
+
}
|
|
325
433
|
default = {
|
|
326
434
|
# Test environment capacity reservations
|
|
327
435
|
# h100 = [
|
|
@@ -366,6 +474,20 @@ locals {
|
|
|
366
474
|
|
|
367
475
|
# Workspace-specific GPU type to subnet mappings
|
|
368
476
|
gpu_subnet_assignments = {
|
|
477
|
+
"prod-east1" = {
|
|
478
|
+
# All node types land in the primary subnet (us-east-1a). Multi-EFA types
|
|
479
|
+
# (efa_network_cards > 1) automatically use the private subnet in the same AZ.
|
|
480
|
+
# Specific instance types may not have capacity in us-east-1a — those ASGs will
|
|
481
|
+
# sit at 0 until we widen to other AZs, that's expected for beta.
|
|
482
|
+
b300 = "primary"
|
|
483
|
+
b200 = "primary"
|
|
484
|
+
h200 = "primary"
|
|
485
|
+
h100 = "primary"
|
|
486
|
+
a100 = "primary"
|
|
487
|
+
t4 = "primary"
|
|
488
|
+
l4 = "primary"
|
|
489
|
+
"cpu-x86" = "primary"
|
|
490
|
+
}
|
|
369
491
|
default = {
|
|
370
492
|
# Test environment - T4 nodes in multiple AZs for testing
|
|
371
493
|
t4 = "primary" # T4 in us-west-1a (primary AZ)
|
|
@@ -390,8 +512,27 @@ locals {
|
|
|
390
512
|
}
|
|
391
513
|
}
|
|
392
514
|
|
|
515
|
+
# Subdomain NS delegations to create in *this* workspace's parent zone. Lets
|
|
516
|
+
# prod (which owns devservers.io) auto-publish NS records pointing at child zones
|
|
517
|
+
# in other workspaces (prod-east1, future regions) without manual -var flags.
|
|
518
|
+
# The NS values come from `tofu output devservers_name_servers` in the child
|
|
519
|
+
# workspace once its hosted zone has been created.
|
|
520
|
+
prod_subdomain_delegations = {
|
|
521
|
+
prod = {
|
|
522
|
+
"east1.devservers.io" = [
|
|
523
|
+
"ns-1079.awsdns-06.org",
|
|
524
|
+
"ns-1999.awsdns-57.co.uk",
|
|
525
|
+
"ns-341.awsdns-42.com",
|
|
526
|
+
"ns-624.awsdns-14.net",
|
|
527
|
+
]
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
|
|
393
531
|
# Per-capacity-reservation AZ mappings (overrides gpu_subnet_assignments when CR is used)
|
|
394
532
|
capacity_reservation_azs = {
|
|
533
|
+
"prod-east1" = {
|
|
534
|
+
# Empty — no CRs in this workspace.
|
|
535
|
+
}
|
|
395
536
|
default = {
|
|
396
537
|
"cr-04d3d1d84e127a562" = "secondary" # us-west-1c
|
|
397
538
|
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# AWS Node Termination Handler — graceful drain on spot-interrupt + ASG lifecycle events.
|
|
2
|
+
#
|
|
3
|
+
# IMDS mode (one DaemonSet per node, no SQS / no IAM role) is plenty for our use case:
|
|
4
|
+
# we don't care about queue-processor features (rebalance recommendations, scheduled
|
|
5
|
+
# events). We just want pods to get a clean SIGTERM when AWS sends the 2-minute spot
|
|
6
|
+
# notice via instance metadata, instead of being killed cold.
|
|
7
|
+
#
|
|
8
|
+
# Tolerates everything so it runs on the GPU nodes that have nvidia.com/gpu:NoSchedule.
|
|
9
|
+
|
|
10
|
+
resource "helm_release" "aws_node_termination_handler" {
|
|
11
|
+
name = "aws-node-termination-handler"
|
|
12
|
+
repository = "https://aws.github.io/eks-charts"
|
|
13
|
+
chart = "aws-node-termination-handler"
|
|
14
|
+
namespace = "kube-system"
|
|
15
|
+
# No version pin — chart versions advance frequently and my first guess (0.27.1)
|
|
16
|
+
# didn't exist. helm picks current latest stable. Add a pin once we hit a regression.
|
|
17
|
+
cleanup_on_fail = true
|
|
18
|
+
|
|
19
|
+
values = [yamlencode({
|
|
20
|
+
enableSpotInterruptionDraining = true
|
|
21
|
+
enableScheduledEventDraining = true
|
|
22
|
+
enableRebalanceMonitoring = true
|
|
23
|
+
enableRebalanceDraining = false # warning only; rebalance recommendations are too noisy
|
|
24
|
+
nodeSelector = {
|
|
25
|
+
"kubernetes.io/os" = "linux"
|
|
26
|
+
}
|
|
27
|
+
tolerations = [
|
|
28
|
+
{ operator = "Exists" }, # tolerate every taint; we want NTH on every node, including GPU nodes
|
|
29
|
+
]
|
|
30
|
+
resources = {
|
|
31
|
+
requests = { cpu = "50m", memory = "64Mi" }
|
|
32
|
+
limits = { cpu = "100m", memory = "128Mi" }
|
|
33
|
+
}
|
|
34
|
+
})]
|
|
35
|
+
|
|
36
|
+
depends_on = [aws_eks_cluster.gpu_dev_cluster]
|
|
37
|
+
}
|
|
@@ -51,6 +51,19 @@ resource "aws_route53_record" "manual_subdomain_delegation" {
|
|
|
51
51
|
records = var.subdomain_ns_records
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
+
# Auto-published NS delegations for child workspaces. Iterates prod_subdomain_delegations
|
|
55
|
+
# (defined in main.tf) for the current workspace and creates an NS record per entry in
|
|
56
|
+
# the parent zone — so `tofu apply` in prod automatically wires up east1.devservers.io
|
|
57
|
+
# (and any future region) without -var flags.
|
|
58
|
+
resource "aws_route53_record" "workspace_subdomain_delegations" {
|
|
59
|
+
for_each = local.effective_domain_name != "" && !local.is_subdomain ? try(local.prod_subdomain_delegations[terraform.workspace], {}) : {}
|
|
60
|
+
zone_id = data.aws_route53_zone.parent[0].zone_id
|
|
61
|
+
name = each.key
|
|
62
|
+
type = "NS"
|
|
63
|
+
ttl = 300
|
|
64
|
+
records = each.value
|
|
65
|
+
}
|
|
66
|
+
|
|
54
67
|
# Use appropriate hosted zone (subdomain if created, otherwise parent)
|
|
55
68
|
locals {
|
|
56
69
|
hosted_zone_id = local.is_subdomain ? aws_route53_zone.subdomain[0].zone_id : (local.effective_domain_name != "" ? data.aws_route53_zone.parent[0].zone_id : "")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|