gpu-dev 0.5.26__tar.gz → 0.5.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/PKG-INFO +1 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +5 -2
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +1 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/pyproject.toml +1 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/eks.tf +1 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/index.py +4 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/main.tf +79 -2
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/node-termination-handler.tf +2 -1
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/route53.tf +13 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/.gitignore +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/CLAUDE.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/PROGRESS.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/TODO.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/admin/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/post.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/setup.cfg +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.26 → gpu_dev-0.5.27}/tests/submit/success/run.sh +0 -0
|
@@ -496,7 +496,7 @@ def main(ctx: click.Context) -> None:
|
|
|
496
496
|
"--gpu-type",
|
|
497
497
|
"-t",
|
|
498
498
|
type=click.Choice(
|
|
499
|
-
["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
499
|
+
["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
500
500
|
),
|
|
501
501
|
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
|
|
502
502
|
)
|
|
@@ -662,6 +662,7 @@ def reserve(
|
|
|
662
662
|
"b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
|
|
663
663
|
"h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
|
|
664
664
|
"b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
|
|
665
|
+
"b300": {"max_gpus": 8, "instance_type": "p6e-b300.48xlarge"},
|
|
665
666
|
"cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
|
|
666
667
|
"cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
|
|
667
668
|
}
|
|
@@ -1350,7 +1351,7 @@ def reserve(
|
|
|
1350
1351
|
rprint(f"[red]❌ Error: {str(e)}[/red]")
|
|
1351
1352
|
|
|
1352
1353
|
|
|
1353
|
-
_SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1354
|
+
_SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
|
|
1354
1355
|
"h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
|
|
1355
1356
|
"a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
|
|
1356
1357
|
|
|
@@ -2719,6 +2720,7 @@ def _show_availability() -> None:
|
|
|
2719
2720
|
# GPU architecture mapping (for display)
|
|
2720
2721
|
gpu_architectures = {
|
|
2721
2722
|
"b200": "Blackwell (sm100)",
|
|
2723
|
+
"b300": "Blackwell (sm100)",
|
|
2722
2724
|
"h200": "Hopper (sm90)",
|
|
2723
2725
|
"h100": "Hopper (sm90)",
|
|
2724
2726
|
"a100": "Ampere (sm80)",
|
|
@@ -2880,6 +2882,7 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2880
2882
|
# GPU architecture mapping (for display)
|
|
2881
2883
|
gpu_architectures = {
|
|
2882
2884
|
"b200": "Blackwell (sm100)",
|
|
2885
|
+
"b300": "Blackwell (sm100)",
|
|
2883
2886
|
"h200": "Hopper (sm90)",
|
|
2884
2887
|
"h100": "Hopper (sm90)",
|
|
2885
2888
|
"a100": "Ampere (sm80)",
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.26"
|
|
7
|
+
version = "0.5.27"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -81,6 +81,7 @@ GPU_CONFIG = {
|
|
|
81
81
|
"h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
82
82
|
"h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
83
83
|
"b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
|
|
84
|
+
"b300": {"instance_type": "p6e-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
|
|
84
85
|
"cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
85
86
|
"cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
86
87
|
}
|
|
@@ -2188,7 +2189,7 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
|
|
|
2188
2189
|
# Validate GPU type
|
|
2189
2190
|
valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
|
|
2190
2191
|
"h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
|
|
2191
|
-
"h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
|
|
2192
|
+
"h200", "b200", "b300", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
|
|
2192
2193
|
"cpu-arm", "cpu-x86"]
|
|
2193
2194
|
if gpu_type not in valid_gpu_types:
|
|
2194
2195
|
error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
|
|
@@ -2435,6 +2436,7 @@ def update_gpu_availability_table(
|
|
|
2435
2436
|
"b200-mig-3g": {"gpus_per_instance": 2},
|
|
2436
2437
|
"h200": {"gpus_per_instance": 8},
|
|
2437
2438
|
"b200": {"gpus_per_instance": 8},
|
|
2439
|
+
"b300": {"gpus_per_instance": 8},
|
|
2438
2440
|
}
|
|
2439
2441
|
|
|
2440
2442
|
gpu_config = gpu_type_configs.get(gpu_type, {"gpus_per_instance": 8})
|
|
@@ -6529,6 +6531,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
|
|
|
6529
6531
|
"p5e.48xlarge": "H200",
|
|
6530
6532
|
"p5en.48xlarge": "H200",
|
|
6531
6533
|
"p6-b200.48xlarge": "B200",
|
|
6534
|
+
"p6e-b300.48xlarge": "B300",
|
|
6532
6535
|
}
|
|
6533
6536
|
|
|
6534
6537
|
gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.26"
|
|
183
|
+
LAMBDA_VERSION = "0.5.27"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.16"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
@@ -334,6 +334,60 @@ locals {
|
|
|
334
334
|
use_self_managed_nodes = true
|
|
335
335
|
instance_type = "g4dn.12xlarge"
|
|
336
336
|
supported_gpu_types = {
|
|
337
|
+
# 8-GPU spot instances. instance_count=1 means the ASG tries to maintain 1
|
|
338
|
+
# spot instance per type — if AWS can't grant it (capacity / quota), the ASG
|
|
339
|
+
# sits at 0 and gpu-dev reservations queue. Bump counts once we see what
|
|
340
|
+
# actually gets fulfilled in us-east-1.
|
|
341
|
+
"b300" = {
|
|
342
|
+
instance_type = "p6e-b300.48xlarge"
|
|
343
|
+
instance_types = null
|
|
344
|
+
instance_count = 1
|
|
345
|
+
gpus_per_instance = 8
|
|
346
|
+
use_placement_group = false
|
|
347
|
+
architecture = "x86_64"
|
|
348
|
+
efa_network_cards = 8
|
|
349
|
+
use_spot = true
|
|
350
|
+
}
|
|
351
|
+
"b200" = {
|
|
352
|
+
instance_type = "p6-b200.48xlarge"
|
|
353
|
+
instance_types = null
|
|
354
|
+
instance_count = 1
|
|
355
|
+
gpus_per_instance = 8
|
|
356
|
+
use_placement_group = false
|
|
357
|
+
architecture = "x86_64"
|
|
358
|
+
efa_network_cards = 8
|
|
359
|
+
use_spot = true
|
|
360
|
+
}
|
|
361
|
+
"h200" = {
|
|
362
|
+
instance_type = "p5e.48xlarge"
|
|
363
|
+
instance_types = null
|
|
364
|
+
instance_count = 1
|
|
365
|
+
gpus_per_instance = 8
|
|
366
|
+
use_placement_group = false
|
|
367
|
+
architecture = "x86_64"
|
|
368
|
+
efa_network_cards = 16
|
|
369
|
+
use_spot = true
|
|
370
|
+
}
|
|
371
|
+
"h100" = {
|
|
372
|
+
instance_type = "p5.48xlarge"
|
|
373
|
+
instance_types = null
|
|
374
|
+
instance_count = 1
|
|
375
|
+
gpus_per_instance = 8
|
|
376
|
+
use_placement_group = false
|
|
377
|
+
architecture = "x86_64"
|
|
378
|
+
efa_network_cards = 32
|
|
379
|
+
use_spot = true
|
|
380
|
+
}
|
|
381
|
+
"a100" = {
|
|
382
|
+
instance_type = "p4d.24xlarge"
|
|
383
|
+
instance_types = null
|
|
384
|
+
instance_count = 1
|
|
385
|
+
gpus_per_instance = 8
|
|
386
|
+
use_placement_group = false
|
|
387
|
+
architecture = "x86_64"
|
|
388
|
+
efa_network_cards = 4
|
|
389
|
+
use_spot = true
|
|
390
|
+
}
|
|
337
391
|
"t4" = {
|
|
338
392
|
instance_type = "g4dn.12xlarge"
|
|
339
393
|
instance_types = null
|
|
@@ -421,8 +475,15 @@ locals {
|
|
|
421
475
|
# Workspace-specific GPU type to subnet mappings
|
|
422
476
|
gpu_subnet_assignments = {
|
|
423
477
|
"prod-east1" = {
|
|
424
|
-
# All node types land in the primary subnet (us-east-1a).
|
|
425
|
-
#
|
|
478
|
+
# All node types land in the primary subnet (us-east-1a). Multi-EFA types
|
|
479
|
+
# (efa_network_cards > 1) automatically use the private subnet in the same AZ.
|
|
480
|
+
# Specific instance types may not have capacity in us-east-1a — those ASGs will
|
|
481
|
+
# sit at 0 until we widen to other AZs, that's expected for beta.
|
|
482
|
+
b300 = "primary"
|
|
483
|
+
b200 = "primary"
|
|
484
|
+
h200 = "primary"
|
|
485
|
+
h100 = "primary"
|
|
486
|
+
a100 = "primary"
|
|
426
487
|
t4 = "primary"
|
|
427
488
|
l4 = "primary"
|
|
428
489
|
"cpu-x86" = "primary"
|
|
@@ -451,6 +512,22 @@ locals {
|
|
|
451
512
|
}
|
|
452
513
|
}
|
|
453
514
|
|
|
515
|
+
# Subdomain NS delegations to create in *this* workspace's parent zone. Lets
|
|
516
|
+
# prod (which owns devservers.io) auto-publish NS records pointing at child zones
|
|
517
|
+
# in other workspaces (prod-east1, future regions) without manual -var flags.
|
|
518
|
+
# The NS values come from `tofu output devservers_name_servers` in the child
|
|
519
|
+
# workspace once its hosted zone has been created.
|
|
520
|
+
prod_subdomain_delegations = {
|
|
521
|
+
prod = {
|
|
522
|
+
"east1.devservers.io" = [
|
|
523
|
+
"ns-1079.awsdns-06.org",
|
|
524
|
+
"ns-1999.awsdns-57.co.uk",
|
|
525
|
+
"ns-341.awsdns-42.com",
|
|
526
|
+
"ns-624.awsdns-14.net",
|
|
527
|
+
]
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
|
|
454
531
|
# Per-capacity-reservation AZ mappings (overrides gpu_subnet_assignments when CR is used)
|
|
455
532
|
capacity_reservation_azs = {
|
|
456
533
|
"prod-east1" = {
|
|
@@ -12,7 +12,8 @@ resource "helm_release" "aws_node_termination_handler" {
|
|
|
12
12
|
repository = "https://aws.github.io/eks-charts"
|
|
13
13
|
chart = "aws-node-termination-handler"
|
|
14
14
|
namespace = "kube-system"
|
|
15
|
-
version
|
|
15
|
+
# No version pin — chart versions advance frequently and my first guess (0.27.1)
|
|
16
|
+
# didn't exist. helm picks current latest stable. Add a pin once we hit a regression.
|
|
16
17
|
cleanup_on_fail = true
|
|
17
18
|
|
|
18
19
|
values = [yamlencode({
|
|
@@ -51,6 +51,19 @@ resource "aws_route53_record" "manual_subdomain_delegation" {
|
|
|
51
51
|
records = var.subdomain_ns_records
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
+
# Auto-published NS delegations for child workspaces. Iterates prod_subdomain_delegations
|
|
55
|
+
# (defined in main.tf) for the current workspace and creates an NS record per entry in
|
|
56
|
+
# the parent zone — so `tofu apply` in prod automatically wires up east1.devservers.io
|
|
57
|
+
# (and any future region) without -var flags.
|
|
58
|
+
resource "aws_route53_record" "workspace_subdomain_delegations" {
|
|
59
|
+
for_each = local.effective_domain_name != "" && !local.is_subdomain ? try(local.prod_subdomain_delegations[terraform.workspace], {}) : {}
|
|
60
|
+
zone_id = data.aws_route53_zone.parent[0].zone_id
|
|
61
|
+
name = each.key
|
|
62
|
+
type = "NS"
|
|
63
|
+
ttl = 300
|
|
64
|
+
records = each.value
|
|
65
|
+
}
|
|
66
|
+
|
|
54
67
|
# Use appropriate hosted zone (subdomain if created, otherwise parent)
|
|
55
68
|
locals {
|
|
56
69
|
hosted_zone_id = local.is_subdomain ? aws_route53_zone.subdomain[0].zone_id : (local.effective_domain_name != "" ? data.aws_route53_zone.parent[0].zone_id : "")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|