gpu-dev 0.5.24__tar.gz → 0.5.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/PKG-INFO +1 -1
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +29 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +5 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +10 -1
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/pyproject.toml +1 -1
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/eks.tf +6 -3
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/main.tf +64 -0
- gpu_dev-0.5.26/terraform-gpu-devservers/node-termination-handler.tf +36 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/.gitignore +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/CLAUDE.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/PROGRESS.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/README.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/TODO.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/admin/README.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/post.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/setup.cfg +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.24 → gpu_dev-0.5.26}/tests/submit/success/run.sh +0 -0
|
@@ -52,6 +52,7 @@ terraform-gpu-devservers/main.tf
|
|
|
52
52
|
terraform-gpu-devservers/mig-config.tf
|
|
53
53
|
terraform-gpu-devservers/mig-parted-config.yaml
|
|
54
54
|
terraform-gpu-devservers/monitoring.tf
|
|
55
|
+
terraform-gpu-devservers/node-termination-handler.tf
|
|
55
56
|
terraform-gpu-devservers/outputs.tf
|
|
56
57
|
terraform-gpu-devservers/pyproject.toml
|
|
57
58
|
terraform-gpu-devservers/queue.tf
|
|
@@ -4,6 +4,7 @@ Reserve and manage GPU development servers
|
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
6
|
import click
|
|
7
|
+
import os
|
|
7
8
|
from typing import Optional
|
|
8
9
|
from rich.console import Console
|
|
9
10
|
from rich.table import Table
|
|
@@ -3218,6 +3219,34 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
3218
3219
|
if "-A" not in ssh_command and "-o ForwardAgent=yes" not in ssh_command:
|
|
3219
3220
|
ssh_command = ssh_command.replace("ssh ", "ssh -A ", 1)
|
|
3220
3221
|
|
|
3222
|
+
# Inject AddKeysToAgent so the first connect from this laptop loads the user's
|
|
3223
|
+
# IdentityFile into ssh-agent — without this the forwarded agent is empty on
|
|
3224
|
+
# subsequent pod→pod hops. UseKeychain persists the passphrase across reboots on
|
|
3225
|
+
# macOS; IgnoreUnknown lets Linux SSH ignore the macOS-only option cleanly.
|
|
3226
|
+
# The same options live in ~/.gpu-dev/<id>-sshconfig but ssh only honours them
|
|
3227
|
+
# when the command-line target matches a Host block, which this connect command
|
|
3228
|
+
# bypasses by passing the FQDN directly.
|
|
3229
|
+
if "AddKeysToAgent" not in ssh_command:
|
|
3230
|
+
ssh_command = ssh_command.replace(
|
|
3231
|
+
"ssh ",
|
|
3232
|
+
"ssh -o AddKeysToAgent=yes -o IgnoreUnknown=UseKeychain -o UseKeychain=yes ",
|
|
3233
|
+
1,
|
|
3234
|
+
)
|
|
3235
|
+
|
|
3236
|
+
# When running from inside a gpu-dev pod (=GPU_DEV_USER_ID env var set) and the
|
|
3237
|
+
# forwarded SSH agent is reachable but empty, the next hop is going to fail with
|
|
3238
|
+
# 'Permission denied (publickey)'. Warn upfront so the user knows to ssh-add on
|
|
3239
|
+
# their laptop instead of debugging an opaque auth failure on the remote side.
|
|
3240
|
+
if os.environ.get("GPU_DEV_USER_ID") and os.environ.get("SSH_AUTH_SOCK"):
|
|
3241
|
+
try:
|
|
3242
|
+
import subprocess as _sp
|
|
3243
|
+
r = _sp.run(["ssh-add", "-L"], capture_output=True, text=True, timeout=3)
|
|
3244
|
+
if r.returncode != 0 or not r.stdout.strip() or "no identities" in r.stdout.lower():
|
|
3245
|
+
rprint("[yellow]⚠️ Forwarded SSH agent is empty — second-hop SSH from a pod will fail auth.[/yellow]")
|
|
3246
|
+
rprint("[yellow] On your laptop: `ssh-add ~/.ssh/id_ed25519` (or your GitHub key), then reconnect to this pod with `gpu-dev connect`.[/yellow]\n")
|
|
3247
|
+
except Exception:
|
|
3248
|
+
pass
|
|
3249
|
+
|
|
3221
3250
|
# Parse and execute the command, capturing exit code for auth failures
|
|
3222
3251
|
rprint(f"[dim]Executing: {ssh_command}[/dim]\n")
|
|
3223
3252
|
result = subprocess.run(ssh_command, shell=True)
|
|
@@ -22,6 +22,11 @@ class Config:
|
|
|
22
22
|
"workspace": "prod",
|
|
23
23
|
"description": "Production environment",
|
|
24
24
|
},
|
|
25
|
+
"prod-east1": {
|
|
26
|
+
"region": "us-east-1",
|
|
27
|
+
"workspace": "prod-east1",
|
|
28
|
+
"description": "Spot-only us-east-1 environment (T4/L4/CPU)",
|
|
29
|
+
},
|
|
25
30
|
}
|
|
26
31
|
DEFAULT_ENVIRONMENT = "prod"
|
|
27
32
|
|
|
@@ -162,11 +162,20 @@ def _generate_ssh_config(hostname: str, pod_name: str) -> str:
|
|
|
162
162
|
Returns:
|
|
163
163
|
SSH config content as string
|
|
164
164
|
"""
|
|
165
|
+
import sys
|
|
166
|
+
# AddKeysToAgent makes SSH stash the IdentityFile into ssh-agent the first time it
|
|
167
|
+
# uses it for this Host, so the next agent-forwarding hop (pod → pod) actually has
|
|
168
|
+
# something to forward. On macOS, UseKeychain persists the passphrase via Keychain
|
|
169
|
+
# so users aren't prompted on every shell restart. Linux ssh errors on UseKeychain,
|
|
170
|
+
# so guard it with IgnoreUnknown.
|
|
171
|
+
extra = " AddKeysToAgent yes\n"
|
|
172
|
+
if sys.platform == "darwin":
|
|
173
|
+
extra += " IgnoreUnknown UseKeychain\n UseKeychain yes\n"
|
|
165
174
|
config_content = f"""Host {pod_name}
|
|
166
175
|
HostName {hostname}
|
|
167
176
|
User dev
|
|
168
177
|
ForwardAgent yes
|
|
169
|
-
ProxyCommand gpu-dev-ssh-proxy %h %p
|
|
178
|
+
{extra} ProxyCommand gpu-dev-ssh-proxy %h %p
|
|
170
179
|
StrictHostKeyChecking no
|
|
171
180
|
UserKnownHostsFile /dev/null
|
|
172
181
|
"""
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.24"
|
|
7
|
+
version = "0.5.26"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -401,11 +401,14 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
|
|
|
401
401
|
}
|
|
402
402
|
}
|
|
403
403
|
|
|
404
|
-
#
|
|
404
|
+
# instance_market_options: capacity-block when bound to a reservation, spot when
|
|
405
|
+
# the workspace's gpu_config has use_spot=true, otherwise on-demand (no block).
|
|
406
|
+
# Spot is mutually exclusive with capacity reservations — AWS rejects launch templates
|
|
407
|
+
# carrying both, so the precedence here is CR > spot > on-demand.
|
|
405
408
|
dynamic "instance_market_options" {
|
|
406
|
-
for_each = each.value.capacity_reservation_id != null ? [1] : []
|
|
409
|
+
for_each = (each.value.capacity_reservation_id != null || try(each.value.gpu_config.use_spot, false)) ? [1] : []
|
|
407
410
|
content {
|
|
408
|
-
market_type = "capacity-block"
|
|
411
|
+
market_type = each.value.capacity_reservation_id != null ? "capacity-block" : "spot"
|
|
409
412
|
}
|
|
410
413
|
}
|
|
411
414
|
|
|
@@ -58,6 +58,13 @@ provider "helm" {
|
|
|
58
58
|
# Data sources
|
|
59
59
|
data "aws_availability_zones" "available" {
|
|
60
60
|
state = "available"
|
|
61
|
+
# Exclude Local Zones (e.g. us-east-1-dfw-2a) and Wavelength Zones — EKS control
|
|
62
|
+
# plane only supports standard AZs. us-east-2 doesn't have Local Zones so the
|
|
63
|
+
# existing prod workspace was unaffected; us-east-1 has several (dfw, bos, …).
|
|
64
|
+
filter {
|
|
65
|
+
name = "opt-in-status"
|
|
66
|
+
values = ["opt-in-not-required"]
|
|
67
|
+
}
|
|
61
68
|
}
|
|
62
69
|
|
|
63
70
|
data "aws_caller_identity" "current" {}
|
|
@@ -315,6 +322,50 @@ locals {
|
|
|
315
322
|
}
|
|
316
323
|
}
|
|
317
324
|
}
|
|
325
|
+
# us-east-1 spot-only experimental cluster.
|
|
326
|
+
# Same provisioning shape as prod (managed via the terraform.workspace switch) but
|
|
327
|
+
# backed entirely by EC2 Spot — first cheap-and-cheerful environment we can deploy
|
|
328
|
+
# new instance types into (B300 land here once on-demand quota arrives).
|
|
329
|
+
"prod-east1" = {
|
|
330
|
+
aws_region = "us-east-1"
|
|
331
|
+
environment = "prod-east1"
|
|
332
|
+
domain_name = "east1.devservers.io"
|
|
333
|
+
gpu_instance_count = 1
|
|
334
|
+
use_self_managed_nodes = true
|
|
335
|
+
instance_type = "g4dn.12xlarge"
|
|
336
|
+
supported_gpu_types = {
|
|
337
|
+
"t4" = {
|
|
338
|
+
instance_type = "g4dn.12xlarge"
|
|
339
|
+
instance_types = null
|
|
340
|
+
instance_count = 1
|
|
341
|
+
gpus_per_instance = 4
|
|
342
|
+
use_placement_group = false
|
|
343
|
+
architecture = "x86_64"
|
|
344
|
+
efa_network_cards = 0
|
|
345
|
+
use_spot = true
|
|
346
|
+
}
|
|
347
|
+
"l4" = {
|
|
348
|
+
instance_type = "g6.12xlarge"
|
|
349
|
+
instance_types = null
|
|
350
|
+
instance_count = 1
|
|
351
|
+
gpus_per_instance = 4
|
|
352
|
+
use_placement_group = false
|
|
353
|
+
architecture = "x86_64"
|
|
354
|
+
efa_network_cards = 1
|
|
355
|
+
use_spot = true
|
|
356
|
+
}
|
|
357
|
+
"cpu-x86" = {
|
|
358
|
+
instance_type = "c7i.8xlarge"
|
|
359
|
+
instance_types = null
|
|
360
|
+
instance_count = 5
|
|
361
|
+
gpus_per_instance = 0
|
|
362
|
+
use_placement_group = false
|
|
363
|
+
architecture = "x86_64"
|
|
364
|
+
efa_network_cards = 0
|
|
365
|
+
use_spot = true
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
}
|
|
318
369
|
}
|
|
319
370
|
|
|
320
371
|
# Current workspace configuration
|
|
@@ -322,6 +373,9 @@ locals {
|
|
|
322
373
|
|
|
323
374
|
# Workspace-specific capacity reservations (with manual instance counts)
|
|
324
375
|
capacity_reservations = {
|
|
376
|
+
"prod-east1" = {
|
|
377
|
+
# No capacity reservations — this workspace is spot-only.
|
|
378
|
+
}
|
|
325
379
|
default = {
|
|
326
380
|
# Test environment capacity reservations
|
|
327
381
|
# h100 = [
|
|
@@ -366,6 +420,13 @@ locals {
|
|
|
366
420
|
|
|
367
421
|
# Workspace-specific GPU type to subnet mappings
|
|
368
422
|
gpu_subnet_assignments = {
|
|
423
|
+
"prod-east1" = {
|
|
424
|
+
# All node types land in the primary subnet (us-east-1a). Spot availability is
|
|
425
|
+
# better than placement-group-strictness on these small ASGs.
|
|
426
|
+
t4 = "primary"
|
|
427
|
+
l4 = "primary"
|
|
428
|
+
"cpu-x86" = "primary"
|
|
429
|
+
}
|
|
369
430
|
default = {
|
|
370
431
|
# Test environment - T4 nodes in multiple AZs for testing
|
|
371
432
|
t4 = "primary" # T4 in us-west-1a (primary AZ)
|
|
@@ -392,6 +453,9 @@ locals {
|
|
|
392
453
|
|
|
393
454
|
# Per-capacity-reservation AZ mappings (overrides gpu_subnet_assignments when CR is used)
|
|
394
455
|
capacity_reservation_azs = {
|
|
456
|
+
"prod-east1" = {
|
|
457
|
+
# Empty — no CRs in this workspace.
|
|
458
|
+
}
|
|
395
459
|
default = {
|
|
396
460
|
"cr-04d3d1d84e127a562" = "secondary" # us-west-1c
|
|
397
461
|
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# AWS Node Termination Handler — graceful drain on spot-interrupt + ASG lifecycle events.
|
|
2
|
+
#
|
|
3
|
+
# IMDS mode (one DaemonSet per node, no SQS / no IAM role) is plenty for our use case:
|
|
4
|
+
# we don't care about queue-processor features (rebalance recommendations, scheduled
|
|
5
|
+
# events). We just want pods to get a clean SIGTERM when AWS sends the 2-minute spot
|
|
6
|
+
# notice via instance metadata, instead of being killed cold.
|
|
7
|
+
#
|
|
8
|
+
# Tolerates everything so it runs on the GPU nodes that have nvidia.com/gpu:NoSchedule.
|
|
9
|
+
|
|
10
|
+
resource "helm_release" "aws_node_termination_handler" {
|
|
11
|
+
name = "aws-node-termination-handler"
|
|
12
|
+
repository = "https://aws.github.io/eks-charts"
|
|
13
|
+
chart = "aws-node-termination-handler"
|
|
14
|
+
namespace = "kube-system"
|
|
15
|
+
version = "0.27.1"
|
|
16
|
+
cleanup_on_fail = true
|
|
17
|
+
|
|
18
|
+
values = [yamlencode({
|
|
19
|
+
enableSpotInterruptionDraining = true
|
|
20
|
+
enableScheduledEventDraining = true
|
|
21
|
+
enableRebalanceMonitoring = true
|
|
22
|
+
enableRebalanceDraining = false # warning only; rebalance recommendations are too noisy
|
|
23
|
+
nodeSelector = {
|
|
24
|
+
"kubernetes.io/os" = "linux"
|
|
25
|
+
}
|
|
26
|
+
tolerations = [
|
|
27
|
+
{ operator = "Exists" }, # tolerate every taint; we want NTH on every node, including GPU nodes
|
|
28
|
+
]
|
|
29
|
+
resources = {
|
|
30
|
+
requests = { cpu = "50m", memory = "64Mi" }
|
|
31
|
+
limits = { cpu = "100m", memory = "128Mi" }
|
|
32
|
+
}
|
|
33
|
+
})]
|
|
34
|
+
|
|
35
|
+
depends_on = [aws_eks_cluster.gpu_dev_cluster]
|
|
36
|
+
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.24 → gpu_dev-0.5.26}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|