gpu-dev 0.5.22__tar.gz → 0.5.24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/PKG-INFO +1 -1
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +13 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +11 -2
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/pyproject.toml +1 -1
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_processor/index.py +34 -1
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/.gitignore +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/CLAUDE.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/PROGRESS.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/README.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/TODO.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/admin/README.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/post.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/setup.cfg +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/variables.tf +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/tests/submit/README.md +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/tests/submit/fail/run.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/tests/submit/multinode/run.sh +0 -0
- {gpu_dev-0.5.22 → gpu_dev-0.5.24}/tests/submit/success/run.sh +0 -0
|
@@ -38,6 +38,19 @@ def _load_auth_cache(github_user: str) -> Optional[Dict[str, Any]]:
|
|
|
38
38
|
return None
|
|
39
39
|
if time.time() - float(entry.get("ts", 0)) > _AUTH_CACHE_TTL_SECONDS:
|
|
40
40
|
return None
|
|
41
|
+
# Defense against stale cache on a persistent disk that pre-dates the IRSA fix:
|
|
42
|
+
# if AWS_ROLE_ARN points at a role the cached ARN doesn't reference, the cache
|
|
43
|
+
# is from a different identity (e.g. IMDS-fallback before fs_group=1081 landed)
|
|
44
|
+
# and should be ignored.
|
|
45
|
+
expected_role_arn = os.environ.get("AWS_ROLE_ARN", "")
|
|
46
|
+
cached_arn = (entry.get("result") or {}).get("arn", "")
|
|
47
|
+
if expected_role_arn:
|
|
48
|
+
try:
|
|
49
|
+
role_name = expected_role_arn.rsplit("/", 1)[-1]
|
|
50
|
+
if role_name and role_name not in cached_arn:
|
|
51
|
+
return None
|
|
52
|
+
except Exception:
|
|
53
|
+
pass
|
|
41
54
|
return entry.get("result")
|
|
42
55
|
except Exception:
|
|
43
56
|
return None
|
|
@@ -240,8 +240,17 @@ class Config:
|
|
|
240
240
|
return self.user_config.get(key)
|
|
241
241
|
|
|
242
242
|
def get_github_username(self) -> Optional[str]:
|
|
243
|
-
"""Get GitHub username
|
|
244
|
-
|
|
243
|
+
"""Get GitHub username, falling back to GPU_DEV_GITHUB_USER env var.
|
|
244
|
+
|
|
245
|
+
Lambda sets GPU_DEV_GITHUB_USER on every pod from the reservation's
|
|
246
|
+
github_user field, so a user running gpu-dev from inside their dev pod
|
|
247
|
+
doesn't have to `gpu-dev config set github_user <name>` first.
|
|
248
|
+
"""
|
|
249
|
+
v = self.user_config.get("github_user")
|
|
250
|
+
if v:
|
|
251
|
+
return v
|
|
252
|
+
v = os.environ.get("GPU_DEV_GITHUB_USER")
|
|
253
|
+
return v or None
|
|
245
254
|
|
|
246
255
|
|
|
247
256
|
def load_config() -> Config:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.22"
|
|
7
|
+
version = "0.5.24"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -2888,6 +2888,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2888
2888
|
gpu_count=gpu_count,
|
|
2889
2889
|
gpu_type=gpu_type,
|
|
2890
2890
|
github_public_key=github_public_key,
|
|
2891
|
+
github_user=github_user,
|
|
2891
2892
|
reservation_id=reservation_id,
|
|
2892
2893
|
jupyter_enabled=jupyter_enabled,
|
|
2893
2894
|
persistent_volume_id=persistent_volume_id,
|
|
@@ -3434,7 +3435,8 @@ def create_kubernetes_resources(
|
|
|
3434
3435
|
gpu_count: int,
|
|
3435
3436
|
gpu_type: str,
|
|
3436
3437
|
github_public_key: str,
|
|
3437
|
-
|
|
3438
|
+
github_user: str = "",
|
|
3439
|
+
reservation_id: str = None,
|
|
3438
3440
|
jupyter_enabled: bool = False,
|
|
3439
3441
|
persistent_volume_id: str = None,
|
|
3440
3442
|
user_id: str = None,
|
|
@@ -3538,6 +3540,7 @@ def create_kubernetes_resources(
|
|
|
3538
3540
|
gpu_count,
|
|
3539
3541
|
gpu_type,
|
|
3540
3542
|
github_public_key,
|
|
3543
|
+
github_user=github_user,
|
|
3541
3544
|
jupyter_enabled=True,
|
|
3542
3545
|
persistent_volume_id=persistent_volume_id,
|
|
3543
3546
|
user_id=user_id,
|
|
@@ -3627,6 +3630,7 @@ def create_kubernetes_resources(
|
|
|
3627
3630
|
gpu_count,
|
|
3628
3631
|
gpu_type,
|
|
3629
3632
|
github_public_key,
|
|
3633
|
+
github_user=github_user,
|
|
3630
3634
|
jupyter_enabled=False,
|
|
3631
3635
|
persistent_volume_id=persistent_volume_id,
|
|
3632
3636
|
user_id=user_id,
|
|
@@ -3979,6 +3983,7 @@ def create_pod(
|
|
|
3979
3983
|
gpu_count: int,
|
|
3980
3984
|
gpu_type: str,
|
|
3981
3985
|
github_public_key: str,
|
|
3986
|
+
github_user: str = "",
|
|
3982
3987
|
jupyter_enabled: bool = False,
|
|
3983
3988
|
persistent_volume_id: str = None,
|
|
3984
3989
|
user_id: str = None,
|
|
@@ -4486,6 +4491,18 @@ export MULTINODE_SIZE="$MULTINODE_SIZE"
|
|
|
4486
4491
|
export MASTER_ADDR="$MASTER_ADDR"
|
|
4487
4492
|
export MASTER_PORT="$MASTER_PORT"
|
|
4488
4493
|
|
|
4494
|
+
# IRSA + region — same reason as MULTINODE: sshd strips these from login shells, so
|
|
4495
|
+
# we bake the current container values into the rc file. Lets gpu-dev / aws / boto3
|
|
4496
|
+
# inside an SSH session pick up the gpu-dev-pod-sa IAM role automatically.
|
|
4497
|
+
export AWS_ROLE_ARN="$AWS_ROLE_ARN"
|
|
4498
|
+
export AWS_WEB_IDENTITY_TOKEN_FILE="$AWS_WEB_IDENTITY_TOKEN_FILE"
|
|
4499
|
+
export AWS_ROLE_SESSION_NAME="$AWS_ROLE_SESSION_NAME"
|
|
4500
|
+
export AWS_REGION="$AWS_REGION"
|
|
4501
|
+
export AWS_DEFAULT_REGION="$AWS_DEFAULT_REGION"
|
|
4502
|
+
export AWS_STS_REGIONAL_ENDPOINTS="$AWS_STS_REGIONAL_ENDPOINTS"
|
|
4503
|
+
# CLI falls back to this when ~/.config/gpu-dev/config.json has no github_user
|
|
4504
|
+
export GPU_DEV_GITHUB_USER="$GPU_DEV_GITHUB_USER"
|
|
4505
|
+
|
|
4489
4506
|
# Function to check for GPU reservation expiry warnings and startup script status
|
|
4490
4507
|
check_warnings() {{
|
|
4491
4508
|
# Check for startup script still running
|
|
@@ -4539,6 +4556,15 @@ export MULTINODE_SIZE="$MULTINODE_SIZE"
|
|
|
4539
4556
|
export MASTER_ADDR="$MASTER_ADDR"
|
|
4540
4557
|
export MASTER_PORT="$MASTER_PORT"
|
|
4541
4558
|
|
|
4559
|
+
# IRSA + region (see .bashrc_ext for rationale)
|
|
4560
|
+
export AWS_ROLE_ARN="$AWS_ROLE_ARN"
|
|
4561
|
+
export AWS_WEB_IDENTITY_TOKEN_FILE="$AWS_WEB_IDENTITY_TOKEN_FILE"
|
|
4562
|
+
export AWS_ROLE_SESSION_NAME="$AWS_ROLE_SESSION_NAME"
|
|
4563
|
+
export AWS_REGION="$AWS_REGION"
|
|
4564
|
+
export AWS_DEFAULT_REGION="$AWS_DEFAULT_REGION"
|
|
4565
|
+
export AWS_STS_REGIONAL_ENDPOINTS="$AWS_STS_REGIONAL_ENDPOINTS"
|
|
4566
|
+
export GPU_DEV_GITHUB_USER="$GPU_DEV_GITHUB_USER"
|
|
4567
|
+
|
|
4542
4568
|
# Function to check for GPU reservation expiry warnings and startup script status
|
|
4543
4569
|
check_warnings() {{
|
|
4544
4570
|
# Check for startup script still running
|
|
@@ -5314,6 +5340,9 @@ EOF
|
|
|
5314
5340
|
),
|
|
5315
5341
|
client.V1EnvVar(
|
|
5316
5342
|
name="AWS_ROLE_SESSION_NAME", value=(user_id or "gpu-dev-pod")[:64]
|
|
5343
|
+
),
|
|
5344
|
+
client.V1EnvVar(
|
|
5345
|
+
name="GPU_DEV_GITHUB_USER", value=github_user or ""
|
|
5317
5346
|
)
|
|
5318
5347
|
] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
|
|
5319
5348
|
resources=client.V1ResourceRequirements(
|
|
@@ -5501,6 +5530,10 @@ EOF
|
|
|
5501
5530
|
# with the AWS_ROLE_SESSION_NAME env var below this lets users run
|
|
5502
5531
|
# `gpu-dev submit` from inside their dev pod with no manual aws sso login.
|
|
5503
5532
|
service_account_name="gpu-dev-pod-sa",
|
|
5533
|
+
# fs_group=1081 makes the IRSA-projected token (default 0600 root:root)
|
|
5534
|
+
# readable by the dev user. Without it boto3-as-dev falls through to IMDS
|
|
5535
|
+
# and gets the node's IAM role, which doesn't have DDB/SQS permissions.
|
|
5536
|
+
security_context=client.V1PodSecurityContext(fs_group=1081),
|
|
5504
5537
|
# EFA requires host network namespace for RDMA access to efa0 interface
|
|
5505
5538
|
**({
|
|
5506
5539
|
"host_network": True,
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.25"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.16"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.22 → gpu_dev-0.5.24}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|