gpu-dev 0.5.7__tar.gz → 0.5.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/PKG-INFO +1 -1
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +53 -2
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +45 -19
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +6 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/pyproject.toml +1 -1
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/availability.tf +9 -2
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/availability_updater/index.py +41 -3
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/.gitignore +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/CLAUDE.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/PROGRESS.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/TODO.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/admin/README.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/post.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/setup.cfg +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -1,11 +1,53 @@
|
|
|
1
1
|
"""Minimal AWS-only authentication for GPU Dev CLI"""
|
|
2
2
|
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
3
5
|
import subprocess
|
|
4
6
|
import re
|
|
5
|
-
|
|
7
|
+
import time
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, Any, Optional
|
|
6
10
|
from .config import Config
|
|
7
11
|
from rich.spinner import Spinner
|
|
8
12
|
|
|
13
|
+
# SSH validation result is cached locally for 24h. New keys pushed to GitHub still take effect
|
|
14
|
+
# at reservation time (pods fetch live keys via init container) — caching only skips the
|
|
15
|
+
# pre-flight "are you who you say you are" check.
|
|
16
|
+
_SSH_CACHE_TTL_SECONDS = 24 * 60 * 60
|
|
17
|
+
_SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cache.json"))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _load_ssh_cache(github_user: str) -> Optional[Dict[str, Any]]:
|
|
21
|
+
"""Return cached validation if it's fresh and matches the configured github_user, else None."""
|
|
22
|
+
try:
|
|
23
|
+
if not _SSH_CACHE_PATH.exists():
|
|
24
|
+
return None
|
|
25
|
+
with open(_SSH_CACHE_PATH) as f:
|
|
26
|
+
data = json.load(f)
|
|
27
|
+
if data.get("configured_user") != github_user:
|
|
28
|
+
return None
|
|
29
|
+
if time.time() - float(data.get("ts", 0)) > _SSH_CACHE_TTL_SECONDS:
|
|
30
|
+
return None
|
|
31
|
+
return data.get("result")
|
|
32
|
+
except Exception:
|
|
33
|
+
return None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _save_ssh_cache(github_user: str, result: Dict[str, Any]) -> None:
|
|
37
|
+
"""Persist a successful validation result. Failures are not cached (so they can recover)."""
|
|
38
|
+
if not result.get("valid"):
|
|
39
|
+
return
|
|
40
|
+
try:
|
|
41
|
+
_SSH_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
|
|
42
|
+
with open(_SSH_CACHE_PATH, "w") as f:
|
|
43
|
+
json.dump({
|
|
44
|
+
"configured_user": github_user,
|
|
45
|
+
"ts": int(time.time()),
|
|
46
|
+
"result": result,
|
|
47
|
+
}, f)
|
|
48
|
+
except Exception:
|
|
49
|
+
pass
|
|
50
|
+
|
|
9
51
|
|
|
10
52
|
def authenticate_user(config: Config) -> Dict[str, Any]:
|
|
11
53
|
"""Authenticate using AWS credentials - if you can call AWS, you're authorized"""
|
|
@@ -59,6 +101,13 @@ def validate_ssh_key_matches_github_user(config: Config, live=None) -> Dict[str,
|
|
|
59
101
|
"error": "GitHub username not configured. Run: gpu-dev config set github_user <username>",
|
|
60
102
|
}
|
|
61
103
|
|
|
104
|
+
# Cache short-circuit — skip the SSH handshake (~1-3s) if we recently validated this user.
|
|
105
|
+
# Cache TTL is 24h. New keys pushed to GitHub still take effect at reservation time
|
|
106
|
+
# (pods fetch live keys via init container), so caching the pre-flight check is safe.
|
|
107
|
+
cached = _load_ssh_cache(github_user)
|
|
108
|
+
if cached is not None:
|
|
109
|
+
return cached
|
|
110
|
+
|
|
62
111
|
# Run ssh git@github.com with interactive host verification support
|
|
63
112
|
ssh_output = None
|
|
64
113
|
|
|
@@ -139,7 +188,7 @@ def validate_ssh_key_matches_github_user(config: Config, live=None) -> Dict[str,
|
|
|
139
188
|
# Compare usernames (case-insensitive)
|
|
140
189
|
is_valid = ssh_detected_user.lower() == github_user.lower()
|
|
141
190
|
|
|
142
|
-
|
|
191
|
+
result = {
|
|
143
192
|
"valid": is_valid,
|
|
144
193
|
"configured_user": github_user,
|
|
145
194
|
"ssh_user": ssh_detected_user,
|
|
@@ -147,6 +196,8 @@ def validate_ssh_key_matches_github_user(config: Config, live=None) -> Dict[str,
|
|
|
147
196
|
if is_valid
|
|
148
197
|
else f"SSH key belongs to '{ssh_detected_user}' but configured user is '{github_user}'",
|
|
149
198
|
}
|
|
199
|
+
_save_ssh_cache(github_user, result)
|
|
200
|
+
return result
|
|
150
201
|
|
|
151
202
|
except Exception as e:
|
|
152
203
|
return {
|
|
@@ -681,29 +681,55 @@ def reserve(
|
|
|
681
681
|
rprint(
|
|
682
682
|
"[dim]Use --no-interactive flag to disable interactive mode[/dim]\n")
|
|
683
683
|
|
|
684
|
-
#
|
|
684
|
+
# Run auth + SSH validation + availability fetch in parallel — they're independent
|
|
685
|
+
# and total wall-clock time drops from sum to max(each).
|
|
686
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
687
|
+
config = load_config()
|
|
688
|
+
|
|
685
689
|
with Live(
|
|
686
|
-
Spinner("dots", text="
|
|
690
|
+
Spinner("dots", text="🚀 Loading…"), console=console
|
|
687
691
|
) as live:
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
692
|
+
with ThreadPoolExecutor(max_workers=3) as ex:
|
|
693
|
+
f_auth = ex.submit(authenticate_user, config)
|
|
694
|
+
# SSH validation may invoke `ssh git@github.com` interactively for password-protected keys;
|
|
695
|
+
# do it on the main thread when the cache is cold so prompts work. Probe cache first.
|
|
696
|
+
from .auth import _load_ssh_cache, validate_ssh_key_matches_github_user
|
|
697
|
+
cached_ssh = _load_ssh_cache(config.get_github_username() or "")
|
|
698
|
+
if cached_ssh is not None:
|
|
699
|
+
f_ssh = None
|
|
700
|
+
ssh_result = cached_ssh
|
|
701
|
+
else:
|
|
702
|
+
f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
|
|
703
|
+
ssh_result = None
|
|
704
|
+
f_avail = ex.submit(
|
|
705
|
+
lambda: ReservationManager(config).get_gpu_availability_by_type()
|
|
706
|
+
)
|
|
700
707
|
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
708
|
+
# Surface auth failure first (most actionable).
|
|
709
|
+
try:
|
|
710
|
+
user_info = f_auth.result()
|
|
711
|
+
except RuntimeError as e:
|
|
712
|
+
live.stop()
|
|
713
|
+
rprint(f"[red]❌ {str(e)}[/red]")
|
|
714
|
+
return
|
|
705
715
|
|
|
706
|
-
|
|
716
|
+
if ssh_result is None:
|
|
717
|
+
ssh_result = f_ssh.result()
|
|
718
|
+
availability_info = f_avail.result()
|
|
719
|
+
|
|
720
|
+
# Surface SSH validation failure with the same UX as before.
|
|
721
|
+
if not ssh_result.get("valid"):
|
|
722
|
+
rprint("[red]❌ Github SSH key validation failed[/red]")
|
|
723
|
+
if ssh_result.get("ssh_user") and ssh_result.get("configured_user"):
|
|
724
|
+
rprint("\n[yellow]💡 Fix by updating your config:[/yellow]")
|
|
725
|
+
rprint(f" [cyan]gpu-dev config set github_user {ssh_result['ssh_user']}[/cyan]")
|
|
726
|
+
elif not ssh_result.get("configured_user"):
|
|
727
|
+
rprint("\n[yellow]💡 Fix by configuring your GitHub username:[/yellow]")
|
|
728
|
+
rprint(" [cyan]gpu-dev config set github_user <your-github-username>[/cyan]")
|
|
729
|
+
else:
|
|
730
|
+
rprint("\n[yellow]💡 gpu-dev utilizes Github keys for auth![/yellow]")
|
|
731
|
+
rprint("[yellow]💡 Check https://fburl.com/gh-ssh for info on how to add your ssh key to Github[/yellow]")
|
|
732
|
+
return
|
|
707
733
|
|
|
708
734
|
if not availability_info:
|
|
709
735
|
rprint("[red]❌ Could not get GPU availability information[/red]")
|
|
@@ -1032,6 +1032,11 @@ class ReservationManager:
|
|
|
1032
1032
|
queue_length = self._get_queue_length_for_gpu_type(gpu_type)
|
|
1033
1033
|
estimated_wait = queue_length * 15 if queue_length > 0 else 0
|
|
1034
1034
|
|
|
1035
|
+
# size_etas is a DDB Map of {size_str: epoch_seconds (Decimal)} — pass through
|
|
1036
|
+
# so the interactive count menu can render "[available in 1h24m]" labels.
|
|
1037
|
+
raw_etas = item.get("size_etas", {}) or {}
|
|
1038
|
+
size_etas = {str(k): int(v) for k, v in raw_etas.items()} if raw_etas else {}
|
|
1039
|
+
|
|
1035
1040
|
availability_info[gpu_type] = {
|
|
1036
1041
|
"available": int(item.get("available_gpus", 0)),
|
|
1037
1042
|
"total": int(item.get("total_gpus", 0)),
|
|
@@ -1045,6 +1050,7 @@ class ReservationManager:
|
|
|
1045
1050
|
"last_updated": item.get("last_updated_timestamp", 0),
|
|
1046
1051
|
"maintenance": bool(item.get("maintenance", False)),
|
|
1047
1052
|
"maintenance_reason": item.get("maintenance_reason", ""),
|
|
1053
|
+
"size_etas": size_etas,
|
|
1048
1054
|
}
|
|
1049
1055
|
|
|
1050
1056
|
return availability_info
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.7"
|
|
7
|
+
version = "0.5.9"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -26,8 +26,15 @@ resource "aws_lambda_function" "availability_updater" {
|
|
|
26
26
|
role = aws_iam_role.availability_updater_role.arn
|
|
27
27
|
handler = "index.handler"
|
|
28
28
|
runtime = "python3.11"
|
|
29
|
-
timeout
|
|
30
|
-
|
|
29
|
+
timeout = 300
|
|
30
|
+
# 1769 MB is the sweet spot — Lambda allocates one full vCPU at this threshold.
|
|
31
|
+
# Beyond 1769 MB you get fractional second vCPUs (less linear gain), and our work is single-threaded.
|
|
32
|
+
memory_size = 1769
|
|
33
|
+
# Cap concurrent invocations at 1: each run does ~30 EKS API calls per gpu_type, and
|
|
34
|
+
# uncapped concurrency was hammering the cluster API into throttling, leaving later
|
|
35
|
+
# gpu_types in each run timing out and never producing size_etas.
|
|
36
|
+
reserved_concurrent_executions = 1
|
|
37
|
+
source_code_hash = null_resource.availability_updater_build.triggers.code_hash
|
|
31
38
|
|
|
32
39
|
environment {
|
|
33
40
|
variables = {
|
{gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
@@ -24,6 +24,45 @@ RESERVATIONS_TABLE = os.environ.get("RESERVATIONS_TABLE", "pytorch-gpu-dev-reser
|
|
|
24
24
|
SUPPORTED_GPU_TYPES = json.loads(os.environ["SUPPORTED_GPU_TYPES"])
|
|
25
25
|
|
|
26
26
|
|
|
27
|
+
def _parse_expires_at(value):
|
|
28
|
+
"""Parse the reservations table's `expires_at` field to a unix epoch (int).
|
|
29
|
+
|
|
30
|
+
DDB stores it as either an ISO-8601 datetime string ("2026-05-02T00:12:03.674845") OR
|
|
31
|
+
occasionally a numeric epoch — handle both. Returns None if unparseable.
|
|
32
|
+
"""
|
|
33
|
+
if value is None:
|
|
34
|
+
return None
|
|
35
|
+
# Numeric (Decimal/int/float) → epoch seconds directly.
|
|
36
|
+
if not isinstance(value, str):
|
|
37
|
+
try:
|
|
38
|
+
return int(float(value))
|
|
39
|
+
except (ValueError, TypeError):
|
|
40
|
+
return None
|
|
41
|
+
s = value.strip()
|
|
42
|
+
if not s:
|
|
43
|
+
return None
|
|
44
|
+
# ISO-8601 first (the actual production format).
|
|
45
|
+
try:
|
|
46
|
+
from datetime import datetime, timezone
|
|
47
|
+
# `fromisoformat` accepts microseconds; tolerate optional 'Z' suffix.
|
|
48
|
+
if s.endswith("Z"):
|
|
49
|
+
s2 = s[:-1] + "+00:00"
|
|
50
|
+
else:
|
|
51
|
+
s2 = s
|
|
52
|
+
dt = datetime.fromisoformat(s2)
|
|
53
|
+
if dt.tzinfo is None:
|
|
54
|
+
# Convention in this codebase: timestamps written via datetime.utcnow().isoformat() are UTC.
|
|
55
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
56
|
+
return int(dt.timestamp())
|
|
57
|
+
except (ValueError, TypeError):
|
|
58
|
+
pass
|
|
59
|
+
# Numeric-as-string fallback.
|
|
60
|
+
try:
|
|
61
|
+
return int(float(s))
|
|
62
|
+
except (ValueError, TypeError):
|
|
63
|
+
return None
|
|
64
|
+
|
|
65
|
+
|
|
27
66
|
def get_gpu_resource_name(gpu_type: str) -> str:
|
|
28
67
|
return SUPPORTED_GPU_TYPES.get(gpu_type, {}).get("k8s_resource", "nvidia.com/gpu")
|
|
29
68
|
|
|
@@ -542,9 +581,8 @@ def compute_size_etas(v1, gpu_type, node_label_value, resource_name, gpus_per_in
|
|
|
542
581
|
continue
|
|
543
582
|
if pod_name not in pod_to_info:
|
|
544
583
|
continue
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
except (ValueError, TypeError):
|
|
584
|
+
ts = _parse_expires_at(expires_at)
|
|
585
|
+
if ts is None:
|
|
548
586
|
continue
|
|
549
587
|
node_name, gpus = pod_to_info[pod_name]
|
|
550
588
|
node_state[node_name]["expirations"].append((ts, gpus))
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.9"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.5"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|