gpu-dev 0.5.13__tar.gz → 0.5.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/PKG-INFO +1 -1
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +69 -20
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +8 -9
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +2 -2
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/pyproject.toml +1 -1
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/mig-config.tf +23 -6
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/.gitignore +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/CLAUDE.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/PROGRESS.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/TODO.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/admin/README.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/post.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/setup.cfg +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -16,6 +16,53 @@ from rich.spinner import Spinner
|
|
|
16
16
|
_SSH_CACHE_TTL_SECONDS = 24 * 60 * 60
|
|
17
17
|
_SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cache.json"))
|
|
18
18
|
|
|
19
|
+
# Cache for authenticate_user. STS GetCallerIdentity is stable per AWS profile and slow under SSO
|
|
20
|
+
# (~500ms-1.5s). Cache for 24h keyed by AWS_PROFILE; if creds rotate the user_id rarely changes,
|
|
21
|
+
# and the next AWS call (DDB/SQS) will surface a credential error if it does.
|
|
22
|
+
_AUTH_CACHE_TTL_SECONDS = 24 * 60 * 60
|
|
23
|
+
_AUTH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/auth-cache.json"))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _auth_cache_key() -> str:
|
|
27
|
+
return os.environ.get("AWS_PROFILE", "default")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _load_auth_cache(github_user: str) -> Optional[Dict[str, Any]]:
|
|
31
|
+
try:
|
|
32
|
+
if not _AUTH_CACHE_PATH.exists():
|
|
33
|
+
return None
|
|
34
|
+
with open(_AUTH_CACHE_PATH) as f:
|
|
35
|
+
data = json.load(f)
|
|
36
|
+
entry = data.get(_auth_cache_key())
|
|
37
|
+
if not entry or entry.get("github_user") != github_user:
|
|
38
|
+
return None
|
|
39
|
+
if time.time() - float(entry.get("ts", 0)) > _AUTH_CACHE_TTL_SECONDS:
|
|
40
|
+
return None
|
|
41
|
+
return entry.get("result")
|
|
42
|
+
except Exception:
|
|
43
|
+
return None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _save_auth_cache(github_user: str, result: Dict[str, Any]) -> None:
|
|
47
|
+
try:
|
|
48
|
+
_AUTH_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
|
|
49
|
+
data = {}
|
|
50
|
+
if _AUTH_CACHE_PATH.exists():
|
|
51
|
+
try:
|
|
52
|
+
with open(_AUTH_CACHE_PATH) as f:
|
|
53
|
+
data = json.load(f)
|
|
54
|
+
except Exception:
|
|
55
|
+
data = {}
|
|
56
|
+
data[_auth_cache_key()] = {
|
|
57
|
+
"github_user": github_user,
|
|
58
|
+
"ts": int(time.time()),
|
|
59
|
+
"result": result,
|
|
60
|
+
}
|
|
61
|
+
with open(_AUTH_CACHE_PATH, "w") as f:
|
|
62
|
+
json.dump(data, f)
|
|
63
|
+
except Exception:
|
|
64
|
+
pass
|
|
65
|
+
|
|
19
66
|
|
|
20
67
|
def _load_ssh_cache(github_user: str) -> Optional[Dict[str, Any]]:
|
|
21
68
|
"""Return cached validation if it's fresh and matches the configured github_user, else None."""
|
|
@@ -50,31 +97,33 @@ def _save_ssh_cache(github_user: str, result: Dict[str, Any]) -> None:
|
|
|
50
97
|
|
|
51
98
|
|
|
52
99
|
def authenticate_user(config: Config) -> Dict[str, Any]:
|
|
53
|
-
"""Authenticate using AWS credentials - if you can call AWS, you're authorized
|
|
54
|
-
try:
|
|
55
|
-
# Test AWS access by getting caller identity
|
|
56
|
-
identity = config.get_user_identity()
|
|
100
|
+
"""Authenticate using AWS credentials - if you can call AWS, you're authorized.
|
|
57
101
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
102
|
+
Cached for 24h per AWS profile. The previous SQS get_queue_url probe was dropped:
|
|
103
|
+
it's a redundant permission check; reserve/cancel call SQS directly and surface
|
|
104
|
+
failures themselves, while list/show/avail don't touch SQS at all.
|
|
105
|
+
"""
|
|
106
|
+
github_user = config.get_github_username()
|
|
107
|
+
if not github_user:
|
|
108
|
+
raise RuntimeError(
|
|
109
|
+
"GitHub username not configured. Please run: gpu-dev config set github_user <your-github-username>"
|
|
110
|
+
)
|
|
64
111
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
raise RuntimeError(
|
|
69
|
-
f"GitHub username not configured. Please run: gpu-dev config set github_user <your-github-username>"
|
|
70
|
-
)
|
|
112
|
+
cached = _load_auth_cache(github_user)
|
|
113
|
+
if cached is not None:
|
|
114
|
+
return cached
|
|
71
115
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
116
|
+
try:
|
|
117
|
+
identity = config.get_user_identity()
|
|
118
|
+
arn = identity["arn"]
|
|
119
|
+
user_name = arn.split("/")[-1]
|
|
120
|
+
result = {
|
|
121
|
+
"user_id": user_name,
|
|
122
|
+
"github_user": github_user,
|
|
75
123
|
"arn": arn,
|
|
76
124
|
}
|
|
77
|
-
|
|
125
|
+
_save_auth_cache(github_user, result)
|
|
126
|
+
return result
|
|
78
127
|
except Exception as e:
|
|
79
128
|
raise RuntimeError(f"AWS authentication failed: {e}")
|
|
80
129
|
|
|
@@ -688,6 +688,7 @@ def reserve(
|
|
|
688
688
|
# and total wall-clock time drops from sum to max(each).
|
|
689
689
|
from concurrent.futures import ThreadPoolExecutor
|
|
690
690
|
config = load_config()
|
|
691
|
+
reservation_mgr = ReservationManager(config)
|
|
691
692
|
|
|
692
693
|
with Live(
|
|
693
694
|
Spinner("dots", text="🚀 Loading…"), console=console
|
|
@@ -704,9 +705,7 @@ def reserve(
|
|
|
704
705
|
else:
|
|
705
706
|
f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
|
|
706
707
|
ssh_result = None
|
|
707
|
-
f_avail = ex.submit(
|
|
708
|
-
lambda: ReservationManager(config).get_gpu_availability_by_type()
|
|
709
|
-
)
|
|
708
|
+
f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
|
|
710
709
|
|
|
711
710
|
# Surface auth failure first (most actionable).
|
|
712
711
|
try:
|
|
@@ -2496,10 +2495,10 @@ def _show_availability() -> None:
|
|
|
2496
2495
|
table = Table(
|
|
2497
2496
|
title="GPU Availability by Type (numbers are GPUs, not nodes)")
|
|
2498
2497
|
table.add_column("GPU Type", style="cyan")
|
|
2499
|
-
table.add_column("
|
|
2500
|
-
table.add_column("Max
|
|
2498
|
+
table.add_column("Avail", style="green")
|
|
2499
|
+
table.add_column("Max\nReservable", style="bright_green")
|
|
2501
2500
|
table.add_column("Total", style="blue")
|
|
2502
|
-
table.add_column("Queue
|
|
2501
|
+
table.add_column("Queue\nLength", style="yellow")
|
|
2503
2502
|
table.add_column("Architecture", style="dim")
|
|
2504
2503
|
table.add_column("Est. Wait Time", style="magenta")
|
|
2505
2504
|
|
|
@@ -2657,10 +2656,10 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2657
2656
|
table = Table(
|
|
2658
2657
|
title="GPU Availability by Type (numbers are GPUs, not nodes)")
|
|
2659
2658
|
table.add_column("GPU Type", style="cyan")
|
|
2660
|
-
table.add_column("
|
|
2661
|
-
table.add_column("Max
|
|
2659
|
+
table.add_column("Avail", style="green")
|
|
2660
|
+
table.add_column("Max\nReservable", style="blue")
|
|
2662
2661
|
table.add_column("Total", style="blue")
|
|
2663
|
-
table.add_column("Queue
|
|
2662
|
+
table.add_column("Queue\nLength", style="yellow")
|
|
2664
2663
|
table.add_column("Architecture", style="dim")
|
|
2665
2664
|
table.add_column("Est. Wait Time", style="magenta")
|
|
2666
2665
|
|
|
@@ -88,9 +88,9 @@ def select_gpu_type_interactive(
|
|
|
88
88
|
console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
|
|
89
89
|
table = Table()
|
|
90
90
|
table.add_column("GPU Type", style="cyan")
|
|
91
|
-
table.add_column("
|
|
91
|
+
table.add_column("Avail", style="green")
|
|
92
92
|
table.add_column("Total", style="blue")
|
|
93
|
-
table.add_column("Queue
|
|
93
|
+
table.add_column("Queue\nLength", style="yellow")
|
|
94
94
|
table.add_column("Est. Wait Time", style="magenta")
|
|
95
95
|
|
|
96
96
|
choices = []
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.
|
|
7
|
+
version = "0.5.15"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -25,23 +25,40 @@ resource "kubernetes_config_map" "gpu_dev_mig_parted_config" {
|
|
|
25
25
|
depends_on = [helm_release.nvidia_gpu_operator]
|
|
26
26
|
}
|
|
27
27
|
|
|
28
|
-
#
|
|
29
|
-
#
|
|
30
|
-
# means "no node
|
|
28
|
+
# Declarative B200 MIG node label. Set b200_mig_node_name (per workspace via the locals lookup
|
|
29
|
+
# below, or override via tfvars / -var) to dedicate a specific B200 node to the mixed profile.
|
|
30
|
+
# Empty string means "no node labelled" — every B200 stays full.
|
|
31
|
+
#
|
|
32
|
+
# Future cleanup: when we split a B200 CR into two ASGs (one with mig_profile, one without),
|
|
33
|
+
# the user_data path will set this label at boot for any instance in the MIG-dedicated ASG —
|
|
34
|
+
# matching the H100 cr3 pattern. Until then, this declarative label pins the role to a hostname.
|
|
35
|
+
locals {
|
|
36
|
+
# Workspace-scoped defaults so the resource is a no-op in non-prod and no apply ever tries to
|
|
37
|
+
# label a node that doesn't exist.
|
|
38
|
+
default_b200_mig_node_by_workspace = {
|
|
39
|
+
prod = "ip-10-0-67-125.us-east-2.compute.internal"
|
|
40
|
+
}
|
|
41
|
+
b200_mig_node_effective = (
|
|
42
|
+
var.b200_mig_node_name != ""
|
|
43
|
+
? var.b200_mig_node_name
|
|
44
|
+
: lookup(local.default_b200_mig_node_by_workspace, terraform.workspace, "")
|
|
45
|
+
)
|
|
46
|
+
}
|
|
47
|
+
|
|
31
48
|
variable "b200_mig_node_name" {
|
|
32
|
-
description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to
|
|
49
|
+
description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to use the per-workspace default in mig-config.tf."
|
|
33
50
|
type = string
|
|
34
51
|
default = ""
|
|
35
52
|
}
|
|
36
53
|
|
|
37
54
|
resource "kubernetes_labels" "b200_mig_node" {
|
|
38
|
-
count =
|
|
55
|
+
count = local.b200_mig_node_effective == "" ? 0 : 1
|
|
39
56
|
|
|
40
57
|
api_version = "v1"
|
|
41
58
|
kind = "Node"
|
|
42
59
|
|
|
43
60
|
metadata {
|
|
44
|
-
name =
|
|
61
|
+
name = local.b200_mig_node_effective
|
|
45
62
|
}
|
|
46
63
|
|
|
47
64
|
labels = {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|