gpu-dev 0.5.4__tar.gz → 0.5.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/PKG-INFO +1 -1
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +32 -2
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/pyproject.toml +1 -1
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/availability.tf +12 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/availability_updater/index.py +199 -4
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda.tf +2 -2
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/.gitignore +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/CLAUDE.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/PROGRESS.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/TODO.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/admin/README.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/post.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/setup.cfg +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -150,6 +150,17 @@ def select_gpu_type_interactive(
|
|
|
150
150
|
return None
|
|
151
151
|
|
|
152
152
|
|
|
153
|
+
def _format_eta_seconds(delta_seconds: int) -> str:
|
|
154
|
+
"""Format a positive seconds delta as e.g. '12min', '1h24min', '<1min'."""
|
|
155
|
+
if delta_seconds < 60:
|
|
156
|
+
return "<1min"
|
|
157
|
+
if delta_seconds < 3600:
|
|
158
|
+
return f"{delta_seconds // 60}min"
|
|
159
|
+
h = delta_seconds // 3600
|
|
160
|
+
m = (delta_seconds % 3600) // 60
|
|
161
|
+
return f"{h}h" if m == 0 else f"{h}h{m}min"
|
|
162
|
+
|
|
163
|
+
|
|
153
164
|
def select_gpu_count_interactive(
|
|
154
165
|
gpu_type: str,
|
|
155
166
|
max_gpus: int,
|
|
@@ -190,10 +201,13 @@ def select_gpu_count_interactive(
|
|
|
190
201
|
multinode_counts = [16, 24, 32, 40, 48] # multiples of 8
|
|
191
202
|
|
|
192
203
|
# Pull live availability for the parent SKU once — used to annotate every option.
|
|
204
|
+
import time as _time
|
|
193
205
|
parent_info = (availability_info or {}).get(gpu_type, {}) if availability_info else {}
|
|
194
206
|
parent_max_reservable = int(parent_info.get("max_reservable", 0))
|
|
195
207
|
parent_full_nodes = int(parent_info.get("full_nodes_available", 0))
|
|
196
208
|
parent_available = int(parent_info.get("available", 0))
|
|
209
|
+
parent_size_etas = parent_info.get("size_etas", {}) or {}
|
|
210
|
+
_now_ts = int(_time.time())
|
|
197
211
|
|
|
198
212
|
# MIG slice submenu: only for h100. Each tuple is (target_gpu_type, gpu_count, gb_label).
|
|
199
213
|
mig_options = []
|
|
@@ -247,7 +261,15 @@ def select_gpu_count_interactive(
|
|
|
247
261
|
if parent_max_reservable >= count:
|
|
248
262
|
label += f" [{parent_available} free]"
|
|
249
263
|
else:
|
|
250
|
-
|
|
264
|
+
eta_ts = parent_size_etas.get(str(count))
|
|
265
|
+
try:
|
|
266
|
+
eta_int = int(eta_ts) if eta_ts is not None else None
|
|
267
|
+
except (TypeError, ValueError):
|
|
268
|
+
eta_int = None
|
|
269
|
+
if eta_int is not None and eta_int > _now_ts:
|
|
270
|
+
label += f" [available in {_format_eta_seconds(eta_int - _now_ts)}]"
|
|
271
|
+
else:
|
|
272
|
+
label += " [unavailable now]"
|
|
251
273
|
choices.append(questionary.Choice(title=label, value=count))
|
|
252
274
|
|
|
253
275
|
# Multinode at the bottom.
|
|
@@ -261,7 +283,15 @@ def select_gpu_count_interactive(
|
|
|
261
283
|
if parent_max_reservable >= count:
|
|
262
284
|
label += f" [{parent_full_nodes} full nodes free]"
|
|
263
285
|
else:
|
|
264
|
-
|
|
286
|
+
eta_ts = parent_size_etas.get(str(count))
|
|
287
|
+
try:
|
|
288
|
+
eta_int = int(eta_ts) if eta_ts is not None else None
|
|
289
|
+
except (TypeError, ValueError):
|
|
290
|
+
eta_int = None
|
|
291
|
+
if eta_int is not None and eta_int > _now_ts:
|
|
292
|
+
label += f" [available in {_format_eta_seconds(eta_int - _now_ts)}]"
|
|
293
|
+
else:
|
|
294
|
+
label += " [unavailable now]"
|
|
265
295
|
choices.append(questionary.Choice(title=label, value=count))
|
|
266
296
|
|
|
267
297
|
try:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.4"
|
|
7
|
+
version = "0.5.5"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -32,6 +32,7 @@ resource "aws_lambda_function" "availability_updater" {
|
|
|
32
32
|
environment {
|
|
33
33
|
variables = {
|
|
34
34
|
AVAILABILITY_TABLE = aws_dynamodb_table.gpu_availability.name
|
|
35
|
+
RESERVATIONS_TABLE = aws_dynamodb_table.gpu_reservations.name
|
|
35
36
|
# Filter out nsight variants - they're counted under base types (h200/b200) via GpuType label mapping
|
|
36
37
|
SUPPORTED_GPU_TYPES = jsonencode({
|
|
37
38
|
for k, v in local.current_config.supported_gpu_types : k => v
|
|
@@ -103,6 +104,17 @@ resource "aws_iam_role_policy" "availability_updater_policy" {
|
|
|
103
104
|
]
|
|
104
105
|
Resource = aws_dynamodb_table.gpu_availability.arn
|
|
105
106
|
},
|
|
107
|
+
{
|
|
108
|
+
Effect = "Allow"
|
|
109
|
+
Action = [
|
|
110
|
+
"dynamodb:Scan",
|
|
111
|
+
"dynamodb:Query"
|
|
112
|
+
]
|
|
113
|
+
Resource = [
|
|
114
|
+
aws_dynamodb_table.gpu_reservations.arn,
|
|
115
|
+
"${aws_dynamodb_table.gpu_reservations.arn}/index/*"
|
|
116
|
+
]
|
|
117
|
+
},
|
|
106
118
|
{
|
|
107
119
|
Effect = "Allow"
|
|
108
120
|
Action = [
|
{gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
@@ -20,6 +20,7 @@ autoscaling = boto3.client("autoscaling")
|
|
|
20
20
|
|
|
21
21
|
# Environment variables
|
|
22
22
|
AVAILABILITY_TABLE = os.environ["AVAILABILITY_TABLE"]
|
|
23
|
+
RESERVATIONS_TABLE = os.environ.get("RESERVATIONS_TABLE", "pytorch-gpu-dev-reservations")
|
|
23
24
|
SUPPORTED_GPU_TYPES = json.loads(os.environ["SUPPORTED_GPU_TYPES"])
|
|
24
25
|
|
|
25
26
|
|
|
@@ -55,12 +56,20 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
|
|
|
55
56
|
logger.error(f"Failed to setup Kubernetes client: {k8s_setup_error}")
|
|
56
57
|
k8s_client = None
|
|
57
58
|
|
|
59
|
+
# Cache active reservations once for the whole invocation (used for per-size ETAs)
|
|
60
|
+
try:
|
|
61
|
+
active_reservations = scan_active_reservations()
|
|
62
|
+
logger.info(f"Cached {len(active_reservations)} active reservations for ETA computation")
|
|
63
|
+
except Exception as scan_err:
|
|
64
|
+
logger.warning(f"Failed to scan reservations table for ETAs: {scan_err}")
|
|
65
|
+
active_reservations = []
|
|
66
|
+
|
|
58
67
|
# Update availability for ALL GPU types (use any ASG event as trigger to refresh all)
|
|
59
68
|
updated_types = []
|
|
60
69
|
for gpu_type in SUPPORTED_GPU_TYPES.keys():
|
|
61
70
|
try:
|
|
62
71
|
logger.info(f"=== Starting update for GPU type: {gpu_type} ===")
|
|
63
|
-
update_gpu_availability(gpu_type, k8s_client)
|
|
72
|
+
update_gpu_availability(gpu_type, k8s_client, active_reservations=active_reservations)
|
|
64
73
|
updated_types.append(gpu_type)
|
|
65
74
|
logger.info(f"=== Successfully updated availability for GPU type: {gpu_type} ===")
|
|
66
75
|
except Exception as gpu_error:
|
|
@@ -85,8 +94,8 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
|
|
|
85
94
|
raise
|
|
86
95
|
|
|
87
96
|
|
|
88
|
-
def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
89
|
-
"""Update availability information for a specific GPU type"""
|
|
97
|
+
def update_gpu_availability(gpu_type: str, k8s_client=None, active_reservations=None) -> None:
|
|
98
|
+
"""Update availability information for a specific GPU type."""
|
|
90
99
|
try:
|
|
91
100
|
logger.info(f"Starting availability update for GPU type: {gpu_type}")
|
|
92
101
|
|
|
@@ -246,6 +255,25 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
246
255
|
full_nodes_available = available_gpus # Each "GPU" represents one CPU node slot
|
|
247
256
|
max_reservable = 1 if available_gpus > 0 else 0 # Max 1 CPU node per reservation
|
|
248
257
|
|
|
258
|
+
# Compute per-size ETAs (when each interesting reservation size first becomes reservable).
|
|
259
|
+
size_etas: Dict[str, int] = {}
|
|
260
|
+
if k8s_client is not None and not is_cpu_type and active_reservations is not None:
|
|
261
|
+
try:
|
|
262
|
+
from kubernetes import client as k8s_lib
|
|
263
|
+
v1 = k8s_lib.CoreV1Api(k8s_client)
|
|
264
|
+
size_etas = compute_size_etas(
|
|
265
|
+
v1=v1,
|
|
266
|
+
gpu_type=gpu_type,
|
|
267
|
+
node_label_value=get_node_label_value(gpu_type),
|
|
268
|
+
resource_name=get_gpu_resource_name(gpu_type),
|
|
269
|
+
gpus_per_instance=int(gpus_per_instance),
|
|
270
|
+
active_reservations=active_reservations,
|
|
271
|
+
)
|
|
272
|
+
logger.info(f"Computed size_etas for {gpu_type}: {size_etas}")
|
|
273
|
+
except Exception as eta_err:
|
|
274
|
+
logger.warning(f"Failed to compute size_etas for {gpu_type}: {eta_err}")
|
|
275
|
+
size_etas = {}
|
|
276
|
+
|
|
249
277
|
# Update DynamoDB table (update_item preserves maintenance fields set manually)
|
|
250
278
|
table = dynamodb.Table(AVAILABILITY_TABLE)
|
|
251
279
|
last_updated = context.aws_request_id if "context" in locals() else "unknown"
|
|
@@ -256,7 +284,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
256
284
|
UpdateExpression=(
|
|
257
285
|
"SET total_gpus = :tg, available_gpus = :ag, max_reservable = :mr, "
|
|
258
286
|
"full_nodes_available = :fn, running_instances = :ri, desired_capacity = :dc, "
|
|
259
|
-
"gpus_per_instance = :gpi, last_updated = :lu, last_updated_timestamp = :lut"
|
|
287
|
+
"gpus_per_instance = :gpi, last_updated = :lu, last_updated_timestamp = :lut, "
|
|
288
|
+
"size_etas = :se"
|
|
260
289
|
),
|
|
261
290
|
ExpressionAttributeValues={
|
|
262
291
|
":tg": total_gpus,
|
|
@@ -268,6 +297,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
268
297
|
":gpi": gpus_per_instance,
|
|
269
298
|
":lu": last_updated,
|
|
270
299
|
":lut": last_updated_ts,
|
|
300
|
+
":se": size_etas,
|
|
271
301
|
},
|
|
272
302
|
)
|
|
273
303
|
|
|
@@ -394,3 +424,168 @@ def get_available_gpus_on_node(v1_api, node, gpu_type: str = None) -> int:
|
|
|
394
424
|
f"Error getting available GPUs on node {node.metadata.name}: {str(e)}"
|
|
395
425
|
)
|
|
396
426
|
return 0
|
|
427
|
+
|
|
428
|
+
def scan_active_reservations():
    """Fetch every reservation whose status is "active" from the reservations table.

    Runs a filtered scan over RESERVATIONS_TABLE and keeps requesting pages
    until the response no longer carries a LastEvaluatedKey. The rows are
    returned exactly as the DynamoDB resource API produced them, so callers
    must tolerate Decimal values and missing attributes.
    """
    reservations_table = dynamodb.Table(RESERVATIONS_TABLE)
    scan_args = {
        "FilterExpression": "#s = :s",
        # "status" is aliased via #s because it is a DynamoDB reserved word.
        "ExpressionAttributeNames": {"#s": "status"},
        "ExpressionAttributeValues": {":s": "active"},
    }

    rows = []
    page = reservations_table.scan(**scan_args)
    while True:
        rows.extend(page.get("Items", []))
        next_key = page.get("LastEvaluatedKey")
        if not next_key:
            # No continuation key: the scan has covered the whole table.
            return rows
        page = reservations_table.scan(ExclusiveStartKey=next_key, **scan_args)
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
# Multinode-eligible types (mirrors the older multinode_gpu_types list elsewhere in this file).
|
|
454
|
+
_MULTINODE_TYPES = {"h100", "h200", "b200", "a100"}
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def compute_size_etas(v1, gpu_type, node_label_value, resource_name, gpus_per_instance, active_reservations):
    """For each interesting reservation size, compute when it first becomes reservable.

    Args:
        v1: Kubernetes CoreV1Api-style client; only ``list_node`` and
            ``list_namespaced_pod`` are called on it.
        gpu_type: GPU type key (compared case-insensitively against the
            ``gpu_type`` field of each reservation row).
        node_label_value: value of the ``GpuType`` node label used to select nodes.
        resource_name: name of the Kubernetes resource counted as a "GPU"
            in node allocatable and container requests.
        gpus_per_instance: number of GPUs on one node of this type.
        active_reservations: iterable of reservation rows (dicts); the fields
            read are ``gpu_type``, ``pod_name`` and ``expires_at``.

    Returns:
        A dict mapping the size (as a string) to a unix timestamp (int).
        A timestamp <= now means the size is currently available; sizes that won't
        fit in any foreseeable future (e.g. cluster too small, or the GPUs are held
        by pods whose expiry is unknown) are omitted. Returns ``{}`` if listing
        nodes or pods fails, or if no usable node exists.

    GPUs requested by *every* Running/Pending pod in the ``gpu-dev`` namespace count
    as in use. Only pods that can be matched to an active reservation with a
    parsable ``expires_at`` contribute a future release time; GPUs held by any other
    pod are treated as occupied with no known release, so they never produce an ETA.
    """
    import time as _time

    now = int(_time.time())

    # 1) Get nodes and per-node capacity for this resource.
    try:
        nodes = v1.list_node(label_selector=f"GpuType={node_label_value}")
    except Exception as e:
        logger.warning(f"compute_size_etas: list_node failed: {e}")
        return {}

    node_state = {}  # node_name -> {capacity, used_now, expirations: [(ts, gpus)]}
    for node in nodes.items:
        if not is_node_ready_and_schedulable(node):
            continue
        try:
            capacity = int((node.status.allocatable or {}).get(resource_name, "0"))
        except (ValueError, TypeError):
            capacity = 0
        if capacity == 0:
            continue
        node_state[node.metadata.name] = {
            "capacity": capacity,
            "used_now": 0,
            "expirations": [],
        }

    if not node_state:
        return {}

    # 2) Map pods on these nodes to their gpu request and node, and charge every
    #    such pod against its node's current usage (whether or not a reservation
    #    row can later be matched to it).
    try:
        pods = v1.list_namespaced_pod("gpu-dev")
    except Exception as e:
        logger.warning(f"compute_size_etas: list_pod failed: {e}")
        return {}

    pod_to_info = {}  # pod_name -> (node_name, gpus_requested)
    for pod in pods.items:
        if not pod.spec or not pod.spec.node_name:
            continue
        if pod.spec.node_name not in node_state:
            continue
        if pod.status and pod.status.phase not in ("Running", "Pending"):
            continue
        gpus = 0
        for c in pod.spec.containers or []:
            if c.resources and c.resources.requests:
                try:
                    gpus += int(c.resources.requests.get(resource_name, "0"))
                except (ValueError, TypeError):
                    # Unparsable request value: treat this container as requesting none.
                    pass
        if gpus > 0:
            pod_to_info[pod.metadata.name] = (pod.spec.node_name, gpus)
            node_state[pod.spec.node_name]["used_now"] += gpus

    # 3) Cross-reference active reservations to find when each pod's GPUs are released.
    target_gpu_type_lower = gpu_type.lower()
    pod_expiry = {}  # pod_name -> expiry timestamp
    for r in active_reservations:
        # Reservations table stores gpu_type uppercased ("H100"); compare case-insensitively.
        rgt = r.get("gpu_type", "")
        if isinstance(rgt, str) and rgt.lower() != target_gpu_type_lower:
            continue
        pod_name = r.get("pod_name")
        expires_at = r.get("expires_at")
        if not pod_name or expires_at is None:
            continue
        if pod_name not in pod_to_info:
            continue
        try:
            ts = int(float(expires_at))
        except (ValueError, TypeError):
            continue
        # If several rows name the same pod, keep the latest expiry so the pod's
        # GPUs are released once, at the most conservative time.
        if pod_name not in pod_expiry or ts > pod_expiry[pod_name]:
            pod_expiry[pod_name] = ts

    for pod_name, ts in pod_expiry.items():
        node_name, gpus = pod_to_info[pod_name]
        node_state[node_name]["expirations"].append((ts, gpus))

    # Sort each node's expirations by time.
    for ns in node_state.values():
        ns["expirations"].sort()

    # NOTE(review): an expiry already in the past (pod still running) yields an ETA
    # < now even though the GPUs are not free yet — confirm how consumers treat that.

    def first_time_size_fits_single_node(size):
        """Earliest timestamp at which any single node has `size` GPUs free."""
        earliest = None
        for ns in node_state.values():
            free_now = ns["capacity"] - ns["used_now"]
            if free_now >= size:
                return now
            cum = free_now
            for ts, gpus in ns["expirations"]:
                cum += gpus
                if cum >= size:
                    if earliest is None or ts < earliest:
                        earliest = ts
                    break
        return earliest

    def first_time_k_full_nodes(k):
        """Earliest timestamp at which K nodes are simultaneously fully free."""
        free_at = []
        for ns in node_state.values():
            if ns["used_now"] == 0:
                free_at.append(now)
                continue
            releasable = sum(gpus for _, gpus in ns["expirations"])
            # A busy node only becomes fully free if every GPU in use has a known
            # expiry; otherwise some pod holds GPUs indefinitely from our view.
            if releasable >= ns["used_now"]:
                free_at.append(max(ts for ts, _ in ns["expirations"]))
        free_at.sort()
        if len(free_at) >= k:
            return free_at[k - 1]
        return None

    etas = {}
    # Single-node sizes 1, 2, 4, 8 (capped at the per-instance maximum).
    for size in (1, 2, 4, 8):
        if size > gpus_per_instance:
            break
        eta = first_time_size_fits_single_node(size)
        if eta is not None:
            etas[str(size)] = eta

    # Multinode sizes — only for SXM types with 8 GPUs per node.
    if gpus_per_instance == 8 and target_gpu_type_lower in _MULTINODE_TYPES:
        for k_nodes in (2, 3, 4, 5, 6):
            count = k_nodes * gpus_per_instance
            eta = first_time_k_full_nodes(k_nodes)
            if eta is not None:
                etas[str(count)] = eta

    return etas
|
|
591
|
+
|
|
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
184
|
-
MIN_CLI_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.5"
|
|
184
|
+
MIN_CLI_VERSION = "0.5.5"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
187
|
}, local.alb_env_vars)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.4 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|