gpu-dev 0.5.3__tar.gz → 0.5.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/PKG-INFO +1 -1
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +46 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/pyproject.toml +1 -1
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/availability.tf +12 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/availability_updater/index.py +199 -4
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda.tf +2 -2
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/.gitignore +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/CLAUDE.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/PROGRESS.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/TODO.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/admin/README.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/post.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/setup.cfg +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -150,6 +150,17 @@ def select_gpu_type_interactive(
|
|
|
150
150
|
return None
|
|
151
151
|
|
|
152
152
|
|
|
153
|
+
def _format_eta_seconds(delta_seconds: int) -> str:
|
|
154
|
+
"""Format a positive seconds delta as e.g. '12min', '1h24min', '<1min'."""
|
|
155
|
+
if delta_seconds < 60:
|
|
156
|
+
return "<1min"
|
|
157
|
+
if delta_seconds < 3600:
|
|
158
|
+
return f"{delta_seconds // 60}min"
|
|
159
|
+
h = delta_seconds // 3600
|
|
160
|
+
m = (delta_seconds % 3600) // 60
|
|
161
|
+
return f"{h}h" if m == 0 else f"{h}h{m}min"
|
|
162
|
+
|
|
163
|
+
|
|
153
164
|
def select_gpu_count_interactive(
|
|
154
165
|
gpu_type: str,
|
|
155
166
|
max_gpus: int,
|
|
@@ -189,6 +200,15 @@ def select_gpu_count_interactive(
|
|
|
189
200
|
# Add multinode options
|
|
190
201
|
multinode_counts = [16, 24, 32, 40, 48] # multiples of 8
|
|
191
202
|
|
|
203
|
+
# Pull live availability for the parent SKU once — used to annotate every option.
|
|
204
|
+
import time as _time
|
|
205
|
+
parent_info = (availability_info or {}).get(gpu_type, {}) if availability_info else {}
|
|
206
|
+
parent_max_reservable = int(parent_info.get("max_reservable", 0))
|
|
207
|
+
parent_full_nodes = int(parent_info.get("full_nodes_available", 0))
|
|
208
|
+
parent_available = int(parent_info.get("available", 0))
|
|
209
|
+
parent_size_etas = parent_info.get("size_etas", {}) or {}
|
|
210
|
+
_now_ts = int(_time.time())
|
|
211
|
+
|
|
192
212
|
# MIG slice submenu: only for h100. Each tuple is (target_gpu_type, gpu_count, gb_label).
|
|
193
213
|
mig_options = []
|
|
194
214
|
if gpu_type == "h100":
|
|
@@ -237,6 +257,19 @@ def select_gpu_count_interactive(
|
|
|
237
257
|
label = f"1 GPU (single node)"
|
|
238
258
|
else:
|
|
239
259
|
label = f"{count} GPUs (single node)"
|
|
260
|
+
if parent_info:
|
|
261
|
+
if parent_max_reservable >= count:
|
|
262
|
+
label += f" [{parent_available} free]"
|
|
263
|
+
else:
|
|
264
|
+
eta_ts = parent_size_etas.get(str(count))
|
|
265
|
+
try:
|
|
266
|
+
eta_int = int(eta_ts) if eta_ts is not None else None
|
|
267
|
+
except (TypeError, ValueError):
|
|
268
|
+
eta_int = None
|
|
269
|
+
if eta_int is not None and eta_int > _now_ts:
|
|
270
|
+
label += f" [available in {_format_eta_seconds(eta_int - _now_ts)}]"
|
|
271
|
+
else:
|
|
272
|
+
label += " [unavailable now]"
|
|
240
273
|
choices.append(questionary.Choice(title=label, value=count))
|
|
241
274
|
|
|
242
275
|
# Multinode at the bottom.
|
|
@@ -246,6 +279,19 @@ def select_gpu_count_interactive(
|
|
|
246
279
|
for count in multinode_counts:
|
|
247
280
|
nodes = count // max_gpus
|
|
248
281
|
label = f"{count} GPUs ({nodes} nodes × {max_gpus} GPUs)"
|
|
282
|
+
if parent_info:
|
|
283
|
+
if parent_max_reservable >= count:
|
|
284
|
+
label += f" [{parent_full_nodes} full nodes free]"
|
|
285
|
+
else:
|
|
286
|
+
eta_ts = parent_size_etas.get(str(count))
|
|
287
|
+
try:
|
|
288
|
+
eta_int = int(eta_ts) if eta_ts is not None else None
|
|
289
|
+
except (TypeError, ValueError):
|
|
290
|
+
eta_int = None
|
|
291
|
+
if eta_int is not None and eta_int > _now_ts:
|
|
292
|
+
label += f" [available in {_format_eta_seconds(eta_int - _now_ts)}]"
|
|
293
|
+
else:
|
|
294
|
+
label += " [unavailable now]"
|
|
249
295
|
choices.append(questionary.Choice(title=label, value=count))
|
|
250
296
|
|
|
251
297
|
try:
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.3"
|
|
7
|
+
version = "0.5.5"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -32,6 +32,7 @@ resource "aws_lambda_function" "availability_updater" {
|
|
|
32
32
|
environment {
|
|
33
33
|
variables = {
|
|
34
34
|
AVAILABILITY_TABLE = aws_dynamodb_table.gpu_availability.name
|
|
35
|
+
RESERVATIONS_TABLE = aws_dynamodb_table.gpu_reservations.name
|
|
35
36
|
# Filter out nsight variants - they're counted under base types (h200/b200) via GpuType label mapping
|
|
36
37
|
SUPPORTED_GPU_TYPES = jsonencode({
|
|
37
38
|
for k, v in local.current_config.supported_gpu_types : k => v
|
|
@@ -103,6 +104,17 @@ resource "aws_iam_role_policy" "availability_updater_policy" {
|
|
|
103
104
|
]
|
|
104
105
|
Resource = aws_dynamodb_table.gpu_availability.arn
|
|
105
106
|
},
|
|
107
|
+
{
|
|
108
|
+
Effect = "Allow"
|
|
109
|
+
Action = [
|
|
110
|
+
"dynamodb:Scan",
|
|
111
|
+
"dynamodb:Query"
|
|
112
|
+
]
|
|
113
|
+
Resource = [
|
|
114
|
+
aws_dynamodb_table.gpu_reservations.arn,
|
|
115
|
+
"${aws_dynamodb_table.gpu_reservations.arn}/index/*"
|
|
116
|
+
]
|
|
117
|
+
},
|
|
106
118
|
{
|
|
107
119
|
Effect = "Allow"
|
|
108
120
|
Action = [
|
{gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
@@ -20,6 +20,7 @@ autoscaling = boto3.client("autoscaling")
|
|
|
20
20
|
|
|
21
21
|
# Environment variables
|
|
22
22
|
AVAILABILITY_TABLE = os.environ["AVAILABILITY_TABLE"]
|
|
23
|
+
RESERVATIONS_TABLE = os.environ.get("RESERVATIONS_TABLE", "pytorch-gpu-dev-reservations")
|
|
23
24
|
SUPPORTED_GPU_TYPES = json.loads(os.environ["SUPPORTED_GPU_TYPES"])
|
|
24
25
|
|
|
25
26
|
|
|
@@ -55,12 +56,20 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
|
|
|
55
56
|
logger.error(f"Failed to setup Kubernetes client: {k8s_setup_error}")
|
|
56
57
|
k8s_client = None
|
|
57
58
|
|
|
59
|
+
# Cache active reservations once for the whole invocation (used for per-size ETAs)
|
|
60
|
+
try:
|
|
61
|
+
active_reservations = scan_active_reservations()
|
|
62
|
+
logger.info(f"Cached {len(active_reservations)} active reservations for ETA computation")
|
|
63
|
+
except Exception as scan_err:
|
|
64
|
+
logger.warning(f"Failed to scan reservations table for ETAs: {scan_err}")
|
|
65
|
+
active_reservations = []
|
|
66
|
+
|
|
58
67
|
# Update availability for ALL GPU types (use any ASG event as trigger to refresh all)
|
|
59
68
|
updated_types = []
|
|
60
69
|
for gpu_type in SUPPORTED_GPU_TYPES.keys():
|
|
61
70
|
try:
|
|
62
71
|
logger.info(f"=== Starting update for GPU type: {gpu_type} ===")
|
|
63
|
-
update_gpu_availability(gpu_type, k8s_client)
|
|
72
|
+
update_gpu_availability(gpu_type, k8s_client, active_reservations=active_reservations)
|
|
64
73
|
updated_types.append(gpu_type)
|
|
65
74
|
logger.info(f"=== Successfully updated availability for GPU type: {gpu_type} ===")
|
|
66
75
|
except Exception as gpu_error:
|
|
@@ -85,8 +94,8 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
|
|
|
85
94
|
raise
|
|
86
95
|
|
|
87
96
|
|
|
88
|
-
def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
89
|
-
"""Update availability information for a specific GPU type"""
|
|
97
|
+
def update_gpu_availability(gpu_type: str, k8s_client=None, active_reservations=None) -> None:
|
|
98
|
+
"""Update availability information for a specific GPU type."""
|
|
90
99
|
try:
|
|
91
100
|
logger.info(f"Starting availability update for GPU type: {gpu_type}")
|
|
92
101
|
|
|
@@ -246,6 +255,25 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
246
255
|
full_nodes_available = available_gpus # Each "GPU" represents one CPU node slot
|
|
247
256
|
max_reservable = 1 if available_gpus > 0 else 0 # Max 1 CPU node per reservation
|
|
248
257
|
|
|
258
|
+
# Compute per-size ETAs (when each interesting reservation size first becomes reservable).
|
|
259
|
+
size_etas: Dict[str, int] = {}
|
|
260
|
+
if k8s_client is not None and not is_cpu_type and active_reservations is not None:
|
|
261
|
+
try:
|
|
262
|
+
from kubernetes import client as k8s_lib
|
|
263
|
+
v1 = k8s_lib.CoreV1Api(k8s_client)
|
|
264
|
+
size_etas = compute_size_etas(
|
|
265
|
+
v1=v1,
|
|
266
|
+
gpu_type=gpu_type,
|
|
267
|
+
node_label_value=get_node_label_value(gpu_type),
|
|
268
|
+
resource_name=get_gpu_resource_name(gpu_type),
|
|
269
|
+
gpus_per_instance=int(gpus_per_instance),
|
|
270
|
+
active_reservations=active_reservations,
|
|
271
|
+
)
|
|
272
|
+
logger.info(f"Computed size_etas for {gpu_type}: {size_etas}")
|
|
273
|
+
except Exception as eta_err:
|
|
274
|
+
logger.warning(f"Failed to compute size_etas for {gpu_type}: {eta_err}")
|
|
275
|
+
size_etas = {}
|
|
276
|
+
|
|
249
277
|
# Update DynamoDB table (update_item preserves maintenance fields set manually)
|
|
250
278
|
table = dynamodb.Table(AVAILABILITY_TABLE)
|
|
251
279
|
last_updated = context.aws_request_id if "context" in locals() else "unknown"
|
|
@@ -256,7 +284,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
256
284
|
UpdateExpression=(
|
|
257
285
|
"SET total_gpus = :tg, available_gpus = :ag, max_reservable = :mr, "
|
|
258
286
|
"full_nodes_available = :fn, running_instances = :ri, desired_capacity = :dc, "
|
|
259
|
-
"gpus_per_instance = :gpi, last_updated = :lu, last_updated_timestamp = :lut"
|
|
287
|
+
"gpus_per_instance = :gpi, last_updated = :lu, last_updated_timestamp = :lut, "
|
|
288
|
+
"size_etas = :se"
|
|
260
289
|
),
|
|
261
290
|
ExpressionAttributeValues={
|
|
262
291
|
":tg": total_gpus,
|
|
@@ -268,6 +297,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
268
297
|
":gpi": gpus_per_instance,
|
|
269
298
|
":lu": last_updated,
|
|
270
299
|
":lut": last_updated_ts,
|
|
300
|
+
":se": size_etas,
|
|
271
301
|
},
|
|
272
302
|
)
|
|
273
303
|
|
|
@@ -394,3 +424,168 @@ def get_available_gpus_on_node(v1_api, node, gpu_type: str = None) -> int:
|
|
|
394
424
|
f"Error getting available GPUs on node {node.metadata.name}: {str(e)}"
|
|
395
425
|
)
|
|
396
426
|
return 0
|
|
427
|
+
|
|
428
|
+
def scan_active_reservations():
    """Collect every row of the reservations table whose ``status`` is ``"active"``.

    The table named by ``RESERVATIONS_TABLE`` is scanned page by page; each
    response's ``LastEvaluatedKey`` is fed back as ``ExclusiveStartKey`` until
    no further page is reported.

    Returns:
        A list of the raw item dicts exactly as returned by the scan. Callers
        must tolerate ``Decimal`` values and missing fields.
    """
    reservations_table = dynamodb.Table(RESERVATIONS_TABLE)

    # The filter is identical for every page; only the start key varies.
    base_args = {
        "FilterExpression": "#s = :s",
        "ExpressionAttributeNames": {"#s": "status"},
        "ExpressionAttributeValues": {":s": "active"},
    }

    collected = []
    start_key = None
    while True:
        scan_args = dict(base_args)
        if start_key:
            scan_args["ExclusiveStartKey"] = start_key
        page = reservations_table.scan(**scan_args)
        collected.extend(page.get("Items", []))
        start_key = page.get("LastEvaluatedKey")
        if not start_key:
            return collected
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
# Multinode-eligible types (mirrors the older multinode_gpu_types list elsewhere in this file).
|
|
454
|
+
_MULTINODE_TYPES = {"h100", "h200", "b200", "a100"}
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def compute_size_etas(v1, gpu_type, node_label_value, resource_name, gpus_per_instance, active_reservations):
    """For each interesting reservation size, compute when it first becomes reservable.

    Occupancy is taken from the GPU requests of the Running/Pending pods in
    the ``gpu-dev`` namespace that are bound to ready, schedulable nodes
    labelled ``GpuType=<node_label_value>``. Expiry times come from the
    active reservation rows whose ``pod_name`` matches one of those pods.
    GPUs held by a pod with no known expiry are treated as occupied
    indefinitely, so they never contribute to a projected ETA.

    Args:
        v1: Kubernetes CoreV1Api-style client exposing ``list_node`` and
            ``list_namespaced_pod``.
        gpu_type: GPU type key; compared case-insensitively against each
            reservation row's ``gpu_type``.
        node_label_value: Value of the ``GpuType`` node label to select.
        resource_name: Kubernetes resource name read from node allocatable
            and container requests.
        gpus_per_instance: GPUs per node; caps the single-node sizes and
            gates the multinode sizes.
        active_reservations: Iterable of reservation dicts. This function
            reads the ``gpu_type``, ``pod_name`` and ``expires_at`` keys.

    Returns:
        A dict mapping the size (as a string) to a unix timestamp (int).
        A timestamp <= now means the size is currently available; sizes for
        which no time can be projected are omitted. An empty dict is
        returned when listing nodes or pods fails, or when no usable node
        is found.
    """
    import time as _time

    now = int(_time.time())

    # 1) Get nodes and per-node capacity for this resource.
    try:
        nodes = v1.list_node(label_selector=f"GpuType={node_label_value}")
    except Exception as e:
        logger.warning(f"compute_size_etas: list_node failed: {e}")
        return {}

    node_state = {}  # node_name -> {capacity, used_now, expirations: [(ts, gpus)]}
    for node in nodes.items:
        if not is_node_ready_and_schedulable(node):
            continue
        try:
            capacity = int((node.status.allocatable or {}).get(resource_name, "0"))
        except (ValueError, TypeError):
            capacity = 0
        if capacity == 0:
            continue
        node_state[node.metadata.name] = {
            "capacity": capacity,
            "used_now": 0,
            "expirations": [],
        }

    if not node_state:
        return {}

    # 2) Map pods on these nodes to their gpu request and node, and charge
    #    every such pod against its node's current usage. Counting usage from
    #    the pods themselves (not only from pods with a matching reservation
    #    row) keeps GPUs held by unmatched pods from being reported as free.
    pod_to_info = {}  # pod_name -> (node_name, gpus_requested)
    try:
        pods = v1.list_namespaced_pod("gpu-dev")
    except Exception as e:
        logger.warning(f"compute_size_etas: list_pod failed: {e}")
        return {}
    for pod in pods.items:
        if not pod.spec or not pod.spec.node_name:
            continue
        if pod.spec.node_name not in node_state:
            continue
        if pod.status and pod.status.phase not in ("Running", "Pending"):
            continue
        gpus = 0
        for c in pod.spec.containers or []:
            if c.resources and c.resources.requests:
                try:
                    gpus += int(c.resources.requests.get(resource_name, "0"))
                except (ValueError, TypeError):
                    # Unparseable quantity: treat this container as requesting none.
                    pass
        if gpus > 0:
            pod_to_info[pod.metadata.name] = (pod.spec.node_name, gpus)
            node_state[pod.spec.node_name]["used_now"] += gpus

    # 3) Cross-reference active reservations to find each pod's expiry.
    #    One expiry per pod: if several rows name the same pod, keep the
    #    latest so its GPUs are never released twice in the projection.
    target_gpu_type_lower = gpu_type.lower()
    pod_expiry = {}  # pod_name -> unix timestamp
    for r in active_reservations:
        # Reservations table stores gpu_type uppercased ("H100"); compare case-insensitively.
        rgt = r.get("gpu_type", "")
        if isinstance(rgt, str) and rgt.lower() != target_gpu_type_lower:
            continue
        pod_name = r.get("pod_name")
        expires_at = r.get("expires_at")
        if not pod_name or expires_at is None:
            continue
        if pod_name not in pod_to_info:
            continue
        try:
            ts = int(float(expires_at))
        except (ValueError, TypeError):
            continue
        previous = pod_expiry.get(pod_name)
        if previous is None or ts > previous:
            pod_expiry[pod_name] = ts

    for pod_name, ts in pod_expiry.items():
        node_name, gpus = pod_to_info[pod_name]
        node_state[node_name]["expirations"].append((ts, gpus))

    # Sort each node's expirations by time.
    for ns in node_state.values():
        ns["expirations"].sort()

    def first_time_size_fits_single_node(size):
        """Earliest timestamp at which any single node has `size` GPUs free."""
        earliest = None
        for ns in node_state.values():
            free_now = ns["capacity"] - ns["used_now"]
            if free_now >= size:
                return now
            cum = free_now
            for ts, gpus in ns["expirations"]:
                cum += gpus
                if cum >= size:
                    if earliest is None or ts < earliest:
                        earliest = ts
                    break
        return earliest

    def first_time_k_full_nodes(k):
        """Earliest timestamp at which K nodes are simultaneously fully free."""
        free_at = []
        for ns in node_state.values():
            if ns["used_now"] == 0:
                free_at.append(now)
                continue
            expiring = sum(gpus for _, gpus in ns["expirations"])
            # The node only becomes fully free if every occupied GPU has a
            # known expiry; otherwise some pod holds GPUs indefinitely.
            if ns["expirations"] and expiring >= ns["used_now"]:
                free_at.append(max(ts for ts, _ in ns["expirations"]))
        free_at.sort()
        if len(free_at) >= k:
            return free_at[k - 1]
        return None

    etas = {}
    # Single-node sizes 1, 2, 4, 8 (capped at the per-instance maximum).
    for size in (1, 2, 4, 8):
        if size > gpus_per_instance:
            break
        eta = first_time_size_fits_single_node(size)
        if eta is not None:
            etas[str(size)] = eta

    # Multinode sizes — only for SXM types with 8 GPUs per node.
    if gpus_per_instance == 8 and target_gpu_type_lower in _MULTINODE_TYPES:
        for k_nodes in (2, 3, 4, 5, 6):
            count = k_nodes * gpus_per_instance
            eta = first_time_k_full_nodes(k_nodes)
            if eta is not None:
                etas[str(count)] = eta

    return etas
|
|
591
|
+
|
|
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
184
|
-
MIN_CLI_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.5"
|
|
184
|
+
MIN_CLI_VERSION = "0.5.5"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
187
|
}, local.alb_env_vars)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.3 → gpu_dev-0.5.5}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|