gpu-dev 0.5.15__tar.gz → 0.5.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/PKG-INFO +1 -1
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +18 -1
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +6 -4
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/pyproject.toml +1 -1
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_processor/index.py +26 -10
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda.tf +2 -2
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/.gitignore +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/CLAUDE.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/PROGRESS.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/TODO.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/admin/README.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/post.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/setup.cfg +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/mig-config.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -19,7 +19,7 @@ _SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cach
|
|
|
19
19
|
# Cache for authenticate_user. STS GetCallerIdentity is stable per AWS profile and slow under SSO
|
|
20
20
|
# (~500ms-1.5s). Cache for 1h keyed by AWS_PROFILE; if creds rotate the user_id rarely changes,
|
|
21
21
|
# and the next AWS call (DDB/SQS) will surface a credential error if it does.
|
|
22
|
-
_AUTH_CACHE_TTL_SECONDS =
|
|
22
|
+
_AUTH_CACHE_TTL_SECONDS = 60 * 60
|
|
23
23
|
_AUTH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/auth-cache.json"))
|
|
24
24
|
|
|
25
25
|
|
|
@@ -64,6 +64,22 @@ def _save_auth_cache(github_user: str, result: Dict[str, Any]) -> None:
|
|
|
64
64
|
pass
|
|
65
65
|
|
|
66
66
|
|
|
67
|
+
def clear_auth_cache() -> None:
|
|
68
|
+
"""Drop the cached auth entry for the current AWS profile. Call this after a credential
|
|
69
|
+
error to force the next authenticate_user() to re-hit STS."""
|
|
70
|
+
try:
|
|
71
|
+
if not _AUTH_CACHE_PATH.exists():
|
|
72
|
+
return
|
|
73
|
+
with open(_AUTH_CACHE_PATH) as f:
|
|
74
|
+
data = json.load(f)
|
|
75
|
+
if _auth_cache_key() in data:
|
|
76
|
+
del data[_auth_cache_key()]
|
|
77
|
+
with open(_AUTH_CACHE_PATH, "w") as f:
|
|
78
|
+
json.dump(data, f)
|
|
79
|
+
except Exception:
|
|
80
|
+
pass
|
|
81
|
+
|
|
82
|
+
|
|
67
83
|
def _load_ssh_cache(github_user: str) -> Optional[Dict[str, Any]]:
|
|
68
84
|
"""Return cached validation if it's fresh and matches the configured github_user, else None."""
|
|
69
85
|
try:
|
|
@@ -125,6 +141,7 @@ def authenticate_user(config: Config) -> Dict[str, Any]:
|
|
|
125
141
|
_save_auth_cache(github_user, result)
|
|
126
142
|
return result
|
|
127
143
|
except Exception as e:
|
|
144
|
+
clear_auth_cache()
|
|
128
145
|
raise RuntimeError(f"AWS authentication failed: {e}")
|
|
129
146
|
|
|
130
147
|
|
|
@@ -1542,13 +1542,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1542
1542
|
if "@" in user_id:
|
|
1543
1543
|
user_display = user_id.split("@")[0]
|
|
1544
1544
|
|
|
1545
|
-
# Format GPU information
|
|
1545
|
+
# Format GPU information (MIG-friendly via _format_gpu_display)
|
|
1546
1546
|
if gpu_type and gpu_type not in ["unknown", "Unknown"]:
|
|
1547
|
-
# For CPU nodes (gpu_count = 0), show just the type
|
|
1548
1547
|
if gpu_count == 0:
|
|
1549
1548
|
gpu_display = gpu_type
|
|
1550
1549
|
else:
|
|
1551
|
-
gpu_display =
|
|
1550
|
+
gpu_display = _format_gpu_display(gpu_count, gpu_type)
|
|
1552
1551
|
else:
|
|
1553
1552
|
gpu_display = str(gpu_count)
|
|
1554
1553
|
|
|
@@ -1844,7 +1843,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1844
1843
|
if gpu_count == 0:
|
|
1845
1844
|
gpu_display = gpu_type
|
|
1846
1845
|
else:
|
|
1847
|
-
gpu_display =
|
|
1846
|
+
gpu_display = _format_gpu_display(gpu_count, gpu_type)
|
|
1848
1847
|
else:
|
|
1849
1848
|
gpu_display = str(gpu_count)
|
|
1850
1849
|
|
|
@@ -2417,6 +2416,9 @@ def _format_gpu_display(gpu_count, gpu_type):
|
|
|
2417
2416
|
"h100-mig-3g": "40GB H100 (MIG)",
|
|
2418
2417
|
"h100-mig-4g": "40GB H100 (MIG)",
|
|
2419
2418
|
"h100-mig-7g": "80GB H100 (MIG)",
|
|
2419
|
+
"b200-mig-1g": "23GB B200 (MIG)",
|
|
2420
|
+
"b200-mig-2g": "45GB B200 (MIG)",
|
|
2421
|
+
"b200-mig-3g": "90GB B200 (MIG)",
|
|
2420
2422
|
}
|
|
2421
2423
|
if gt_lower in mig_friendly:
|
|
2422
2424
|
return f"{gpu_count}× {mig_friendly[gt_lower]}"
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.15"
|
|
7
|
+
version = "0.5.17"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -308,30 +308,35 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
|
|
|
308
308
|
f"Node {node.metadata.name} in {node_az}: {available_gpus} available GPUs")
|
|
309
309
|
|
|
310
310
|
if candidate_nodes:
|
|
311
|
-
#
|
|
311
|
+
# Binpacking: pack into the most-loaded node that still fits the request.
|
|
312
|
+
# Sort by free GPUs ASC so the fullest node comes first; ties broken by node name
|
|
313
|
+
# so the choice is deterministic across Lambda invocations.
|
|
314
|
+
candidate_nodes.sort(key=lambda n: (n['available_gpus'], n['node_name']))
|
|
312
315
|
selected_node = candidate_nodes[0]
|
|
313
316
|
target_az = selected_node['az']
|
|
317
|
+
target_node = selected_node['node_name']
|
|
314
318
|
logger.info(
|
|
315
|
-
f"
|
|
316
|
-
|
|
319
|
+
f"Binpacked target for {gpu_type} {gpus_requested}gpu: "
|
|
320
|
+
f"node={target_node} az={target_az} free={selected_node['available_gpus']} "
|
|
321
|
+
f"(candidates considered: {len(candidate_nodes)})")
|
|
322
|
+
return target_az, target_node
|
|
317
323
|
|
|
318
324
|
if all_ready_nodes:
|
|
319
|
-
# No single node has enough GPUs
|
|
320
|
-
#
|
|
325
|
+
# No single node has enough GPUs — return AZ of the node with the most available GPUs
|
|
326
|
+
# so the disk lands in the right AZ. No node hint (the pod will stay Pending until something frees up).
|
|
321
327
|
best_node = max(all_ready_nodes, key=lambda n: n['available_gpus'])
|
|
322
328
|
target_az = best_node['az']
|
|
323
329
|
logger.info(
|
|
324
330
|
f"No single node has {gpus_requested} {gpu_type} GPUs, "
|
|
325
331
|
f"but {len(all_ready_nodes)} nodes exist. Using AZ {target_az} "
|
|
326
332
|
f"from node {best_node['node_name']} ({best_node['available_gpus']} GPUs available)")
|
|
327
|
-
return target_az
|
|
333
|
+
return target_az, None
|
|
328
334
|
|
|
329
335
|
logger.warning(f"No ready/schedulable {gpu_type} nodes found in cluster")
|
|
330
336
|
return None, None
|
|
331
337
|
|
|
332
338
|
except Exception as e:
|
|
333
339
|
logger.error(f"Error determining target AZ for {gpu_type}: {str(e)}")
|
|
334
|
-
# Fallback to primary AZ if detection fails (no node hint — let k8s pick).
|
|
335
340
|
return PRIMARY_AVAILABILITY_ZONE, None
|
|
336
341
|
|
|
337
342
|
|
|
@@ -2722,6 +2727,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2722
2727
|
persistent_volume_id = None
|
|
2723
2728
|
device_name = None
|
|
2724
2729
|
target_az = None # Initialize target_az for use in connection info update
|
|
2730
|
+
target_node = None # Initialize target_node (binpacking hostname pin) for create_pod
|
|
2725
2731
|
is_new_disk = False # Initialize is_new_disk for all code paths
|
|
2726
2732
|
|
|
2727
2733
|
# If we're using persistent disk, immediately mark this reservation as having a volume
|
|
@@ -2749,8 +2755,8 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2749
2755
|
detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else "")
|
|
2750
2756
|
)
|
|
2751
2757
|
|
|
2752
|
-
# Determine target AZ for this reservation
|
|
2753
|
-
target_az = get_target_az_for_reservation(gpu_type, gpu_count)
|
|
2758
|
+
# Determine target AZ + node for this reservation (binpacking)
|
|
2759
|
+
target_az, target_node = get_target_az_for_reservation(gpu_type, gpu_count)
|
|
2754
2760
|
if not target_az:
|
|
2755
2761
|
raise ValueError(f"No {gpu_type} nodes found in cluster")
|
|
2756
2762
|
|
|
@@ -2881,6 +2887,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
|
|
|
2881
2887
|
dockerfile_base64_data=dockerfile_base64_data,
|
|
2882
2888
|
dockerimage=dockerimage,
|
|
2883
2889
|
target_az=target_az,
|
|
2890
|
+
target_node=target_node,
|
|
2884
2891
|
preserve_entrypoint=preserve_entrypoint,
|
|
2885
2892
|
node_labels=node_labels,
|
|
2886
2893
|
trace_data=trace_data,
|
|
@@ -3421,6 +3428,7 @@ def create_kubernetes_resources(
|
|
|
3421
3428
|
recreate_env: bool = False,
|
|
3422
3429
|
efs_filesystem_id: str = None,
|
|
3423
3430
|
is_multinode: bool = False,
|
|
3431
|
+
target_node: str = None,
|
|
3424
3432
|
dockerfile_base64_data: str = None,
|
|
3425
3433
|
dockerimage: str = None,
|
|
3426
3434
|
target_az: str = None,
|
|
@@ -3524,6 +3532,7 @@ def create_kubernetes_resources(
|
|
|
3524
3532
|
dockerfile_base64_data=dockerfile_base64_data,
|
|
3525
3533
|
dockerimage=dockerimage,
|
|
3526
3534
|
target_az=target_az,
|
|
3535
|
+
target_node=target_node,
|
|
3527
3536
|
preserve_entrypoint=preserve_entrypoint,
|
|
3528
3537
|
node_labels=node_labels,
|
|
3529
3538
|
trace_data=trace_data,
|
|
@@ -3610,6 +3619,7 @@ def create_kubernetes_resources(
|
|
|
3610
3619
|
dockerfile_base64_data=dockerfile_base64_data,
|
|
3611
3620
|
dockerimage=dockerimage,
|
|
3612
3621
|
target_az=target_az,
|
|
3622
|
+
target_node=target_node,
|
|
3613
3623
|
preserve_entrypoint=preserve_entrypoint,
|
|
3614
3624
|
node_labels=node_labels,
|
|
3615
3625
|
trace_data=trace_data,
|
|
@@ -3902,6 +3912,7 @@ def create_pod(
|
|
|
3902
3912
|
dockerfile_base64_data: str = None,
|
|
3903
3913
|
dockerimage: str = None,
|
|
3904
3914
|
target_az: str = None,
|
|
3915
|
+
target_node: str = None,
|
|
3905
3916
|
preserve_entrypoint: bool = False,
|
|
3906
3917
|
node_labels: dict = None,
|
|
3907
3918
|
trace_data: dict = None,
|
|
@@ -5309,7 +5320,12 @@ EOF
|
|
|
5309
5320
|
] if _pod_uses_efa(gpu_count, gpu_type, is_multinode) else []),
|
|
5310
5321
|
node_selector={
|
|
5311
5322
|
"GpuType": get_node_gpu_type(gpu_type),
|
|
5312
|
-
**({} if target_az is None else {"topology.kubernetes.io/zone": target_az})
|
|
5323
|
+
**({} if target_az is None else {"topology.kubernetes.io/zone": target_az}),
|
|
5324
|
+
# Hard-pin to the binpacked node when Lambda picked one. Lambda runs
|
|
5325
|
+
# serialized (reserved_concurrent_executions=1), so allocations seen by the
|
|
5326
|
+
# next invocation include this pod. If the node is unavailable, the pod
|
|
5327
|
+
# stays Pending and surfaces the error rather than spreading.
|
|
5328
|
+
**({} if target_node is None else {"kubernetes.io/hostname": target_node}),
|
|
5313
5329
|
},
|
|
5314
5330
|
# Node affinity for profiling-dedicated preference
|
|
5315
5331
|
# If user requests nsight=true, prefer profiling-dedicated nodes
|
|
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
184
|
-
MIN_CLI_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.17"
|
|
184
|
+
MIN_CLI_VERSION = "0.5.16"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
187
|
}, local.alb_env_vars)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|