gpu-dev 0.3.8__tar.gz → 0.3.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.3.8/cli-tools/gpu-dev-cli/gpu_dev.egg-info → gpu_dev-0.3.9}/PKG-INFO +2 -2
- {gpu_dev-0.3.8 → gpu_dev-0.3.9/cli-tools/gpu-dev-cli/gpu_dev.egg-info}/PKG-INFO +2 -2
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +1 -1
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +8 -2
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/pyproject.toml +2 -2
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/eks.tf +19 -3
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/availability_updater/index.py +5 -4
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_processor/index.py +41 -64
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda.tf +2 -2
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/templates/al2023-user-data.sh +6 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/.gitignore +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/CLAUDE.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/PROGRESS.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/TODO.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/admin/README.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/admin/generate_stats.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/admin/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/post.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/setup.cfg +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.3.8
|
|
3
|
+
Version: 0.3.9
|
|
4
4
|
Summary: CLI tool for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -12,7 +12,7 @@ Requires-Dist: pydantic>=2.5.0
|
|
|
12
12
|
Requires-Dist: rich>=13.7.0
|
|
13
13
|
Requires-Dist: pyyaml>=6.0.1
|
|
14
14
|
Requires-Dist: questionary>=2.1.1
|
|
15
|
-
Requires-Dist: websockets
|
|
15
|
+
Requires-Dist: websockets>=12.0
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
18
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.3.8
|
|
3
|
+
Version: 0.3.9
|
|
4
4
|
Summary: CLI tool for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -12,7 +12,7 @@ Requires-Dist: pydantic>=2.5.0
|
|
|
12
12
|
Requires-Dist: rich>=13.7.0
|
|
13
13
|
Requires-Dist: pyyaml>=6.0.1
|
|
14
14
|
Requires-Dist: questionary>=2.1.1
|
|
15
|
-
Requires-Dist: websockets
|
|
15
|
+
Requires-Dist: websockets>=12.0
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
18
|
|
|
@@ -4,6 +4,7 @@ SSH ProxyCommand helper for tunneling SSH through WebSocket
|
|
|
4
4
|
Used by ssh with: ssh -o ProxyCommand='gpu-dev-ssh-proxy %h %p' user@host
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
+
import os
|
|
7
8
|
import sys
|
|
8
9
|
import asyncio
|
|
9
10
|
import websockets
|
|
@@ -18,6 +19,11 @@ async def tunnel_ssh(target_host: str, target_port: int):
|
|
|
18
19
|
target_host: Target SSH hostname
|
|
19
20
|
target_port: Target SSH port
|
|
20
21
|
"""
|
|
22
|
+
# Bypass corporate/local HTTP proxies for devservers.io - we connect
|
|
23
|
+
# directly to our ALB, and proxies can cause WebSocket handshake timeouts
|
|
24
|
+
for var in ("HTTPS_PROXY", "https_proxy", "HTTP_PROXY", "http_proxy", "ALL_PROXY", "all_proxy"):
|
|
25
|
+
os.environ.pop(var, None)
|
|
26
|
+
|
|
21
27
|
# Determine proxy URL based on target host
|
|
22
28
|
if ".test.devservers.io" in target_host:
|
|
23
29
|
proxy_host = "ssh.test.devservers.io"
|
|
@@ -31,8 +37,8 @@ async def tunnel_ssh(target_host: str, target_port: int):
|
|
|
31
37
|
ws_url = f"wss://{proxy_host}/tunnel/{target_host}"
|
|
32
38
|
|
|
33
39
|
try:
|
|
34
|
-
# Connect to WebSocket proxy
|
|
35
|
-
async with websockets.connect(ws_url) as websocket:
|
|
40
|
+
# Connect to WebSocket proxy (20s timeout, generous for cold DNS/TLS)
|
|
41
|
+
async with websockets.connect(ws_url, open_timeout=20) as websocket:
|
|
36
42
|
# Set up stdin/stdout for SSH
|
|
37
43
|
loop = asyncio.get_event_loop()
|
|
38
44
|
reader = asyncio.StreamReader()
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.3.8"
|
|
7
|
+
version = "0.3.9"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -17,7 +17,7 @@ dependencies = [
|
|
|
17
17
|
"rich>=13.7.0",
|
|
18
18
|
"pyyaml>=6.0.1",
|
|
19
19
|
"questionary>=2.1.1",
|
|
20
|
-
"websockets>=12.0
|
|
20
|
+
"websockets>=12.0",
|
|
21
21
|
"certifi>=2023.7.22",
|
|
22
22
|
"mcp>=1.0.0",
|
|
23
23
|
]
|
|
@@ -344,16 +344,31 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
|
|
|
344
344
|
}
|
|
345
345
|
}
|
|
346
346
|
|
|
347
|
-
#
|
|
347
|
+
# Primary network interface (card 0) - EFA+ENA for GPU instances, regular for CPU/T4-small
|
|
348
348
|
network_interfaces {
|
|
349
|
+
network_card_index = 0
|
|
350
|
+
device_index = 0
|
|
349
351
|
associate_public_ip_address = true
|
|
350
352
|
security_groups = [aws_security_group.gpu_dev_sg.id]
|
|
351
353
|
subnet_id = each.value.gpu_config.use_placement_group ? null : (local.gpu_subnet_assignments[terraform.workspace][each.value.gpu_type] == "secondary" ? aws_subnet.gpu_dev_subnet_secondary.id : aws_subnet.gpu_dev_subnet.id)
|
|
352
|
-
|
|
353
|
-
interface_type = (each.value.gpu_type == "t4-small" || each.value.gpu_config.gpus_per_instance == 0) ? "interface" : "efa"
|
|
354
|
+
interface_type = try(each.value.gpu_config.efa_network_cards, 0) > 0 ? "efa" : "interface"
|
|
354
355
|
delete_on_termination = true
|
|
355
356
|
}
|
|
356
357
|
|
|
358
|
+
# Additional EFA-only interfaces (cards 1-N) for multi-card instances (p5, p5e, p6, p4d)
|
|
359
|
+
dynamic "network_interfaces" {
|
|
360
|
+
for_each = try(each.value.gpu_config.efa_network_cards, 0) > 1 ? range(1, each.value.gpu_config.efa_network_cards) : []
|
|
361
|
+
content {
|
|
362
|
+
network_card_index = network_interfaces.value
|
|
363
|
+
device_index = 1
|
|
364
|
+
associate_public_ip_address = false
|
|
365
|
+
security_groups = [aws_security_group.gpu_dev_sg.id]
|
|
366
|
+
subnet_id = each.value.gpu_config.use_placement_group ? null : (local.gpu_subnet_assignments[terraform.workspace][each.value.gpu_type] == "secondary" ? aws_subnet.gpu_dev_subnet_secondary.id : aws_subnet.gpu_dev_subnet.id)
|
|
367
|
+
interface_type = "efa-only"
|
|
368
|
+
delete_on_termination = true
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
|
|
357
372
|
# Conditionally add instance_market_options for capacity block instances (only when capacity reservation exists)
|
|
358
373
|
dynamic "instance_market_options" {
|
|
359
374
|
for_each = each.value.capacity_reservation_id != null ? [1] : []
|
|
@@ -381,6 +396,7 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
|
|
|
381
396
|
region = local.current_config.aws_region
|
|
382
397
|
gpu_type = local.gpu_type_kubernetes_labels[each.value.gpu_type]
|
|
383
398
|
profiling_dedicated = try(each.value.gpu_config.profiling_dedicated, false)
|
|
399
|
+
container_image = local.latest_image_uri
|
|
384
400
|
}))
|
|
385
401
|
|
|
386
402
|
tag_specifications {
|
{gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
@@ -176,8 +176,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
176
176
|
max_reservable = 0 # Maximum GPUs reservable (considering multinode for high-end GPUs)
|
|
177
177
|
if k8s_client is not None and not is_cpu_type:
|
|
178
178
|
try:
|
|
179
|
-
from kubernetes import client
|
|
180
|
-
v1 =
|
|
179
|
+
from kubernetes import client as k8s_client_lib
|
|
180
|
+
v1 = k8s_client_lib.CoreV1Api(k8s_client)
|
|
181
181
|
nodes = v1.list_node(label_selector=f"GpuType={gpu_type}")
|
|
182
182
|
|
|
183
183
|
single_node_max = 0 # Max available on any single node
|
|
@@ -216,8 +216,9 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
216
216
|
logger.info(f"Found {full_nodes_available} full nodes available for {gpu_type}, max reservable: {max_reservable} (single node max: {single_node_max})")
|
|
217
217
|
except Exception as e:
|
|
218
218
|
logger.warning(f"Could not calculate full nodes available for {gpu_type}: {str(e)}")
|
|
219
|
-
|
|
220
|
-
|
|
219
|
+
# Fallback: use available_gpus so max_reservable isn't misleadingly 0
|
|
220
|
+
full_nodes_available = available_gpus // gpus_per_instance if gpus_per_instance > 0 else 0
|
|
221
|
+
max_reservable = available_gpus
|
|
221
222
|
elif is_cpu_type:
|
|
222
223
|
# For CPU nodes, each node supports 1 reservation
|
|
223
224
|
full_nodes_available = available_gpus # Each "GPU" represents one CPU node slot
|
{gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -63,19 +63,19 @@ OPERATIONS_TABLE = os.environ.get("OPERATIONS_TABLE", "pytorch-gpu-dev-operation
|
|
|
63
63
|
|
|
64
64
|
# GPU Configuration - single source of truth for all GPU type mappings
|
|
65
65
|
GPU_CONFIG = {
|
|
66
|
-
"t4": {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192},
|
|
67
|
-
"l4": {"instance_type": "g6.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192},
|
|
68
|
-
"a10g": {"instance_type": "g5.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192},
|
|
69
|
-
"t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32},
|
|
70
|
-
"g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32},
|
|
71
|
-
"a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152},
|
|
72
|
-
"h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048},
|
|
73
|
-
"h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048},
|
|
74
|
-
"b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048},
|
|
75
|
-
"cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64},
|
|
76
|
-
"cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64},
|
|
66
|
+
"t4": {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0},
|
|
67
|
+
"l4": {"instance_type": "g6.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
|
|
68
|
+
"a10g": {"instance_type": "g5.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
|
|
69
|
+
"t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
70
|
+
"g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
71
|
+
"a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 1},
|
|
72
|
+
"h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 1},
|
|
73
|
+
"h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 1},
|
|
74
|
+
"b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 1},
|
|
75
|
+
"cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
76
|
+
"cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
|
|
77
77
|
}
|
|
78
|
-
GPU_CONFIG_DEFAULT = {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192}
|
|
78
|
+
GPU_CONFIG_DEFAULT = {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0}
|
|
79
79
|
|
|
80
80
|
|
|
81
81
|
def retry_with_backoff(func, *args, max_retries=5, initial_delay=1, max_delay=32, **kwargs):
|
|
@@ -748,48 +748,36 @@ def restore_ebs_from_existing_snapshot(snapshot_id, target_az, user_id):
|
|
|
748
748
|
raise
|
|
749
749
|
|
|
750
750
|
|
|
751
|
+
_efs_cache = {} # Module-level cache: user_id -> efs_id (shared across threads in same invocation)
|
|
752
|
+
|
|
753
|
+
|
|
751
754
|
def create_or_find_user_efs(user_id: str) -> str:
|
|
752
755
|
"""Create or find existing EFS filesystem for user shared storage"""
|
|
756
|
+
if user_id in _efs_cache:
|
|
757
|
+
cached_id = _efs_cache[user_id]
|
|
758
|
+
logger.info(f"Using cached EFS {cached_id} for user {user_id}")
|
|
759
|
+
ensure_efs_mount_target(cached_id)
|
|
760
|
+
return cached_id
|
|
761
|
+
|
|
753
762
|
try:
|
|
754
763
|
logger.info(f"Looking for existing EFS filesystem for user {user_id}")
|
|
755
764
|
|
|
756
|
-
#
|
|
765
|
+
# Tags are included inline in describe_file_systems response - no
|
|
766
|
+
# need for separate describe_tags calls (which get throttled heavily).
|
|
767
|
+
matching_efs = []
|
|
757
768
|
response = efs_client.describe_file_systems()
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
matching_efs = [] # Collect all matching EFS, sorted by creation time
|
|
762
|
-
|
|
763
|
-
for fs in response.get("FileSystems", []):
|
|
764
|
-
fs_id = fs["FileSystemId"]
|
|
765
|
-
|
|
766
|
-
# Get tags for this filesystem
|
|
767
|
-
try:
|
|
768
|
-
tags_response = retry_with_backoff(efs_client.describe_tags, FileSystemId=fs_id)
|
|
769
|
-
tags = {tag["Key"]: tag["Value"]
|
|
770
|
-
for tag in tags_response.get("Tags", [])}
|
|
771
|
-
|
|
769
|
+
while True:
|
|
770
|
+
for fs in response.get("FileSystems", []):
|
|
771
|
+
tags = {tag["Key"]: tag["Value"] for tag in fs.get("Tags", [])}
|
|
772
772
|
if tags.get("gpu-dev-user") == user_id:
|
|
773
773
|
logger.info(
|
|
774
|
-
f"Found existing EFS {
|
|
774
|
+
f"Found existing EFS {fs['FileSystemId']} for user {user_id} (created {fs.get('CreationTime')})")
|
|
775
775
|
matching_efs.append(fs)
|
|
776
|
+
if "NextMarker" not in response:
|
|
777
|
+
break
|
|
778
|
+
response = efs_client.describe_file_systems(Marker=response["NextMarker"])
|
|
776
779
|
|
|
777
|
-
except Exception as tag_error:
|
|
778
|
-
error_str = str(tag_error)
|
|
779
|
-
# Track throttling failures separately
|
|
780
|
-
if "Throttling" in error_str or "RequestLimitExceeded" in error_str or "TooManyRequests" in error_str:
|
|
781
|
-
throttle_failures += 1
|
|
782
|
-
logger.warning(
|
|
783
|
-
f"EFS DescribeTags throttled for {fs_id} ({throttle_failures}/{total_filesystems}): {tag_error}")
|
|
784
|
-
else:
|
|
785
|
-
logger.warning(
|
|
786
|
-
f"Could not get tags for EFS {fs_id}: {tag_error}")
|
|
787
|
-
continue
|
|
788
|
-
|
|
789
|
-
# If we found matching EFS, return the NEWEST one (by CreationTime)
|
|
790
|
-
# Do this BEFORE checking throttling - if we found EFS, throttling doesn't matter
|
|
791
780
|
if matching_efs:
|
|
792
|
-
# Sort by CreationTime descending (newest first)
|
|
793
781
|
matching_efs.sort(key=lambda x: x.get('CreationTime'), reverse=True)
|
|
794
782
|
newest_efs = matching_efs[0]
|
|
795
783
|
fs_id = newest_efs["FileSystemId"]
|
|
@@ -803,24 +791,10 @@ def create_or_find_user_efs(user_id: str) -> str:
|
|
|
803
791
|
else:
|
|
804
792
|
logger.info(f"Using EFS {fs_id} for user {user_id}")
|
|
805
793
|
|
|
806
|
-
# Log throttling as warning but proceed anyway (we have valid EFS)
|
|
807
|
-
if throttle_failures > 0:
|
|
808
|
-
logger.warning(
|
|
809
|
-
f"Had {throttle_failures}/{total_filesystems} throttling errors during scan, "
|
|
810
|
-
f"but found valid EFS {fs_id} - proceeding"
|
|
811
|
-
)
|
|
812
|
-
|
|
813
|
-
# Ensure mount target exists
|
|
814
794
|
ensure_efs_mount_target(fs_id)
|
|
795
|
+
_efs_cache[user_id] = fs_id
|
|
815
796
|
return fs_id
|
|
816
797
|
|
|
817
|
-
# No matching EFS found - check if throttling prevented complete scan
|
|
818
|
-
if throttle_failures > 0:
|
|
819
|
-
raise Exception(
|
|
820
|
-
f"EFS DescribeTags API throttled ({throttle_failures}/{total_filesystems} filesystems). "
|
|
821
|
-
f"Cannot safely create new EFS - retry later to avoid duplicates."
|
|
822
|
-
)
|
|
823
|
-
|
|
824
798
|
# Create new EFS filesystem
|
|
825
799
|
logger.info(f"Creating new EFS filesystem for user {user_id}")
|
|
826
800
|
|
|
@@ -886,6 +860,7 @@ def create_or_find_user_efs(user_id: str) -> str:
|
|
|
886
860
|
# Don't fail EFS creation for this
|
|
887
861
|
|
|
888
862
|
logger.info(f"Created new EFS filesystem {fs_id} for user {user_id}")
|
|
863
|
+
_efs_cache[user_id] = fs_id
|
|
889
864
|
return fs_id
|
|
890
865
|
|
|
891
866
|
except Exception as e:
|
|
@@ -3519,8 +3494,9 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
|
|
|
3519
3494
|
)
|
|
3520
3495
|
|
|
3521
3496
|
if use_efa:
|
|
3522
|
-
|
|
3523
|
-
|
|
3497
|
+
efa_count = config.get("efa_count", 1)
|
|
3498
|
+
limits["vpc.amazonaws.com/efa"] = str(efa_count)
|
|
3499
|
+
logger.info(f"Using EFA ({efa_count} interfaces) for multinode full-node deployment: {gpu_count}/{max_gpus} GPUs")
|
|
3524
3500
|
else:
|
|
3525
3501
|
logger.info(f"Skipping EFA: multinode={is_multinode}, gpu_count={gpu_count}/{max_gpus}, gpu_type={gpu_type}")
|
|
3526
3502
|
|
|
@@ -3560,7 +3536,8 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
|
|
|
3560
3536
|
gpu_count == max_gpus
|
|
3561
3537
|
)
|
|
3562
3538
|
if use_efa:
|
|
3563
|
-
|
|
3539
|
+
efa_count = config.get("efa_count", 1)
|
|
3540
|
+
requests["vpc.amazonaws.com/efa"] = str(efa_count)
|
|
3564
3541
|
|
|
3565
3542
|
return requests
|
|
3566
3543
|
|
|
@@ -3709,8 +3686,8 @@ def create_pod(
|
|
|
3709
3686
|
init_containers=[
|
|
3710
3687
|
client.V1Container(
|
|
3711
3688
|
name="ssh-setup",
|
|
3712
|
-
image="alpine:
|
|
3713
|
-
image_pull_policy="
|
|
3689
|
+
image="alpine:3.21",
|
|
3690
|
+
image_pull_policy="IfNotPresent",
|
|
3714
3691
|
command=["/bin/sh"],
|
|
3715
3692
|
args=[
|
|
3716
3693
|
"-c",
|
|
@@ -3790,7 +3767,7 @@ def create_pod(
|
|
|
3790
3767
|
client.V1Container(
|
|
3791
3768
|
name="gpu-dev",
|
|
3792
3769
|
image=container_image,
|
|
3793
|
-
image_pull_policy="
|
|
3770
|
+
image_pull_policy="IfNotPresent",
|
|
3794
3771
|
**({
|
|
3795
3772
|
"command": ["/bin/bash"],
|
|
3796
3773
|
"args": [
|
|
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.3.
|
|
184
|
-
MIN_CLI_VERSION = "0.3.
|
|
183
|
+
LAMBDA_VERSION = "0.3.9"
|
|
184
|
+
MIN_CLI_VERSION = "0.3.9"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
187
|
}, local.alb_env_vars)
|
|
@@ -117,4 +117,10 @@ net.core.wmem_max=262144000
|
|
|
117
117
|
EOF
|
|
118
118
|
sysctl --system
|
|
119
119
|
|
|
120
|
+
# Pre-pull GPU dev container image and refresh every 30 minutes
|
|
121
|
+
# ECR credentials are handled by kubelet's credential provider
|
|
122
|
+
ECR_IMAGE="${container_image}"
|
|
123
|
+
crictl pull "$ECR_IMAGE" || echo "Initial image pre-pull failed (node may not be ready yet)"
|
|
124
|
+
echo "*/30 * * * * ECR_LOGIN=\$(aws ecr get-login-password --region ${region}) && echo \$ECR_LOGIN | crictl pull --creds AWS:\$ECR_LOGIN $ECR_IMAGE 2>&1 | logger -t gpu-dev-image-pull" | crontab -
|
|
125
|
+
|
|
120
126
|
echo "Amazon Linux 2023 EKS GPU node setup completed"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|