gpu-dev 0.3.8__tar.gz → 0.3.9__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (114)
  1. {gpu_dev-0.3.8/cli-tools/gpu-dev-cli/gpu_dev.egg-info → gpu_dev-0.3.9}/PKG-INFO +2 -2
  2. {gpu_dev-0.3.8 → gpu_dev-0.3.9/cli-tools/gpu-dev-cli/gpu_dev.egg-info}/PKG-INFO +2 -2
  3. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +1 -1
  4. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +8 -2
  5. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/pyproject.toml +2 -2
  6. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/eks.tf +19 -3
  7. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/availability_updater/index.py +5 -4
  8. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_processor/index.py +41 -64
  9. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda.tf +2 -2
  10. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/templates/al2023-user-data.sh +6 -0
  11. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/.github/workflows/publish.yml +0 -0
  12. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/.gitignore +0 -0
  13. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/CLAUDE.md +0 -0
  14. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/PROGRESS.md +0 -0
  15. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/PR_DESCRIPTION.md +0 -0
  16. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/TODO.md +0 -0
  17. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/admin/README.md +0 -0
  18. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/admin/generate_stats.py +0 -0
  19. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/admin/requirements.txt +0 -0
  20. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/README.md +0 -0
  21. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  22. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  23. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  24. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  25. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  26. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  27. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  28. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
  29. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  30. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  31. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  32. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  33. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  34. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  35. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  36. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/docs/USER_GUIDE.md +0 -0
  37. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/docs/devgpu-features.html +0 -0
  38. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/docs/docker-mark-blue.svg +0 -0
  39. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/docs/icons8-cursor-ai.svg +0 -0
  40. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/post.md +0 -0
  41. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/setup.cfg +0 -0
  42. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  43. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  44. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/README.md +0 -0
  45. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/alb.tf +0 -0
  46. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/availability.tf +0 -0
  47. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/backend.tf +0 -0
  48. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  49. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  50. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  51. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/bash_profile +0 -0
  52. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/bashrc +0 -0
  53. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  54. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  55. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  56. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  57. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/motd_script +0 -0
  58. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  59. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/profile +0 -0
  60. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  61. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  62. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  63. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/shell_env +0 -0
  64. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/ssh_config +0 -0
  65. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/zprofile +0 -0
  66. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/zshrc +0 -0
  67. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  68. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker-build.tf +0 -0
  69. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  70. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  71. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ecr.tf +0 -0
  72. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/efs.tf +0 -0
  73. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/expiry.tf +0 -0
  74. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/git-cache.tf +0 -0
  75. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/kubernetes.tf +0 -0
  76. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  77. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  78. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  79. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  80. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  81. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  82. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  83. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  84. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  85. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  86. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  87. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  88. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  89. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/main.tf +0 -0
  90. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  91. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  92. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  93. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  94. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  95. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/monitoring.tf +0 -0
  96. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/outputs.tf +0 -0
  97. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/pyproject.toml +0 -0
  98. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/queue.tf +0 -0
  99. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/route53.tf +0 -0
  100. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  101. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  102. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  103. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  104. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  105. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  106. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  107. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  108. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  109. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  110. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/switch-to.sh +0 -0
  111. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  112. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  113. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  114. {gpu_dev-0.3.8 → gpu_dev-0.3.9}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.3.8
3
+ Version: 0.3.9
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -12,7 +12,7 @@ Requires-Dist: pydantic>=2.5.0
12
12
  Requires-Dist: rich>=13.7.0
13
13
  Requires-Dist: pyyaml>=6.0.1
14
14
  Requires-Dist: questionary>=2.1.1
15
- Requires-Dist: websockets<13.0,>=12.0
15
+ Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
18
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.3.8
3
+ Version: 0.3.9
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -12,7 +12,7 @@ Requires-Dist: pydantic>=2.5.0
12
12
  Requires-Dist: rich>=13.7.0
13
13
  Requires-Dist: pyyaml>=6.0.1
14
14
  Requires-Dist: questionary>=2.1.1
15
- Requires-Dist: websockets<13.0,>=12.0
15
+ Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
18
 
@@ -5,6 +5,6 @@ pydantic>=2.5.0
5
5
  rich>=13.7.0
6
6
  pyyaml>=6.0.1
7
7
  questionary>=2.1.1
8
- websockets<13.0,>=12.0
8
+ websockets>=12.0
9
9
  certifi>=2023.7.22
10
10
  mcp>=1.0.0
@@ -4,6 +4,7 @@ SSH ProxyCommand helper for tunneling SSH through WebSocket
4
4
  Used by ssh with: ssh -o ProxyCommand='gpu-dev-ssh-proxy %h %p' user@host
5
5
  """
6
6
 
7
+ import os
7
8
  import sys
8
9
  import asyncio
9
10
  import websockets
@@ -18,6 +19,11 @@ async def tunnel_ssh(target_host: str, target_port: int):
18
19
  target_host: Target SSH hostname
19
20
  target_port: Target SSH port
20
21
  """
22
+ # Bypass corporate/local HTTP proxies for devservers.io - we connect
23
+ # directly to our ALB, and proxies can cause WebSocket handshake timeouts
24
+ for var in ("HTTPS_PROXY", "https_proxy", "HTTP_PROXY", "http_proxy", "ALL_PROXY", "all_proxy"):
25
+ os.environ.pop(var, None)
26
+
21
27
  # Determine proxy URL based on target host
22
28
  if ".test.devservers.io" in target_host:
23
29
  proxy_host = "ssh.test.devservers.io"
@@ -31,8 +37,8 @@ async def tunnel_ssh(target_host: str, target_port: int):
31
37
  ws_url = f"wss://{proxy_host}/tunnel/{target_host}"
32
38
 
33
39
  try:
34
- # Connect to WebSocket proxy
35
- async with websockets.connect(ws_url) as websocket:
40
+ # Connect to WebSocket proxy (20s timeout, generous for cold DNS/TLS)
41
+ async with websockets.connect(ws_url, open_timeout=20) as websocket:
36
42
  # Set up stdin/stdout for SSH
37
43
  loop = asyncio.get_event_loop()
38
44
  reader = asyncio.StreamReader()
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.3.8"
7
+ version = "0.3.9"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -17,7 +17,7 @@ dependencies = [
17
17
  "rich>=13.7.0",
18
18
  "pyyaml>=6.0.1",
19
19
  "questionary>=2.1.1",
20
- "websockets>=12.0,<13.0",
20
+ "websockets>=12.0",
21
21
  "certifi>=2023.7.22",
22
22
  "mcp>=1.0.0",
23
23
  ]
@@ -344,16 +344,31 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
344
344
  }
345
345
  }
346
346
 
347
- # Network interface (EFA enabled for supported instance types only)
347
+ # Primary network interface (card 0) - EFA+ENA for GPU instances, regular for CPU/T4-small
348
348
  network_interfaces {
349
+ network_card_index = 0
350
+ device_index = 0
349
351
  associate_public_ip_address = true
350
352
  security_groups = [aws_security_group.gpu_dev_sg.id]
351
353
  subnet_id = each.value.gpu_config.use_placement_group ? null : (local.gpu_subnet_assignments[terraform.workspace][each.value.gpu_type] == "secondary" ? aws_subnet.gpu_dev_subnet_secondary.id : aws_subnet.gpu_dev_subnet.id)
352
- # EFA is not supported on g4dn.2xlarge (t4-small) and CPU instances
353
- interface_type = (each.value.gpu_type == "t4-small" || each.value.gpu_config.gpus_per_instance == 0) ? "interface" : "efa"
354
+ interface_type = try(each.value.gpu_config.efa_network_cards, 0) > 0 ? "efa" : "interface"
354
355
  delete_on_termination = true
355
356
  }
356
357
 
358
+ # Additional EFA-only interfaces (cards 1-N) for multi-card instances (p5, p5e, p6, p4d)
359
+ dynamic "network_interfaces" {
360
+ for_each = try(each.value.gpu_config.efa_network_cards, 0) > 1 ? range(1, each.value.gpu_config.efa_network_cards) : []
361
+ content {
362
+ network_card_index = network_interfaces.value
363
+ device_index = 1
364
+ associate_public_ip_address = false
365
+ security_groups = [aws_security_group.gpu_dev_sg.id]
366
+ subnet_id = each.value.gpu_config.use_placement_group ? null : (local.gpu_subnet_assignments[terraform.workspace][each.value.gpu_type] == "secondary" ? aws_subnet.gpu_dev_subnet_secondary.id : aws_subnet.gpu_dev_subnet.id)
367
+ interface_type = "efa-only"
368
+ delete_on_termination = true
369
+ }
370
+ }
371
+
357
372
  # Conditionally add instance_market_options for capacity block instances (only when capacity reservation exists)
358
373
  dynamic "instance_market_options" {
359
374
  for_each = each.value.capacity_reservation_id != null ? [1] : []
@@ -381,6 +396,7 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
381
396
  region = local.current_config.aws_region
382
397
  gpu_type = local.gpu_type_kubernetes_labels[each.value.gpu_type]
383
398
  profiling_dedicated = try(each.value.gpu_config.profiling_dedicated, false)
399
+ container_image = local.latest_image_uri
384
400
  }))
385
401
 
386
402
  tag_specifications {
@@ -176,8 +176,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
176
176
  max_reservable = 0 # Maximum GPUs reservable (considering multinode for high-end GPUs)
177
177
  if k8s_client is not None and not is_cpu_type:
178
178
  try:
179
- from kubernetes import client
180
- v1 = client.CoreV1Api(k8s_client)
179
+ from kubernetes import client as k8s_client_lib
180
+ v1 = k8s_client_lib.CoreV1Api(k8s_client)
181
181
  nodes = v1.list_node(label_selector=f"GpuType={gpu_type}")
182
182
 
183
183
  single_node_max = 0 # Max available on any single node
@@ -216,8 +216,9 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
216
216
  logger.info(f"Found {full_nodes_available} full nodes available for {gpu_type}, max reservable: {max_reservable} (single node max: {single_node_max})")
217
217
  except Exception as e:
218
218
  logger.warning(f"Could not calculate full nodes available for {gpu_type}: {str(e)}")
219
- full_nodes_available = 0
220
- max_reservable = 0
219
+ # Fallback: use available_gpus so max_reservable isn't misleadingly 0
220
+ full_nodes_available = available_gpus // gpus_per_instance if gpus_per_instance > 0 else 0
221
+ max_reservable = available_gpus
221
222
  elif is_cpu_type:
222
223
  # For CPU nodes, each node supports 1 reservation
223
224
  full_nodes_available = available_gpus # Each "GPU" represents one CPU node slot
@@ -63,19 +63,19 @@ OPERATIONS_TABLE = os.environ.get("OPERATIONS_TABLE", "pytorch-gpu-dev-operation
63
63
 
64
64
  # GPU Configuration - single source of truth for all GPU type mappings
65
65
  GPU_CONFIG = {
66
- "t4": {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192},
67
- "l4": {"instance_type": "g6.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192},
68
- "a10g": {"instance_type": "g5.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192},
69
- "t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32},
70
- "g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32},
71
- "a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152},
72
- "h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048},
73
- "h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048},
74
- "b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048},
75
- "cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64},
76
- "cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64},
66
+ "t4": {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0},
67
+ "l4": {"instance_type": "g6.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
68
+ "a10g": {"instance_type": "g5.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
69
+ "t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
70
+ "g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
71
+ "a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 1},
72
+ "h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 1},
73
+ "h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 1},
74
+ "b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 1},
75
+ "cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
76
+ "cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
77
77
  }
78
- GPU_CONFIG_DEFAULT = {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192}
78
+ GPU_CONFIG_DEFAULT = {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0}
79
79
 
80
80
 
81
81
  def retry_with_backoff(func, *args, max_retries=5, initial_delay=1, max_delay=32, **kwargs):
@@ -748,48 +748,36 @@ def restore_ebs_from_existing_snapshot(snapshot_id, target_az, user_id):
748
748
  raise
749
749
 
750
750
 
751
+ _efs_cache = {} # Module-level cache: user_id -> efs_id (shared across threads in same invocation)
752
+
753
+
751
754
  def create_or_find_user_efs(user_id: str) -> str:
752
755
  """Create or find existing EFS filesystem for user shared storage"""
756
+ if user_id in _efs_cache:
757
+ cached_id = _efs_cache[user_id]
758
+ logger.info(f"Using cached EFS {cached_id} for user {user_id}")
759
+ ensure_efs_mount_target(cached_id)
760
+ return cached_id
761
+
753
762
  try:
754
763
  logger.info(f"Looking for existing EFS filesystem for user {user_id}")
755
764
 
756
- # Check for existing EFS with user tag
765
+ # Tags are included inline in describe_file_systems response - no
766
+ # need for separate describe_tags calls (which get throttled heavily).
767
+ matching_efs = []
757
768
  response = efs_client.describe_file_systems()
758
-
759
- throttle_failures = 0
760
- total_filesystems = len(response.get("FileSystems", []))
761
- matching_efs = [] # Collect all matching EFS, sorted by creation time
762
-
763
- for fs in response.get("FileSystems", []):
764
- fs_id = fs["FileSystemId"]
765
-
766
- # Get tags for this filesystem
767
- try:
768
- tags_response = retry_with_backoff(efs_client.describe_tags, FileSystemId=fs_id)
769
- tags = {tag["Key"]: tag["Value"]
770
- for tag in tags_response.get("Tags", [])}
771
-
769
+ while True:
770
+ for fs in response.get("FileSystems", []):
771
+ tags = {tag["Key"]: tag["Value"] for tag in fs.get("Tags", [])}
772
772
  if tags.get("gpu-dev-user") == user_id:
773
773
  logger.info(
774
- f"Found existing EFS {fs_id} for user {user_id} (created {fs.get('CreationTime')})")
774
+ f"Found existing EFS {fs['FileSystemId']} for user {user_id} (created {fs.get('CreationTime')})")
775
775
  matching_efs.append(fs)
776
+ if "NextMarker" not in response:
777
+ break
778
+ response = efs_client.describe_file_systems(Marker=response["NextMarker"])
776
779
 
777
- except Exception as tag_error:
778
- error_str = str(tag_error)
779
- # Track throttling failures separately
780
- if "Throttling" in error_str or "RequestLimitExceeded" in error_str or "TooManyRequests" in error_str:
781
- throttle_failures += 1
782
- logger.warning(
783
- f"EFS DescribeTags throttled for {fs_id} ({throttle_failures}/{total_filesystems}): {tag_error}")
784
- else:
785
- logger.warning(
786
- f"Could not get tags for EFS {fs_id}: {tag_error}")
787
- continue
788
-
789
- # If we found matching EFS, return the NEWEST one (by CreationTime)
790
- # Do this BEFORE checking throttling - if we found EFS, throttling doesn't matter
791
780
  if matching_efs:
792
- # Sort by CreationTime descending (newest first)
793
781
  matching_efs.sort(key=lambda x: x.get('CreationTime'), reverse=True)
794
782
  newest_efs = matching_efs[0]
795
783
  fs_id = newest_efs["FileSystemId"]
@@ -803,24 +791,10 @@ def create_or_find_user_efs(user_id: str) -> str:
803
791
  else:
804
792
  logger.info(f"Using EFS {fs_id} for user {user_id}")
805
793
 
806
- # Log throttling as warning but proceed anyway (we have valid EFS)
807
- if throttle_failures > 0:
808
- logger.warning(
809
- f"Had {throttle_failures}/{total_filesystems} throttling errors during scan, "
810
- f"but found valid EFS {fs_id} - proceeding"
811
- )
812
-
813
- # Ensure mount target exists
814
794
  ensure_efs_mount_target(fs_id)
795
+ _efs_cache[user_id] = fs_id
815
796
  return fs_id
816
797
 
817
- # No matching EFS found - check if throttling prevented complete scan
818
- if throttle_failures > 0:
819
- raise Exception(
820
- f"EFS DescribeTags API throttled ({throttle_failures}/{total_filesystems} filesystems). "
821
- f"Cannot safely create new EFS - retry later to avoid duplicates."
822
- )
823
-
824
798
  # Create new EFS filesystem
825
799
  logger.info(f"Creating new EFS filesystem for user {user_id}")
826
800
 
@@ -886,6 +860,7 @@ def create_or_find_user_efs(user_id: str) -> str:
886
860
  # Don't fail EFS creation for this
887
861
 
888
862
  logger.info(f"Created new EFS filesystem {fs_id} for user {user_id}")
863
+ _efs_cache[user_id] = fs_id
889
864
  return fs_id
890
865
 
891
866
  except Exception as e:
@@ -3519,8 +3494,9 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
3519
3494
  )
3520
3495
 
3521
3496
  if use_efa:
3522
- limits["vpc.amazonaws.com/efa"] = "1"
3523
- logger.info(f"Using EFA for multinode full-node deployment: {gpu_count}/{max_gpus} GPUs")
3497
+ efa_count = config.get("efa_count", 1)
3498
+ limits["vpc.amazonaws.com/efa"] = str(efa_count)
3499
+ logger.info(f"Using EFA ({efa_count} interfaces) for multinode full-node deployment: {gpu_count}/{max_gpus} GPUs")
3524
3500
  else:
3525
3501
  logger.info(f"Skipping EFA: multinode={is_multinode}, gpu_count={gpu_count}/{max_gpus}, gpu_type={gpu_type}")
3526
3502
 
@@ -3560,7 +3536,8 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
3560
3536
  gpu_count == max_gpus
3561
3537
  )
3562
3538
  if use_efa:
3563
- requests["vpc.amazonaws.com/efa"] = "1"
3539
+ efa_count = config.get("efa_count", 1)
3540
+ requests["vpc.amazonaws.com/efa"] = str(efa_count)
3564
3541
 
3565
3542
  return requests
3566
3543
 
@@ -3709,8 +3686,8 @@ def create_pod(
3709
3686
  init_containers=[
3710
3687
  client.V1Container(
3711
3688
  name="ssh-setup",
3712
- image="alpine:latest",
3713
- image_pull_policy="Always", # Fail fast if image doesn't exist
3689
+ image="alpine:3.21",
3690
+ image_pull_policy="IfNotPresent",
3714
3691
  command=["/bin/sh"],
3715
3692
  args=[
3716
3693
  "-c",
@@ -3790,7 +3767,7 @@ def create_pod(
3790
3767
  client.V1Container(
3791
3768
  name="gpu-dev",
3792
3769
  image=container_image,
3793
- image_pull_policy="Always", # Always pull to check if image exists, fail fast if not
3770
+ image_pull_policy="IfNotPresent",
3794
3771
  **({
3795
3772
  "command": ["/bin/bash"],
3796
3773
  "args": [
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.3.7"
184
- MIN_CLI_VERSION = "0.3.7"
183
+ LAMBDA_VERSION = "0.3.9"
184
+ MIN_CLI_VERSION = "0.3.9"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
187
187
  }, local.alb_env_vars)
@@ -117,4 +117,10 @@ net.core.wmem_max=262144000
117
117
  EOF
118
118
  sysctl --system
119
119
 
120
+ # Pre-pull GPU dev container image and refresh every 30 minutes
121
+ # ECR credentials are handled by kubelet's credential provider
122
+ ECR_IMAGE="${container_image}"
123
+ crictl pull "$ECR_IMAGE" || echo "Initial image pre-pull failed (node may not be ready yet)"
124
+ echo "*/30 * * * * ECR_LOGIN=\$(aws ecr get-login-password --region ${region}) && echo \$ECR_LOGIN | crictl pull --creds AWS:\$ECR_LOGIN $ECR_IMAGE 2>&1 | logger -t gpu-dev-image-pull" | crontab -
125
+
120
126
  echo "Amazon Linux 2023 EKS GPU node setup completed"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes