gpu-dev 0.5.15__tar.gz → 0.5.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +18 -1
  4. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +6 -4
  5. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/pyproject.toml +1 -1
  6. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_processor/index.py +26 -10
  7. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda.tf +2 -2
  8. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/.github/workflows/no-gitlinks.yml +0 -0
  9. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/.github/workflows/publish.yml +0 -0
  10. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/.gitignore +0 -0
  11. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/CLAUDE.md +0 -0
  12. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/PROGRESS.md +0 -0
  13. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/PR_DESCRIPTION.md +0 -0
  14. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/TODO.md +0 -0
  15. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/admin/README.md +0 -0
  16. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/admin/generate_stats.py +0 -0
  17. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/admin/requirements.txt +0 -0
  18. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/README.md +0 -0
  19. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  20. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  21. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  22. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  23. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  24. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  25. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  26. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  27. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  28. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  29. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  30. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  31. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  32. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  33. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  34. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/docs/USER_GUIDE.md +0 -0
  35. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/docs/devgpu-features.html +0 -0
  36. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/docs/docker-mark-blue.svg +0 -0
  37. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/docs/icons8-cursor-ai.svg +0 -0
  38. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/post.md +0 -0
  39. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/setup.cfg +0 -0
  40. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  41. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  42. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/README.md +0 -0
  43. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/alb.tf +0 -0
  44. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/availability.tf +0 -0
  45. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/backend.tf +0 -0
  46. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  47. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  48. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  49. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/bash_profile +0 -0
  50. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/bashrc +0 -0
  51. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  52. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  53. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  54. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  55. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/motd_script +0 -0
  56. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  57. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/profile +0 -0
  58. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  59. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  60. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  61. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/shell_env +0 -0
  62. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/ssh_config +0 -0
  63. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/zprofile +0 -0
  64. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/zshrc +0 -0
  65. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  66. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker-build.tf +0 -0
  67. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  68. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  69. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ecr.tf +0 -0
  70. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/efs.tf +0 -0
  71. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/eks.tf +0 -0
  72. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/expiry.tf +0 -0
  73. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/git-cache.tf +0 -0
  74. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/kubernetes.tf +0 -0
  75. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  76. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  77. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  78. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  79. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  80. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  81. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  82. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  83. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  84. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  85. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  86. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  87. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  88. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  89. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/main.tf +0 -0
  90. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/mig-config.tf +0 -0
  91. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  92. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  93. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  94. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  95. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  96. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  97. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/monitoring.tf +0 -0
  98. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/outputs.tf +0 -0
  99. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/pyproject.toml +0 -0
  100. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/queue.tf +0 -0
  101. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/route53.tf +0 -0
  102. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  103. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  104. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  105. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  106. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  107. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  108. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  109. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  110. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  111. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  112. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/switch-to.sh +0 -0
  113. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  114. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  115. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  116. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  117. {gpu_dev-0.5.15 → gpu_dev-0.5.17}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.15
3
+ Version: 0.5.17
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.15
3
+ Version: 0.5.17
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -19,7 +19,7 @@ _SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cach
19
19
  # Cache for authenticate_user. STS GetCallerIdentity is stable per AWS profile and slow under SSO
20
20
  # (~500ms-1.5s). Cache for 24h keyed by AWS_PROFILE; if creds rotate the user_id rarely changes,
21
21
  # and the next AWS call (DDB/SQS) will surface a credential error if it does.
22
- _AUTH_CACHE_TTL_SECONDS = 24 * 60 * 60
22
+ _AUTH_CACHE_TTL_SECONDS = 60 * 60
23
23
  _AUTH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/auth-cache.json"))
24
24
 
25
25
 
@@ -64,6 +64,22 @@ def _save_auth_cache(github_user: str, result: Dict[str, Any]) -> None:
64
64
  pass
65
65
 
66
66
 
67
+ def clear_auth_cache() -> None:
68
+ """Drop the cached auth entry for the current AWS profile. Call this after a credential
69
+ error to force the next authenticate_user() to re-hit STS."""
70
+ try:
71
+ if not _AUTH_CACHE_PATH.exists():
72
+ return
73
+ with open(_AUTH_CACHE_PATH) as f:
74
+ data = json.load(f)
75
+ if _auth_cache_key() in data:
76
+ del data[_auth_cache_key()]
77
+ with open(_AUTH_CACHE_PATH, "w") as f:
78
+ json.dump(data, f)
79
+ except Exception:
80
+ pass
81
+
82
+
67
83
  def _load_ssh_cache(github_user: str) -> Optional[Dict[str, Any]]:
68
84
  """Return cached validation if it's fresh and matches the configured github_user, else None."""
69
85
  try:
@@ -125,6 +141,7 @@ def authenticate_user(config: Config) -> Dict[str, Any]:
125
141
  _save_auth_cache(github_user, result)
126
142
  return result
127
143
  except Exception as e:
144
+ clear_auth_cache()
128
145
  raise RuntimeError(f"AWS authentication failed: {e}")
129
146
 
130
147
 
@@ -1542,13 +1542,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1542
1542
  if "@" in user_id:
1543
1543
  user_display = user_id.split("@")[0]
1544
1544
 
1545
- # Format GPU information
1545
+ # Format GPU information (MIG-friendly via _format_gpu_display)
1546
1546
  if gpu_type and gpu_type not in ["unknown", "Unknown"]:
1547
- # For CPU nodes (gpu_count = 0), show just the type
1548
1547
  if gpu_count == 0:
1549
1548
  gpu_display = gpu_type
1550
1549
  else:
1551
- gpu_display = f"{gpu_count}x {gpu_type}"
1550
+ gpu_display = _format_gpu_display(gpu_count, gpu_type)
1552
1551
  else:
1553
1552
  gpu_display = str(gpu_count)
1554
1553
 
@@ -1844,7 +1843,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1844
1843
  if gpu_count == 0:
1845
1844
  gpu_display = gpu_type
1846
1845
  else:
1847
- gpu_display = f"{gpu_count}x {gpu_type}"
1846
+ gpu_display = _format_gpu_display(gpu_count, gpu_type)
1848
1847
  else:
1849
1848
  gpu_display = str(gpu_count)
1850
1849
 
@@ -2417,6 +2416,9 @@ def _format_gpu_display(gpu_count, gpu_type):
2417
2416
  "h100-mig-3g": "40GB H100 (MIG)",
2418
2417
  "h100-mig-4g": "40GB H100 (MIG)",
2419
2418
  "h100-mig-7g": "80GB H100 (MIG)",
2419
+ "b200-mig-1g": "23GB B200 (MIG)",
2420
+ "b200-mig-2g": "45GB B200 (MIG)",
2421
+ "b200-mig-3g": "90GB B200 (MIG)",
2420
2422
  }
2421
2423
  if gt_lower in mig_friendly:
2422
2424
  return f"{gpu_count}× {mig_friendly[gt_lower]}"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.15"
7
+ version = "0.5.17"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -308,30 +308,35 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
308
308
  f"Node {node.metadata.name} in {node_az}: {available_gpus} available GPUs")
309
309
 
310
310
  if candidate_nodes:
311
- # Return the AZ of the first suitable node (Kubernetes scheduler will make the final decision)
311
+ # Binpacking: pack into the most-loaded node that still fits the request.
312
+ # Sort by free GPUs ASC so the fullest node comes first; ties broken by node name
313
+ # so the choice is deterministic across Lambda invocations.
314
+ candidate_nodes.sort(key=lambda n: (n['available_gpus'], n['node_name']))
312
315
  selected_node = candidate_nodes[0]
313
316
  target_az = selected_node['az']
317
+ target_node = selected_node['node_name']
314
318
  logger.info(
315
- f"Target AZ for {gpu_type} reservation: {target_az} (node: {selected_node['node_name']})")
316
- return target_az
319
+ f"Binpacked target for {gpu_type} {gpus_requested}gpu: "
320
+ f"node={target_node} az={target_az} free={selected_node['available_gpus']} "
321
+ f"(candidates considered: {len(candidate_nodes)})")
322
+ return target_az, target_node
317
323
 
318
324
  if all_ready_nodes:
319
- # No single node has enough GPUs, but nodes exist — return AZ of the node
320
- # with the most available GPUs so the disk is created in the right AZ
325
+ # No single node has enough GPUs — return AZ of the node with the most available GPUs
326
+ # so disk lands in the right AZ. No node hint (pod will Pending until something frees up).
321
327
  best_node = max(all_ready_nodes, key=lambda n: n['available_gpus'])
322
328
  target_az = best_node['az']
323
329
  logger.info(
324
330
  f"No single node has {gpus_requested} {gpu_type} GPUs, "
325
331
  f"but {len(all_ready_nodes)} nodes exist. Using AZ {target_az} "
326
332
  f"from node {best_node['node_name']} ({best_node['available_gpus']} GPUs available)")
327
- return target_az
333
+ return target_az, None
328
334
 
329
335
  logger.warning(f"No ready/schedulable {gpu_type} nodes found in cluster")
330
336
  return None, None
331
337
 
332
338
  except Exception as e:
333
339
  logger.error(f"Error determining target AZ for {gpu_type}: {str(e)}")
334
- # Fallback to primary AZ if detection fails (no node hint — let k8s pick).
335
340
  return PRIMARY_AVAILABILITY_ZONE, None
336
341
 
337
342
 
@@ -2722,6 +2727,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2722
2727
  persistent_volume_id = None
2723
2728
  device_name = None
2724
2729
  target_az = None # Initialize target_az for use in connection info update
2730
+ target_node = None # Initialize target_node (binpacking hostname pin) for create_pod
2725
2731
  is_new_disk = False # Initialize is_new_disk for all code paths
2726
2732
 
2727
2733
  # If we're using persistent disk, immediately mark this reservation as having a volume
@@ -2749,8 +2755,8 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2749
2755
  detailed_status="Setting up persistent disk" + (f" '{disk_name}'" if disk_name else "")
2750
2756
  )
2751
2757
 
2752
- # Determine target AZ for this reservation
2753
- target_az = get_target_az_for_reservation(gpu_type, gpu_count)
2758
+ # Determine target AZ + node for this reservation (binpacking)
2759
+ target_az, target_node = get_target_az_for_reservation(gpu_type, gpu_count)
2754
2760
  if not target_az:
2755
2761
  raise ValueError(f"No {gpu_type} nodes found in cluster")
2756
2762
 
@@ -2881,6 +2887,7 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
2881
2887
  dockerfile_base64_data=dockerfile_base64_data,
2882
2888
  dockerimage=dockerimage,
2883
2889
  target_az=target_az,
2890
+ target_node=target_node,
2884
2891
  preserve_entrypoint=preserve_entrypoint,
2885
2892
  node_labels=node_labels,
2886
2893
  trace_data=trace_data,
@@ -3421,6 +3428,7 @@ def create_kubernetes_resources(
3421
3428
  recreate_env: bool = False,
3422
3429
  efs_filesystem_id: str = None,
3423
3430
  is_multinode: bool = False,
3431
+ target_node: str = None,
3424
3432
  dockerfile_base64_data: str = None,
3425
3433
  dockerimage: str = None,
3426
3434
  target_az: str = None,
@@ -3524,6 +3532,7 @@ def create_kubernetes_resources(
3524
3532
  dockerfile_base64_data=dockerfile_base64_data,
3525
3533
  dockerimage=dockerimage,
3526
3534
  target_az=target_az,
3535
+ target_node=target_node,
3527
3536
  preserve_entrypoint=preserve_entrypoint,
3528
3537
  node_labels=node_labels,
3529
3538
  trace_data=trace_data,
@@ -3610,6 +3619,7 @@ def create_kubernetes_resources(
3610
3619
  dockerfile_base64_data=dockerfile_base64_data,
3611
3620
  dockerimage=dockerimage,
3612
3621
  target_az=target_az,
3622
+ target_node=target_node,
3613
3623
  preserve_entrypoint=preserve_entrypoint,
3614
3624
  node_labels=node_labels,
3615
3625
  trace_data=trace_data,
@@ -3902,6 +3912,7 @@ def create_pod(
3902
3912
  dockerfile_base64_data: str = None,
3903
3913
  dockerimage: str = None,
3904
3914
  target_az: str = None,
3915
+ target_node: str = None,
3905
3916
  preserve_entrypoint: bool = False,
3906
3917
  node_labels: dict = None,
3907
3918
  trace_data: dict = None,
@@ -5309,7 +5320,12 @@ EOF
5309
5320
  ] if _pod_uses_efa(gpu_count, gpu_type, is_multinode) else []),
5310
5321
  node_selector={
5311
5322
  "GpuType": get_node_gpu_type(gpu_type),
5312
- **({} if target_az is None else {"topology.kubernetes.io/zone": target_az})
5323
+ **({} if target_az is None else {"topology.kubernetes.io/zone": target_az}),
5324
+ # Hard-pin to the binpacked node when Lambda picked one. Lambda runs
5325
+ # serialized (reserved_concurrent_executions=1), so allocations seen by the
5326
+ # next invocation include this pod. If the node is unavailable, the pod
5327
+ # stays Pending and surfaces the error rather than spreading.
5328
+ **({} if target_node is None else {"kubernetes.io/hostname": target_node}),
5313
5329
  },
5314
5330
  # Node affinity for profiling-dedicated preference
5315
5331
  # If user requests nsight=true, prefer profiling-dedicated nodes
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.13"
184
- MIN_CLI_VERSION = "0.5.9"
183
+ LAMBDA_VERSION = "0.5.17"
184
+ MIN_CLI_VERSION = "0.5.16"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
187
187
  }, local.alb_env_vars)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes