gpu-dev 0.5.4__tar.gz → 0.5.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +32 -2
  4. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/pyproject.toml +1 -1
  5. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/availability.tf +12 -0
  6. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/availability_updater/index.py +204 -4
  7. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda.tf +2 -2
  8. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/.github/workflows/no-gitlinks.yml +0 -0
  9. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/.github/workflows/publish.yml +0 -0
  10. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/.gitignore +0 -0
  11. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/CLAUDE.md +0 -0
  12. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/PROGRESS.md +0 -0
  13. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/PR_DESCRIPTION.md +0 -0
  14. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/TODO.md +0 -0
  15. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/admin/README.md +0 -0
  16. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/admin/generate_stats.py +0 -0
  17. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/admin/requirements.txt +0 -0
  18. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/README.md +0 -0
  19. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  20. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  21. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  22. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  23. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  24. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  25. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  26. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  27. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
  28. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  29. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  30. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  31. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  32. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  33. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  34. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  35. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/docs/USER_GUIDE.md +0 -0
  36. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/docs/devgpu-features.html +0 -0
  37. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/docs/docker-mark-blue.svg +0 -0
  38. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/docs/icons8-cursor-ai.svg +0 -0
  39. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/post.md +0 -0
  40. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/setup.cfg +0 -0
  41. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  42. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  43. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/README.md +0 -0
  44. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/alb.tf +0 -0
  45. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/backend.tf +0 -0
  46. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  47. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  48. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  49. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/bash_profile +0 -0
  50. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/bashrc +0 -0
  51. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  52. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  53. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  54. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  55. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/motd_script +0 -0
  56. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  57. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/profile +0 -0
  58. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  59. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  60. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  61. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/shell_env +0 -0
  62. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/ssh_config +0 -0
  63. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/zprofile +0 -0
  64. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/zshrc +0 -0
  65. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  66. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker-build.tf +0 -0
  67. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  68. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  69. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/ecr.tf +0 -0
  70. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/efs.tf +0 -0
  71. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/eks.tf +0 -0
  72. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/expiry.tf +0 -0
  73. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/git-cache.tf +0 -0
  74. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/kubernetes.tf +0 -0
  75. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  76. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  77. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  78. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  79. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  80. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  81. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  82. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  83. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  84. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  85. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  86. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  87. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  88. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  89. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/main.tf +0 -0
  90. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  91. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  92. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  93. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  94. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  95. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/monitoring.tf +0 -0
  96. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/outputs.tf +0 -0
  97. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/pyproject.toml +0 -0
  98. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/queue.tf +0 -0
  99. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/route53.tf +0 -0
  100. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  101. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  102. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  103. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  104. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  105. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  106. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  107. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  108. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  109. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  110. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/switch-to.sh +0 -0
  111. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  112. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  113. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  114. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  115. {gpu_dev-0.5.4 → gpu_dev-0.5.6}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.4
3
+ Version: 0.5.6
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.4
3
+ Version: 0.5.6
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -150,6 +150,17 @@ def select_gpu_type_interactive(
150
150
  return None
151
151
 
152
152
 
153
+ def _format_eta_seconds(delta_seconds: int) -> str:
154
+ """Format a positive seconds delta as e.g. '12min', '1h24min', '<1min'."""
155
+ if delta_seconds < 60:
156
+ return "<1min"
157
+ if delta_seconds < 3600:
158
+ return f"{delta_seconds // 60}min"
159
+ h = delta_seconds // 3600
160
+ m = (delta_seconds % 3600) // 60
161
+ return f"{h}h" if m == 0 else f"{h}h{m}min"
162
+
163
+
153
164
  def select_gpu_count_interactive(
154
165
  gpu_type: str,
155
166
  max_gpus: int,
@@ -190,10 +201,13 @@ def select_gpu_count_interactive(
190
201
  multinode_counts = [16, 24, 32, 40, 48] # multiples of 8
191
202
 
192
203
  # Pull live availability for the parent SKU once — used to annotate every option.
204
+ import time as _time
193
205
  parent_info = (availability_info or {}).get(gpu_type, {}) if availability_info else {}
194
206
  parent_max_reservable = int(parent_info.get("max_reservable", 0))
195
207
  parent_full_nodes = int(parent_info.get("full_nodes_available", 0))
196
208
  parent_available = int(parent_info.get("available", 0))
209
+ parent_size_etas = parent_info.get("size_etas", {}) or {}
210
+ _now_ts = int(_time.time())
197
211
 
198
212
  # MIG slice submenu: only for h100. Each tuple is (target_gpu_type, gpu_count, gb_label).
199
213
  mig_options = []
@@ -247,7 +261,15 @@ def select_gpu_count_interactive(
247
261
  if parent_max_reservable >= count:
248
262
  label += f" [{parent_available} free]"
249
263
  else:
250
- label += " [unavailable now]"
264
+ eta_ts = parent_size_etas.get(str(count))
265
+ try:
266
+ eta_int = int(eta_ts) if eta_ts is not None else None
267
+ except (TypeError, ValueError):
268
+ eta_int = None
269
+ if eta_int is not None and eta_int > _now_ts:
270
+ label += f" [available in {_format_eta_seconds(eta_int - _now_ts)}]"
271
+ else:
272
+ label += " [unavailable now]"
251
273
  choices.append(questionary.Choice(title=label, value=count))
252
274
 
253
275
  # Multinode at the bottom.
@@ -261,7 +283,15 @@ def select_gpu_count_interactive(
261
283
  if parent_max_reservable >= count:
262
284
  label += f" [{parent_full_nodes} full nodes free]"
263
285
  else:
264
- label += " [unavailable now]"
286
+ eta_ts = parent_size_etas.get(str(count))
287
+ try:
288
+ eta_int = int(eta_ts) if eta_ts is not None else None
289
+ except (TypeError, ValueError):
290
+ eta_int = None
291
+ if eta_int is not None and eta_int > _now_ts:
292
+ label += f" [available in {_format_eta_seconds(eta_int - _now_ts)}]"
293
+ else:
294
+ label += " [unavailable now]"
265
295
  choices.append(questionary.Choice(title=label, value=count))
266
296
 
267
297
  try:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.4"
7
+ version = "0.5.6"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -32,6 +32,7 @@ resource "aws_lambda_function" "availability_updater" {
32
32
  environment {
33
33
  variables = {
34
34
  AVAILABILITY_TABLE = aws_dynamodb_table.gpu_availability.name
35
+ RESERVATIONS_TABLE = aws_dynamodb_table.gpu_reservations.name
35
36
  # Filter out nsight variants - they're counted under base types (h200/b200) via GpuType label mapping
36
37
  SUPPORTED_GPU_TYPES = jsonencode({
37
38
  for k, v in local.current_config.supported_gpu_types : k => v
@@ -103,6 +104,17 @@ resource "aws_iam_role_policy" "availability_updater_policy" {
103
104
  ]
104
105
  Resource = aws_dynamodb_table.gpu_availability.arn
105
106
  },
107
+ {
108
+ Effect = "Allow"
109
+ Action = [
110
+ "dynamodb:Scan",
111
+ "dynamodb:Query"
112
+ ]
113
+ Resource = [
114
+ aws_dynamodb_table.gpu_reservations.arn,
115
+ "${aws_dynamodb_table.gpu_reservations.arn}/index/*"
116
+ ]
117
+ },
106
118
  {
107
119
  Effect = "Allow"
108
120
  Action = [
@@ -20,6 +20,7 @@ autoscaling = boto3.client("autoscaling")
20
20
 
21
21
  # Environment variables
22
22
  AVAILABILITY_TABLE = os.environ["AVAILABILITY_TABLE"]
23
+ RESERVATIONS_TABLE = os.environ.get("RESERVATIONS_TABLE", "pytorch-gpu-dev-reservations")
23
24
  SUPPORTED_GPU_TYPES = json.loads(os.environ["SUPPORTED_GPU_TYPES"])
24
25
 
25
26
 
@@ -55,12 +56,20 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
55
56
  logger.error(f"Failed to setup Kubernetes client: {k8s_setup_error}")
56
57
  k8s_client = None
57
58
 
59
+ # Cache active reservations once for the whole invocation (used for per-size ETAs)
60
+ try:
61
+ active_reservations = scan_active_reservations()
62
+ logger.info(f"Cached {len(active_reservations)} active reservations for ETA computation")
63
+ except Exception as scan_err:
64
+ logger.warning(f"Failed to scan reservations table for ETAs: {scan_err}")
65
+ active_reservations = []
66
+
58
67
  # Update availability for ALL GPU types (use any ASG event as trigger to refresh all)
59
68
  updated_types = []
60
69
  for gpu_type in SUPPORTED_GPU_TYPES.keys():
61
70
  try:
62
71
  logger.info(f"=== Starting update for GPU type: {gpu_type} ===")
63
- update_gpu_availability(gpu_type, k8s_client)
72
+ update_gpu_availability(gpu_type, k8s_client, active_reservations=active_reservations)
64
73
  updated_types.append(gpu_type)
65
74
  logger.info(f"=== Successfully updated availability for GPU type: {gpu_type} ===")
66
75
  except Exception as gpu_error:
@@ -85,8 +94,8 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
85
94
  raise
86
95
 
87
96
 
88
- def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
89
- """Update availability information for a specific GPU type"""
97
+ def update_gpu_availability(gpu_type: str, k8s_client=None, active_reservations=None) -> None:
98
+ """Update availability information for a specific GPU type."""
90
99
  try:
91
100
  logger.info(f"Starting availability update for GPU type: {gpu_type}")
92
101
 
@@ -246,6 +255,25 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
246
255
  full_nodes_available = available_gpus # Each "GPU" represents one CPU node slot
247
256
  max_reservable = 1 if available_gpus > 0 else 0 # Max 1 CPU node per reservation
248
257
 
258
+ # Compute per-size ETAs (when each interesting reservation size first becomes reservable).
259
+ size_etas: Dict[str, int] = {}
260
+ if k8s_client is not None and not is_cpu_type and active_reservations is not None:
261
+ try:
262
+ from kubernetes import client as k8s_lib
263
+ v1 = k8s_lib.CoreV1Api(k8s_client)
264
+ size_etas = compute_size_etas(
265
+ v1=v1,
266
+ gpu_type=gpu_type,
267
+ node_label_value=get_node_label_value(gpu_type),
268
+ resource_name=get_gpu_resource_name(gpu_type),
269
+ gpus_per_instance=int(gpus_per_instance),
270
+ active_reservations=active_reservations,
271
+ )
272
+ logger.info(f"Computed size_etas for {gpu_type}: {size_etas}")
273
+ except Exception as eta_err:
274
+ logger.warning(f"Failed to compute size_etas for {gpu_type}: {eta_err}")
275
+ size_etas = {}
276
+
249
277
  # Update DynamoDB table (update_item preserves maintenance fields set manually)
250
278
  table = dynamodb.Table(AVAILABILITY_TABLE)
251
279
  last_updated = context.aws_request_id if "context" in locals() else "unknown"
@@ -256,7 +284,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
256
284
  UpdateExpression=(
257
285
  "SET total_gpus = :tg, available_gpus = :ag, max_reservable = :mr, "
258
286
  "full_nodes_available = :fn, running_instances = :ri, desired_capacity = :dc, "
259
- "gpus_per_instance = :gpi, last_updated = :lu, last_updated_timestamp = :lut"
287
+ "gpus_per_instance = :gpi, last_updated = :lu, last_updated_timestamp = :lut, "
288
+ "size_etas = :se"
260
289
  ),
261
290
  ExpressionAttributeValues={
262
291
  ":tg": total_gpus,
@@ -268,6 +297,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
268
297
  ":gpi": gpus_per_instance,
269
298
  ":lu": last_updated,
270
299
  ":lut": last_updated_ts,
300
+ ":se": size_etas,
271
301
  },
272
302
  )
273
303
 
@@ -394,3 +424,173 @@ def get_available_gpus_on_node(v1_api, node, gpu_type: str = None) -> int:
394
424
  f"Error getting available GPUs on node {node.metadata.name}: {str(e)}"
395
425
  )
396
426
  return 0
427
+
428
+ def scan_active_reservations():
429
+ """Return list of active reservation rows from the reservations DDB table.
430
+
431
+ Each row is the raw DDB resource-style dict (keys + native types). Caller is
432
+ responsible for tolerating Decimals and missing fields.
433
+ """
434
+ table = dynamodb.Table(RESERVATIONS_TABLE)
435
+ items = []
436
+ last_key = None
437
+ while True:
438
+ kwargs = {
439
+ "FilterExpression": "#s = :s",
440
+ "ExpressionAttributeNames": {"#s": "status"},
441
+ "ExpressionAttributeValues": {":s": "active"},
442
+ }
443
+ if last_key:
444
+ kwargs["ExclusiveStartKey"] = last_key
445
+ resp = table.scan(**kwargs)
446
+ items.extend(resp.get("Items", []))
447
+ last_key = resp.get("LastEvaluatedKey")
448
+ if not last_key:
449
+ break
450
+ return items
451
+
452
+
453
+ # Multinode-eligible types (mirrors the older multinode_gpu_types list elsewhere in this file).
454
+ _MULTINODE_TYPES = {"h100", "h200", "b200", "a100"}
455
+
456
+
457
+ def compute_size_etas(v1, gpu_type, node_label_value, resource_name, gpus_per_instance, active_reservations):
458
+ """For each interesting reservation size, compute when it first becomes reservable.
459
+
460
+ Returns a dict mapping the size (as a string) to a unix timestamp (int).
461
+ A timestamp <= now means the size is currently available; sizes that won't
462
+ fit in any foreseeable future (e.g. cluster too small) are omitted.
463
+ """
464
+ import time as _time
465
+ now = int(_time.time())
466
+
467
+ # 1) Get nodes and per-node capacity for this resource.
468
+ try:
469
+ nodes = v1.list_node(label_selector=f"GpuType={node_label_value}")
470
+ except Exception as e:
471
+ logger.warning(f"compute_size_etas: list_node failed: {e}")
472
+ return {}
473
+
474
+ node_state = {} # node_name -> {capacity, used_now, expirations: [(ts, gpus)]}
475
+ for node in nodes.items:
476
+ if not is_node_ready_and_schedulable(node):
477
+ continue
478
+ capacity = 0
479
+ try:
480
+ capacity = int((node.status.allocatable or {}).get(resource_name, "0"))
481
+ except (ValueError, TypeError):
482
+ capacity = 0
483
+ if capacity == 0:
484
+ continue
485
+ node_state[node.metadata.name] = {
486
+ "capacity": capacity,
487
+ "used_now": 0,
488
+ "expirations": [],
489
+ }
490
+
491
+ if not node_state:
492
+ return {}
493
+
494
+ # 2) Map pods on these nodes to their gpu request and node.
495
+ pod_to_info = {} # pod_name -> (node_name, gpus_requested)
496
+ try:
497
+ pods = v1.list_namespaced_pod("gpu-dev")
498
+ except Exception as e:
499
+ logger.warning(f"compute_size_etas: list_pod failed: {e}")
500
+ return {}
501
+ for pod in pods.items:
502
+ if not pod.spec or not pod.spec.node_name:
503
+ continue
504
+ if pod.spec.node_name not in node_state:
505
+ continue
506
+ if pod.status and pod.status.phase not in ("Running", "Pending"):
507
+ continue
508
+ gpus = 0
509
+ if pod.spec.containers:
510
+ for c in pod.spec.containers:
511
+ if c.resources and c.resources.requests:
512
+ try:
513
+ gpus += int(c.resources.requests.get(resource_name, "0"))
514
+ except (ValueError, TypeError):
515
+ pass
516
+ if gpus > 0:
517
+ pod_to_info[pod.metadata.name] = (pod.spec.node_name, gpus)
518
+ # used_now is the k8s ground-truth — count every running/pending pod, not just those
519
+ # we can match to a reservation row. Otherwise pods without DDB rows look like free GPUs.
520
+ node_state[pod.spec.node_name]["used_now"] += gpus
521
+
522
+ # 3) Cross-reference active reservations to attach expiry timestamps to each known pod.
523
+ # Pods without a matching reservation row keep their GPUs marked as used_now but have no
524
+ # expiration → they're treated as "never expiring" by the simulation, which is the safe
525
+ # fallback (we don't fabricate ETAs for usage we can't trace).
526
+ target_gpu_type_lower = gpu_type.lower()
527
+ for r in active_reservations:
528
+ # Reservations table stores gpu_type uppercased ("H100"); compare case-insensitively.
529
+ rgt = r.get("gpu_type", "")
530
+ if isinstance(rgt, str) and rgt.lower() != target_gpu_type_lower:
531
+ continue
532
+ pod_name = r.get("pod_name")
533
+ expires_at = r.get("expires_at")
534
+ if not pod_name or expires_at is None:
535
+ continue
536
+ if pod_name not in pod_to_info:
537
+ continue
538
+ try:
539
+ ts = int(float(expires_at))
540
+ except (ValueError, TypeError):
541
+ continue
542
+ node_name, gpus = pod_to_info[pod_name]
543
+ node_state[node_name]["expirations"].append((ts, gpus))
544
+
545
+ # Sort each node's expirations by time.
546
+ for ns in node_state.values():
547
+ ns["expirations"].sort()
548
+
549
+ def first_time_size_fits_single_node(size):
550
+ """Earliest timestamp at which any single node has `size` GPUs free."""
551
+ earliest = None
552
+ for ns in node_state.values():
553
+ free_now = ns["capacity"] - ns["used_now"]
554
+ if free_now >= size:
555
+ return now
556
+ cum = free_now
557
+ for ts, gpus in ns["expirations"]:
558
+ cum += gpus
559
+ if cum >= size:
560
+ if earliest is None or ts < earliest:
561
+ earliest = ts
562
+ break
563
+ return earliest
564
+
565
+ def first_time_k_full_nodes(k):
566
+ """Earliest timestamp at which K nodes are simultaneously fully free."""
567
+ free_at = []
568
+ for ns in node_state.values():
569
+ if ns["used_now"] == 0:
570
+ free_at.append(now)
571
+ elif ns["expirations"]:
572
+ free_at.append(max(ts for ts, _ in ns["expirations"]))
573
+ free_at.sort()
574
+ if len(free_at) >= k:
575
+ return free_at[k - 1]
576
+ return None
577
+
578
+ etas = {}
579
+ # Single-node sizes 1, 2, 4, 8 (capped at the per-instance maximum).
580
+ for size in (1, 2, 4, 8):
581
+ if size > gpus_per_instance:
582
+ break
583
+ eta = first_time_size_fits_single_node(size)
584
+ if eta is not None:
585
+ etas[str(size)] = eta
586
+
587
+ # Multinode sizes — only for SXM types with 8 GPUs per node.
588
+ if gpus_per_instance == 8 and target_gpu_type_lower in _MULTINODE_TYPES:
589
+ for k_nodes in (2, 3, 4, 5, 6):
590
+ count = k_nodes * gpus_per_instance
591
+ eta = first_time_k_full_nodes(k_nodes)
592
+ if eta is not None:
593
+ etas[str(count)] = eta
594
+
595
+ return etas
596
+
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.4"
184
- MIN_CLI_VERSION = "0.5.2"
183
+ LAMBDA_VERSION = "0.5.6"
184
+ MIN_CLI_VERSION = "0.5.5"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
187
187
  }, local.alb_env_vars)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes