gpu-dev 0.5.7__tar.gz → 0.5.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +53 -2
  4. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +45 -19
  5. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +6 -0
  6. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/pyproject.toml +1 -1
  7. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/availability.tf +9 -2
  8. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/availability_updater/index.py +41 -3
  9. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda.tf +1 -1
  10. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/.github/workflows/no-gitlinks.yml +0 -0
  11. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/.github/workflows/publish.yml +0 -0
  12. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/.gitignore +0 -0
  13. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/CLAUDE.md +0 -0
  14. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/PROGRESS.md +0 -0
  15. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/PR_DESCRIPTION.md +0 -0
  16. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/TODO.md +0 -0
  17. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/admin/README.md +0 -0
  18. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/admin/generate_stats.py +0 -0
  19. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/admin/requirements.txt +0 -0
  20. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/README.md +0 -0
  21. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  22. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  23. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  24. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  25. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  26. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  27. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  28. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  29. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  30. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  31. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  32. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  33. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  34. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  35. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/docs/USER_GUIDE.md +0 -0
  36. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/docs/devgpu-features.html +0 -0
  37. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/docs/docker-mark-blue.svg +0 -0
  38. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/docs/icons8-cursor-ai.svg +0 -0
  39. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/post.md +0 -0
  40. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/setup.cfg +0 -0
  41. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  42. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  43. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/README.md +0 -0
  44. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/alb.tf +0 -0
  45. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/backend.tf +0 -0
  46. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  47. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  48. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  49. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/bash_profile +0 -0
  50. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/bashrc +0 -0
  51. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  52. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  53. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  54. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  55. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/motd_script +0 -0
  56. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  57. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/profile +0 -0
  58. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  59. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  60. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  61. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/shell_env +0 -0
  62. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/ssh_config +0 -0
  63. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/zprofile +0 -0
  64. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/zshrc +0 -0
  65. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  66. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker-build.tf +0 -0
  67. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  68. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  69. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ecr.tf +0 -0
  70. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/efs.tf +0 -0
  71. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/eks.tf +0 -0
  72. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/expiry.tf +0 -0
  73. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/git-cache.tf +0 -0
  74. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/kubernetes.tf +0 -0
  75. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  76. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  77. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  78. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  79. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  80. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  81. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  82. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  83. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  84. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  85. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  86. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  87. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  88. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  89. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/main.tf +0 -0
  90. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  91. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  92. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  93. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  94. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  95. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/monitoring.tf +0 -0
  96. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/outputs.tf +0 -0
  97. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/pyproject.toml +0 -0
  98. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/queue.tf +0 -0
  99. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/route53.tf +0 -0
  100. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  101. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  102. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  103. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  104. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  105. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  106. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  107. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  108. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  109. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  110. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/switch-to.sh +0 -0
  111. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  112. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  113. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  114. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  115. {gpu_dev-0.5.7 → gpu_dev-0.5.9}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.7
3
+ Version: 0.5.9
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.7
3
+ Version: 0.5.9
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,11 +1,53 @@
1
1
  """Minimal AWS-only authentication for GPU Dev CLI"""
2
2
 
3
+ import json
4
+ import os
3
5
  import subprocess
4
6
  import re
5
- from typing import Dict, Any
7
+ import time
8
+ from pathlib import Path
9
+ from typing import Dict, Any, Optional
6
10
  from .config import Config
7
11
  from rich.spinner import Spinner
8
12
 
13
+ # SSH validation result is cached locally for 24h. New keys pushed to GitHub still take effect
14
+ # at reservation time (pods fetch live keys via init container) — caching only skips the
15
+ # pre-flight "are you who you say you are" check.
16
+ _SSH_CACHE_TTL_SECONDS = 24 * 60 * 60
17
+ _SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cache.json"))
18
+
19
+
20
def _load_ssh_cache(github_user: str) -> Optional[Dict[str, Any]]:
    """Return the cached SSH validation result for ``github_user``, or None.

    The cache is best-effort: any problem reading or interpreting the file is
    treated as a cache miss, never as an error.

    Args:
        github_user: The GitHub username currently configured for the CLI.

    Returns:
        The cached result dict when the cache file exists, was written for
        the same ``github_user``, is no older than ``_SSH_CACHE_TTL_SECONDS``,
        and holds a successful (``valid``) result. Otherwise ``None``.
    """
    try:
        with open(_SSH_CACHE_PATH, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file, or malformed JSON / bad encoding.
        return None

    if not isinstance(data, dict):
        return None
    if data.get("configured_user") != github_user:
        return None

    try:
        age = time.time() - float(data.get("ts", 0))
    except (TypeError, ValueError, OverflowError):
        return None
    # A negative age means the timestamp lies in the future (clock skew or a
    # corrupted file); treating it as fresh would keep the entry alive forever.
    if age < 0 or age > _SSH_CACHE_TTL_SECONDS:
        return None

    result = data.get("result")
    # Callers use the return value as a dict (`.get("valid")`), and only
    # successful validations are ever written, so anything else is a miss.
    if not isinstance(result, dict) or not result.get("valid"):
        return None
    return result
34
+
35
+
36
def _save_ssh_cache(github_user: str, result: Dict[str, Any]) -> None:
    """Persist a successful validation result; failures are never cached.

    Writing is best-effort: serialization or filesystem errors are swallowed
    so that a cache problem can never break the command that triggered it.
    The payload is serialized before any file is touched and then moved into
    place with an atomic rename, so a failed or interrupted write leaves the
    previous cache file intact instead of a truncated one.

    Args:
        github_user: The GitHub username the result was validated against.
        result: The validation result dict; only stored when ``result["valid"]``
            is truthy, so that failed validations can recover on the next run.
    """
    if not result.get("valid"):
        return

    # Per-process temp name so concurrent CLI invocations do not clobber
    # each other's half-written files.
    tmp_path = _SSH_CACHE_PATH.with_name(
        f"{_SSH_CACHE_PATH.name}.{os.getpid()}.tmp"
    )
    try:
        payload = json.dumps({
            "configured_user": github_user,
            "ts": int(time.time()),
            "result": result,
        })
        _SSH_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
        tmp_path.write_text(payload, encoding="utf-8")
        os.replace(tmp_path, _SSH_CACHE_PATH)
    except (OSError, TypeError, ValueError):
        # Best-effort cache: drop any leftover temp file and carry on.
        try:
            tmp_path.unlink()
        except OSError:
            pass
50
+
9
51
 
10
52
  def authenticate_user(config: Config) -> Dict[str, Any]:
11
53
  """Authenticate using AWS credentials - if you can call AWS, you're authorized"""
@@ -59,6 +101,13 @@ def validate_ssh_key_matches_github_user(config: Config, live=None) -> Dict[str,
59
101
  "error": "GitHub username not configured. Run: gpu-dev config set github_user <username>",
60
102
  }
61
103
 
104
+ # Cache short-circuit — skip the SSH handshake (~1-3s) if we recently validated this user.
105
+ # Cache TTL is 24h. New keys pushed to GitHub still take effect at reservation time
106
+ # (pods fetch live keys via init container), so caching the pre-flight check is safe.
107
+ cached = _load_ssh_cache(github_user)
108
+ if cached is not None:
109
+ return cached
110
+
62
111
  # Run ssh git@github.com with interactive host verification support
63
112
  ssh_output = None
64
113
 
@@ -139,7 +188,7 @@ def validate_ssh_key_matches_github_user(config: Config, live=None) -> Dict[str,
139
188
  # Compare usernames (case-insensitive)
140
189
  is_valid = ssh_detected_user.lower() == github_user.lower()
141
190
 
142
- return {
191
+ result = {
143
192
  "valid": is_valid,
144
193
  "configured_user": github_user,
145
194
  "ssh_user": ssh_detected_user,
@@ -147,6 +196,8 @@ def validate_ssh_key_matches_github_user(config: Config, live=None) -> Dict[str,
147
196
  if is_valid
148
197
  else f"SSH key belongs to '{ssh_detected_user}' but configured user is '{github_user}'",
149
198
  }
199
+ _save_ssh_cache(github_user, result)
200
+ return result
150
201
 
151
202
  except Exception as e:
152
203
  return {
@@ -681,29 +681,55 @@ def reserve(
681
681
  rprint(
682
682
  "[dim]Use --no-interactive flag to disable interactive mode[/dim]\n")
683
683
 
684
- # Setup config early for availability check
684
+ # Run auth + SSH validation + availability fetch in parallel — they're independent
685
+ # and total wall-clock time drops from sum to max(each).
686
+ from concurrent.futures import ThreadPoolExecutor
687
+ config = load_config()
688
+
685
689
  with Live(
686
- Spinner("dots", text="📡 Loading GPU availability..."), console=console
690
+ Spinner("dots", text="🚀 Loading"), console=console
687
691
  ) as live:
688
- config = load_config()
689
- try:
690
- user_info = authenticate_user(config)
691
- except RuntimeError as e:
692
- live.stop()
693
- rprint(f"[red]❌ {str(e)}[/red]")
694
- return
695
-
696
- # Validate SSH key matches configured GitHub username
697
- live.update(Spinner("dots", text="🔐 Validating SSH key..."))
698
- if not _validate_ssh_key_or_exit(config, live):
699
- return
692
+ with ThreadPoolExecutor(max_workers=3) as ex:
693
+ f_auth = ex.submit(authenticate_user, config)
694
+ # SSH validation may invoke `ssh git@github.com` interactively for password-protected keys;
695
+ # do it on the main thread when the cache is cold so prompts work. Probe cache first.
696
+ from .auth import _load_ssh_cache, validate_ssh_key_matches_github_user
697
+ cached_ssh = _load_ssh_cache(config.get_github_username() or "")
698
+ if cached_ssh is not None:
699
+ f_ssh = None
700
+ ssh_result = cached_ssh
701
+ else:
702
+ f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
703
+ ssh_result = None
704
+ f_avail = ex.submit(
705
+ lambda: ReservationManager(config).get_gpu_availability_by_type()
706
+ )
700
707
 
701
- live.update(
702
- Spinner("dots", text="📡 Loading GPU availability..."))
703
- reservation_mgr = ReservationManager(config)
704
- availability_info = reservation_mgr.get_gpu_availability_by_type()
708
+ # Surface auth failure first (most actionable).
709
+ try:
710
+ user_info = f_auth.result()
711
+ except RuntimeError as e:
712
+ live.stop()
713
+ rprint(f"[red]❌ {str(e)}[/red]")
714
+ return
705
715
 
706
- live.stop()
716
+ if ssh_result is None:
717
+ ssh_result = f_ssh.result()
718
+ availability_info = f_avail.result()
719
+
720
+ # Surface SSH validation failure with the same UX as before.
721
+ if not ssh_result.get("valid"):
722
+ rprint("[red]❌ Github SSH key validation failed[/red]")
723
+ if ssh_result.get("ssh_user") and ssh_result.get("configured_user"):
724
+ rprint("\n[yellow]💡 Fix by updating your config:[/yellow]")
725
+ rprint(f" [cyan]gpu-dev config set github_user {ssh_result['ssh_user']}[/cyan]")
726
+ elif not ssh_result.get("configured_user"):
727
+ rprint("\n[yellow]💡 Fix by configuring your GitHub username:[/yellow]")
728
+ rprint(" [cyan]gpu-dev config set github_user <your-github-username>[/cyan]")
729
+ else:
730
+ rprint("\n[yellow]💡 gpu-dev utilizes Github keys for auth![/yellow]")
731
+ rprint("[yellow]💡 Check https://fburl.com/gh-ssh for info on how to add your ssh key to Github[/yellow]")
732
+ return
707
733
 
708
734
  if not availability_info:
709
735
  rprint("[red]❌ Could not get GPU availability information[/red]")
@@ -1032,6 +1032,11 @@ class ReservationManager:
1032
1032
  queue_length = self._get_queue_length_for_gpu_type(gpu_type)
1033
1033
  estimated_wait = queue_length * 15 if queue_length > 0 else 0
1034
1034
 
1035
+ # size_etas is a DDB Map of {size_str: epoch_seconds (Decimal)} — pass through
1036
+ # so the interactive count menu can render "[available in 1h24m]" labels.
1037
+ raw_etas = item.get("size_etas", {}) or {}
1038
+ size_etas = {str(k): int(v) for k, v in raw_etas.items()} if raw_etas else {}
1039
+
1035
1040
  availability_info[gpu_type] = {
1036
1041
  "available": int(item.get("available_gpus", 0)),
1037
1042
  "total": int(item.get("total_gpus", 0)),
@@ -1045,6 +1050,7 @@ class ReservationManager:
1045
1050
  "last_updated": item.get("last_updated_timestamp", 0),
1046
1051
  "maintenance": bool(item.get("maintenance", False)),
1047
1052
  "maintenance_reason": item.get("maintenance_reason", ""),
1053
+ "size_etas": size_etas,
1048
1054
  }
1049
1055
 
1050
1056
  return availability_info
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.7"
7
+ version = "0.5.9"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -26,8 +26,15 @@ resource "aws_lambda_function" "availability_updater" {
26
26
  role = aws_iam_role.availability_updater_role.arn
27
27
  handler = "index.handler"
28
28
  runtime = "python3.11"
29
- timeout = 300
30
- source_code_hash = null_resource.availability_updater_build.triggers.code_hash
29
+ timeout = 300
30
+ # 1769 MB is the sweet spot — Lambda allocates one full vCPU at this threshold.
31
+ # Beyond 1769 MB you get fractional second vCPUs (less linear gain), and our work is single-threaded.
32
+ memory_size = 1769
33
+ # Cap concurrent invocations at 1: each run does ~30 EKS API calls per gpu_type, and
34
+ # uncapped concurrency was hammering the cluster API into throttling, leaving later
35
+ # gpu_types in each run timing out and never producing size_etas.
36
+ reserved_concurrent_executions = 1
37
+ source_code_hash = null_resource.availability_updater_build.triggers.code_hash
31
38
 
32
39
  environment {
33
40
  variables = {
@@ -24,6 +24,45 @@ RESERVATIONS_TABLE = os.environ.get("RESERVATIONS_TABLE", "pytorch-gpu-dev-reser
24
24
  SUPPORTED_GPU_TYPES = json.loads(os.environ["SUPPORTED_GPU_TYPES"])
25
25
 
26
26
 
27
def _parse_expires_at(value):
    """Parse the reservations table's `expires_at` field to a unix epoch (int).

    DDB stores it as either an ISO-8601 datetime string
    ("2026-05-02T00:12:03.674845") OR occasionally a numeric epoch, so both
    are handled.

    Args:
        value: The raw `expires_at` attribute: a string (ISO-8601 or numeric),
            a number (Decimal/int/float), or None.

    Returns:
        Epoch seconds as an int (fractions truncated), or None when the value
        is missing, blank, non-finite, or otherwise unparseable.
    """
    from datetime import datetime, timezone

    if value is None:
        return None

    # Numeric (Decimal/int/float) -> epoch seconds directly.
    if not isinstance(value, str):
        try:
            return int(float(value))
        except (ValueError, TypeError, OverflowError):
            # OverflowError: int() of an infinite float (e.g. Decimal("Infinity")).
            return None

    text = value.strip()
    if not text:
        return None

    # ISO-8601 first (the actual production format). `fromisoformat` accepts
    # microseconds; a trailing 'Z' is rewritten for interpreters that reject it.
    iso_text = text[:-1] + "+00:00" if text.endswith("Z") else text
    try:
        dt = datetime.fromisoformat(iso_text)
        if dt.tzinfo is None:
            # Convention in this codebase: timestamps written via
            # datetime.utcnow().isoformat() are UTC.
            dt = dt.replace(tzinfo=timezone.utc)
        return int(dt.timestamp())
    except (ValueError, TypeError, OverflowError):
        pass

    # Numeric-as-string fallback ("inf"/"1e400" overflow, "nan" is a ValueError).
    try:
        return int(float(text))
    except (ValueError, TypeError, OverflowError):
        return None
64
+
65
+
27
66
  def get_gpu_resource_name(gpu_type: str) -> str:
28
67
  return SUPPORTED_GPU_TYPES.get(gpu_type, {}).get("k8s_resource", "nvidia.com/gpu")
29
68
 
@@ -542,9 +581,8 @@ def compute_size_etas(v1, gpu_type, node_label_value, resource_name, gpus_per_in
542
581
  continue
543
582
  if pod_name not in pod_to_info:
544
583
  continue
545
- try:
546
- ts = int(float(expires_at))
547
- except (ValueError, TypeError):
584
+ ts = _parse_expires_at(expires_at)
585
+ if ts is None:
548
586
  continue
549
587
  node_name, gpus = pod_to_info[pod_name]
550
588
  node_state[node_name]["expirations"].append((ts, gpus))
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.7"
183
+ LAMBDA_VERSION = "0.5.9"
184
184
  MIN_CLI_VERSION = "0.5.5"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes