gpu-dev 0.5.13__tar.gz → 0.5.15__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +69 -20
  4. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +8 -9
  5. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +2 -2
  6. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/pyproject.toml +1 -1
  7. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/mig-config.tf +23 -6
  8. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/.github/workflows/no-gitlinks.yml +0 -0
  9. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/.github/workflows/publish.yml +0 -0
  10. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/.gitignore +0 -0
  11. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/CLAUDE.md +0 -0
  12. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/PROGRESS.md +0 -0
  13. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/PR_DESCRIPTION.md +0 -0
  14. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/TODO.md +0 -0
  15. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/admin/README.md +0 -0
  16. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/admin/generate_stats.py +0 -0
  17. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/admin/requirements.txt +0 -0
  18. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/README.md +0 -0
  19. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  20. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  21. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  22. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  23. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  24. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  25. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  26. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  27. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  28. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  29. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  30. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  31. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  32. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  33. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/docs/USER_GUIDE.md +0 -0
  34. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/docs/devgpu-features.html +0 -0
  35. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/docs/docker-mark-blue.svg +0 -0
  36. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/docs/icons8-cursor-ai.svg +0 -0
  37. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/post.md +0 -0
  38. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/setup.cfg +0 -0
  39. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  40. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  41. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/README.md +0 -0
  42. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/alb.tf +0 -0
  43. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/availability.tf +0 -0
  44. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/backend.tf +0 -0
  45. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  46. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  47. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  48. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/bash_profile +0 -0
  49. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/bashrc +0 -0
  50. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  51. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  52. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  53. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  54. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/motd_script +0 -0
  55. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  56. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/profile +0 -0
  57. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  58. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  59. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  60. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/shell_env +0 -0
  61. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/ssh_config +0 -0
  62. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/zprofile +0 -0
  63. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/zshrc +0 -0
  64. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  65. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker-build.tf +0 -0
  66. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  67. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  68. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ecr.tf +0 -0
  69. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/efs.tf +0 -0
  70. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/eks.tf +0 -0
  71. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/expiry.tf +0 -0
  72. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/git-cache.tf +0 -0
  73. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/kubernetes.tf +0 -0
  74. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  75. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  76. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  77. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  78. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  79. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  80. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  81. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  82. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  83. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  84. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  85. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  86. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  87. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  88. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  89. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/lambda.tf +0 -0
  90. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/main.tf +0 -0
  91. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  92. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  93. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  94. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  95. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  96. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  97. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/monitoring.tf +0 -0
  98. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/outputs.tf +0 -0
  99. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/pyproject.toml +0 -0
  100. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/queue.tf +0 -0
  101. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/route53.tf +0 -0
  102. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  103. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  104. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  105. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  106. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  107. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  108. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  109. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  110. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  111. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  112. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/switch-to.sh +0 -0
  113. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  114. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  115. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  116. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  117. {gpu_dev-0.5.13 → gpu_dev-0.5.15}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.13
3
+ Version: 0.5.15
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.13
3
+ Version: 0.5.15
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -16,6 +16,53 @@ from rich.spinner import Spinner
16
16
  _SSH_CACHE_TTL_SECONDS = 24 * 60 * 60
17
17
  _SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cache.json"))
18
18
 
19
+ # Cache for authenticate_user. STS GetCallerIdentity is stable per AWS profile and slow under SSO
20
+ # (~500ms-1.5s). Cache for 24h keyed by AWS_PROFILE; if creds rotate the user_id rarely changes,
21
+ # and the next AWS call (DDB/SQS) will surface a credential error if it does.
22
+ _AUTH_CACHE_TTL_SECONDS = 24 * 60 * 60
23
+ _AUTH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/auth-cache.json"))
24
+
25
+
26
+ def _auth_cache_key() -> str:
27
+ return os.environ.get("AWS_PROFILE", "default")
28
+
29
+
30
+ def _load_auth_cache(github_user: str) -> Optional[Dict[str, Any]]:
31
+ try:
32
+ if not _AUTH_CACHE_PATH.exists():
33
+ return None
34
+ with open(_AUTH_CACHE_PATH) as f:
35
+ data = json.load(f)
36
+ entry = data.get(_auth_cache_key())
37
+ if not entry or entry.get("github_user") != github_user:
38
+ return None
39
+ if time.time() - float(entry.get("ts", 0)) > _AUTH_CACHE_TTL_SECONDS:
40
+ return None
41
+ return entry.get("result")
42
+ except Exception:
43
+ return None
44
+
45
+
46
+ def _save_auth_cache(github_user: str, result: Dict[str, Any]) -> None:
47
+ try:
48
+ _AUTH_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
49
+ data = {}
50
+ if _AUTH_CACHE_PATH.exists():
51
+ try:
52
+ with open(_AUTH_CACHE_PATH) as f:
53
+ data = json.load(f)
54
+ except Exception:
55
+ data = {}
56
+ data[_auth_cache_key()] = {
57
+ "github_user": github_user,
58
+ "ts": int(time.time()),
59
+ "result": result,
60
+ }
61
+ with open(_AUTH_CACHE_PATH, "w") as f:
62
+ json.dump(data, f)
63
+ except Exception:
64
+ pass
65
+
19
66
 
20
67
  def _load_ssh_cache(github_user: str) -> Optional[Dict[str, Any]]:
21
68
  """Return cached validation if it's fresh and matches the configured github_user, else None."""
@@ -50,31 +97,33 @@ def _save_ssh_cache(github_user: str, result: Dict[str, Any]) -> None:
50
97
 
51
98
 
52
99
  def authenticate_user(config: Config) -> Dict[str, Any]:
53
- """Authenticate using AWS credentials - if you can call AWS, you're authorized"""
54
- try:
55
- # Test AWS access by getting caller identity
56
- identity = config.get_user_identity()
100
+ """Authenticate using AWS credentials - if you can call AWS, you're authorized.
57
101
 
58
- # Test specific resource access by trying to get queue URL
59
- config.get_queue_url()
60
-
61
- # Extract user info from AWS ARN
62
- arn = identity["arn"]
63
- user_name = arn.split("/")[-1] # Extract username from ARN
102
+ Cached for 24h per AWS profile. The previous SQS get_queue_url probe was dropped:
103
+ it's a redundant permission check; reserve/cancel call SQS directly and surface
104
+ failures themselves, while list/show/avail don't touch SQS at all.
105
+ """
106
+ github_user = config.get_github_username()
107
+ if not github_user:
108
+ raise RuntimeError(
109
+ "GitHub username not configured. Please run: gpu-dev config set github_user <your-github-username>"
110
+ )
64
111
 
65
- # Get GitHub username from config
66
- github_user = config.get_github_username()
67
- if not github_user:
68
- raise RuntimeError(
69
- f"GitHub username not configured. Please run: gpu-dev config set github_user <your-github-username>"
70
- )
112
+ cached = _load_auth_cache(github_user)
113
+ if cached is not None:
114
+ return cached
71
115
 
72
- return {
73
- "user_id": user_name, # AWS username for reservation ownership
74
- "github_user": github_user, # GitHub username for SSH keys
116
+ try:
117
+ identity = config.get_user_identity()
118
+ arn = identity["arn"]
119
+ user_name = arn.split("/")[-1]
120
+ result = {
121
+ "user_id": user_name,
122
+ "github_user": github_user,
75
123
  "arn": arn,
76
124
  }
77
-
125
+ _save_auth_cache(github_user, result)
126
+ return result
78
127
  except Exception as e:
79
128
  raise RuntimeError(f"AWS authentication failed: {e}")
80
129
 
@@ -688,6 +688,7 @@ def reserve(
688
688
  # and total wall-clock time drops from sum to max(each).
689
689
  from concurrent.futures import ThreadPoolExecutor
690
690
  config = load_config()
691
+ reservation_mgr = ReservationManager(config)
691
692
 
692
693
  with Live(
693
694
  Spinner("dots", text="🚀 Loading…"), console=console
@@ -704,9 +705,7 @@ def reserve(
704
705
  else:
705
706
  f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
706
707
  ssh_result = None
707
- f_avail = ex.submit(
708
- lambda: ReservationManager(config).get_gpu_availability_by_type()
709
- )
708
+ f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
710
709
 
711
710
  # Surface auth failure first (most actionable).
712
711
  try:
@@ -2496,10 +2495,10 @@ def _show_availability() -> None:
2496
2495
  table = Table(
2497
2496
  title="GPU Availability by Type (numbers are GPUs, not nodes)")
2498
2497
  table.add_column("GPU Type", style="cyan")
2499
- table.add_column("Available", style="green")
2500
- table.add_column("Max Reservable", style="bright_green")
2498
+ table.add_column("Avail", style="green")
2499
+ table.add_column("Max\nReservable", style="bright_green")
2501
2500
  table.add_column("Total", style="blue")
2502
- table.add_column("Queue Length", style="yellow")
2501
+ table.add_column("Queue\nLength", style="yellow")
2503
2502
  table.add_column("Architecture", style="dim")
2504
2503
  table.add_column("Est. Wait Time", style="magenta")
2505
2504
 
@@ -2657,10 +2656,10 @@ def _show_availability_watch(interval: int) -> None:
2657
2656
  table = Table(
2658
2657
  title="GPU Availability by Type (numbers are GPUs, not nodes)")
2659
2658
  table.add_column("GPU Type", style="cyan")
2660
- table.add_column("Available", style="green")
2661
- table.add_column("Max Reservable", style="blue")
2659
+ table.add_column("Avail", style="green")
2660
+ table.add_column("Max\nReservable", style="blue")
2662
2661
  table.add_column("Total", style="blue")
2663
- table.add_column("Queue Length", style="yellow")
2662
+ table.add_column("Queue\nLength", style="yellow")
2664
2663
  table.add_column("Architecture", style="dim")
2665
2664
  table.add_column("Est. Wait Time", style="magenta")
2666
2665
 
@@ -88,9 +88,9 @@ def select_gpu_type_interactive(
88
88
  console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
89
89
  table = Table()
90
90
  table.add_column("GPU Type", style="cyan")
91
- table.add_column("Available", style="green")
91
+ table.add_column("Avail", style="green")
92
92
  table.add_column("Total", style="blue")
93
- table.add_column("Queue Length", style="yellow")
93
+ table.add_column("Queue\nLength", style="yellow")
94
94
  table.add_column("Est. Wait Time", style="magenta")
95
95
 
96
96
  choices = []
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.13"
7
+ version = "0.5.15"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -25,23 +25,40 @@ resource "kubernetes_config_map" "gpu_dev_mig_parted_config" {
25
25
  depends_on = [helm_release.nvidia_gpu_operator]
26
26
  }
27
27
 
28
- # Optional declarative B200 MIG node label. Set b200_mig_node_name in tfvars (or override the
29
- # variable's default below) to dedicate a specific B200 node to the mixed profile. Empty string
30
- # means "no node currently labelled" — the existing all-disabled stays in effect.
28
+ # Declarative B200 MIG node label. Set b200_mig_node_name (per workspace via the locals lookup
29
+ # below, or override via tfvars / -var) to dedicate a specific B200 node to the mixed profile.
30
+ # Empty string means "no node labelled" — every B200 stays full.
31
+ #
32
+ # Future cleanup: when we split a B200 CR into two ASGs (one with mig_profile, one without),
33
+ # the user_data path will set this label at boot for any instance in the MIG-dedicated ASG —
34
+ # matching the H100 cr3 pattern. Until then, this declarative label pins the role to a hostname.
35
+ locals {
36
+ # Workspace-scoped defaults so the resource is a no-op in non-prod and no apply ever tries to
37
+ # label a node that doesn't exist.
38
+ default_b200_mig_node_by_workspace = {
39
+ prod = "ip-10-0-67-125.us-east-2.compute.internal"
40
+ }
41
+ b200_mig_node_effective = (
42
+ var.b200_mig_node_name != ""
43
+ ? var.b200_mig_node_name
44
+ : lookup(local.default_b200_mig_node_by_workspace, terraform.workspace, "")
45
+ )
46
+ }
47
+
31
48
  variable "b200_mig_node_name" {
32
- description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to skip."
49
+ description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to use the per-workspace default in mig-config.tf."
33
50
  type = string
34
51
  default = ""
35
52
  }
36
53
 
37
54
  resource "kubernetes_labels" "b200_mig_node" {
38
- count = var.b200_mig_node_name == "" ? 0 : 1
55
+ count = local.b200_mig_node_effective == "" ? 0 : 1
39
56
 
40
57
  api_version = "v1"
41
58
  kind = "Node"
42
59
 
43
60
  metadata {
44
- name = var.b200_mig_node_name
61
+ name = local.b200_mig_node_effective
45
62
  }
46
63
 
47
64
  labels = {
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes