gpu-dev 0.5.6__tar.gz → 0.5.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +26 -10
  4. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +14 -0
  5. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/pyproject.toml +1 -1
  6. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/Dockerfile +23 -4
  7. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/bashrc_ext +14 -0
  8. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/zshrc_ext +14 -0
  9. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/availability_updater/index.py +33 -0
  10. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/reservation_processor/index.py +60 -3
  11. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda.tf +1 -1
  12. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/.github/workflows/no-gitlinks.yml +0 -0
  13. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/.github/workflows/publish.yml +0 -0
  14. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/.gitignore +0 -0
  15. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/CLAUDE.md +0 -0
  16. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/PROGRESS.md +0 -0
  17. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/PR_DESCRIPTION.md +0 -0
  18. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/TODO.md +0 -0
  19. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/admin/README.md +0 -0
  20. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/admin/generate_stats.py +0 -0
  21. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/admin/requirements.txt +0 -0
  22. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/README.md +0 -0
  23. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  24. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  25. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  26. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  27. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  28. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  29. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  30. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  31. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  32. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  33. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  34. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  35. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  36. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  37. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  38. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/docs/USER_GUIDE.md +0 -0
  39. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/docs/devgpu-features.html +0 -0
  40. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/docs/docker-mark-blue.svg +0 -0
  41. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/docs/icons8-cursor-ai.svg +0 -0
  42. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/post.md +0 -0
  43. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/setup.cfg +0 -0
  44. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  45. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  46. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/README.md +0 -0
  47. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/alb.tf +0 -0
  48. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/availability.tf +0 -0
  49. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/backend.tf +0 -0
  50. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  51. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  52. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/bash_profile +0 -0
  53. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/bashrc +0 -0
  54. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  55. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  56. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  57. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/motd_script +0 -0
  58. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  59. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/profile +0 -0
  60. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  61. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  62. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  63. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/shell_env +0 -0
  64. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/ssh_config +0 -0
  65. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/zprofile +0 -0
  66. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker/zshrc +0 -0
  67. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker-build.tf +0 -0
  68. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  69. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  70. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/ecr.tf +0 -0
  71. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/efs.tf +0 -0
  72. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/eks.tf +0 -0
  73. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/expiry.tf +0 -0
  74. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/git-cache.tf +0 -0
  75. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/kubernetes.tf +0 -0
  76. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  77. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  78. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  79. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  80. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  81. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  82. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  83. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  84. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  85. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  86. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  87. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  88. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  89. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/main.tf +0 -0
  90. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  91. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  92. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  93. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  94. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  95. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/monitoring.tf +0 -0
  96. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/outputs.tf +0 -0
  97. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/pyproject.toml +0 -0
  98. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/queue.tf +0 -0
  99. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/route53.tf +0 -0
  100. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  101. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  102. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  103. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  104. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  105. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  106. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  107. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  108. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  109. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  110. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/switch-to.sh +0 -0
  111. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  112. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  113. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  114. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  115. {gpu_dev-0.5.6 → gpu_dev-0.5.7}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.6
3
+ Version: 0.5.7
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.6
3
+ Version: 0.5.7
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -161,11 +161,8 @@ def _show_single_reservation(connection_info: dict) -> None:
161
161
  gpu_type = connection_info.get("gpu_type", "Unknown")
162
162
  instance_type = connection_info.get("instance_type", "unknown")
163
163
 
164
- # Format GPU information
165
- if gpu_type != "Unknown" and gpu_type != "unknown":
166
- gpu_info = f"{gpu_count}x {gpu_type}"
167
- else:
168
- gpu_info = f"{gpu_count} GPU(s)"
164
+ # Format GPU information (MIG-aware)
165
+ gpu_info = _format_gpu_display(gpu_count, gpu_type)
169
166
 
170
167
  # Format timestamps - only show launched_at (started time), not created time
171
168
  launched_at = connection_info.get("launched_at", "N/A")
@@ -2042,11 +2039,8 @@ def cancel(
2042
2039
  status = reservation.get("status", "unknown")
2043
2040
  created_at = reservation.get("created_at", "N/A")
2044
2041
 
2045
- # Format GPU information
2046
- if gpu_type and gpu_type not in ["unknown", "Unknown"]:
2047
- gpu_display = f"{gpu_count}x {gpu_type.upper()}"
2048
- else:
2049
- gpu_display = str(gpu_count)
2042
+ # Format GPU information (MIG-aware)
2043
+ gpu_display = _format_gpu_display(gpu_count, gpu_type)
2050
2044
 
2051
2045
  # Format created_at
2052
2046
  created_formatted = "N/A"
@@ -2379,6 +2373,28 @@ def show(ctx: click.Context, reservation_id: Optional[str]) -> None:
2379
2373
  rprint(f"[red]❌ Error: {str(e)}[/red]")
2380
2374
 
2381
2375
 
2376
+
2377
+ def _format_gpu_display(gpu_count, gpu_type):
2378
+ """Render a friendly '{N}× {type}' string for reservation listings.
2379
+
2380
+ For MIG slice SKUs, surface the slice memory + the underlying physical type
2381
+ (e.g. '2× 10GB H100 (MIG)') instead of the raw 'H100-MIG-1G' identifier.
2382
+ """
2383
+ if not gpu_type or str(gpu_type).lower() in ("unknown", ""):
2384
+ return f"{gpu_count} GPU(s)"
2385
+ gt_lower = str(gpu_type).lower()
2386
+ mig_friendly = {
2387
+ "h100-mig-1g": "10GB H100 (MIG)",
2388
+ "h100-mig-2g": "20GB H100 (MIG)",
2389
+ "h100-mig-3g": "40GB H100 (MIG)",
2390
+ "h100-mig-4g": "40GB H100 (MIG)",
2391
+ "h100-mig-7g": "80GB H100 (MIG)",
2392
+ }
2393
+ if gt_lower in mig_friendly:
2394
+ return f"{gpu_count}× {mig_friendly[gt_lower]}"
2395
+ return f"{gpu_count}x {str(gpu_type).upper()}"
2396
+
2397
+
2382
2398
  def _show_availability() -> None:
2383
2399
  """Shared function to show GPU availability"""
2384
2400
  try:
@@ -64,6 +64,18 @@ def select_gpu_type_interactive(
64
64
  if "-mig-" not in gt
65
65
  }
66
66
 
67
+ # Aggregate MIG slice availability so we can hint it on the h100 row of this picker.
68
+ mig_total_available = sum(
69
+ int(info.get("available", 0))
70
+ for gt, info in (availability_info or {}).items()
71
+ if gt.startswith("h100-mig-")
72
+ )
73
+ mig_total_capacity = sum(
74
+ int(info.get("total", 0))
75
+ for gt, info in (availability_info or {}).items()
76
+ if gt.startswith("h100-mig-")
77
+ )
78
+
67
79
  # Display availability table first
68
80
  console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
69
81
  table = Table()
@@ -132,6 +144,8 @@ def select_gpu_type_interactive(
132
144
  )
133
145
  if queue_length > 0:
134
146
  choice_label += f" - {queue_length} in queue"
147
+ if gpu_type == "h100" and mig_total_capacity > 0:
148
+ choice_label += f" — also {mig_total_available}/{mig_total_capacity} MIG slices"
135
149
 
136
150
  choices.append(questionary.Choice(title=choice_label, value=gpu_type))
137
151
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.6"
7
+ version = "0.5.7"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -132,13 +132,32 @@ RUN mkdir -p /run/sshd /var/run/sshd && \
132
132
  # Create SSH config
133
133
  COPY ssh_config /etc/ssh/sshd_config
134
134
 
135
- # Set up npm global directory for dev user and install Claude CLI
135
+ # Install Claude Code SYSTEM-WIDE via the official native installer (as root).
136
+ # Lives in /opt/claude with a /usr/local/bin/claude symlink, survives every reservation
137
+ # regardless of persistent-disk state, and can't be shadowed by stale npm installs on
138
+ # user disks. Image rebuilds are the only update path — controlled, reproducible.
139
+ USER root
140
+ RUN curl -fsSL https://claude.ai/install.sh | HOME=/opt/claude bash || echo "Claude native install failed (non-fatal at build time)"
141
+ RUN if [ -e /opt/claude/.local/bin/claude ]; then \
142
+ ln -sf /opt/claude/.local/bin/claude /usr/local/bin/claude; \
143
+ chmod -R a+rX /opt/claude; \
144
+ fi
145
+
146
+ # Set up npm global directory for dev user (kept for ad-hoc dev-installed CLIs).
136
147
  USER dev
137
148
  WORKDIR /home/dev
138
-
139
149
  RUN mkdir -p ~/.npm-global && \
140
- npm config set prefix ~/.npm-global && \
141
- npm install -g @anthropic-ai/claude-code || echo "Claude CLI install failed, will retry at runtime"
150
+ npm config set prefix ~/.npm-global
151
+
152
+ # OpenAI Codex CLI installed SYSTEM-WIDE as root (parallels the Claude install above).
153
+ # Auth is per-user: either `export OPENAI_API_KEY=sk-…` in their shell, or `codex login`
154
+ # for the browser/ChatGPT-account flow. AWS Bedrock now serves Codex (announced 2026-04-28
155
+ # at openai.com/index/openai-on-aws) — once we have early access, swap in the Bedrock-backed
156
+ # install + flip an env var; users won't need their own keys anymore.
157
+ USER root
158
+ RUN npm install -g --prefix /usr/local @openai/codex || echo "Codex CLI install failed (non-fatal at build time)"
159
+
160
+ USER dev
142
161
 
143
162
  # Install oh-my-zsh for dev user
144
163
  RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended
@@ -24,3 +24,17 @@ check_warnings() {
24
24
 
25
25
  # Run warning check before every command prompt
26
26
  PROMPT_COMMAND="check_warnings; $PROMPT_COMMAND"
27
+
28
+ # Auto-cleanup of deprecated ANTHROPIC_MODEL pinning. An older docker image
29
+ # hardcoded ANTHROPIC_MODEL=us.anthropic.claude-sonnet-4-20250514-v1:0 in
30
+ # .shell_env, which then got cached on persistent disks. We unset it in the
31
+ # current session and strip the line from disk so it doesn't come back.
32
+ # Self-healing — once cleaned, the case statement no longer matches.
33
+ case "${ANTHROPIC_MODEL:-}" in
34
+ *sonnet-4-20250514*)
35
+ unset ANTHROPIC_MODEL
36
+ if [ -f "$HOME/.shell_env" ]; then
37
+ sed -i '/ANTHROPIC_MODEL.*sonnet-4-20250514/d' "$HOME/.shell_env" 2>/dev/null || true
38
+ fi
39
+ ;;
40
+ esac
@@ -25,3 +25,17 @@ check_warnings() {
25
25
 
26
26
  # Run warning check before every command prompt (zsh hook)
27
27
  precmd() { check_warnings }
28
+
29
+ # Auto-cleanup of deprecated ANTHROPIC_MODEL pinning. An older docker image
30
+ # hardcoded ANTHROPIC_MODEL=us.anthropic.claude-sonnet-4-20250514-v1:0 in
31
+ # .shell_env, which then got cached on persistent disks. We unset it in the
32
+ # current session and strip the line from disk so it doesn't come back.
33
+ # Self-healing — once cleaned, the case statement no longer matches.
34
+ case "${ANTHROPIC_MODEL:-}" in
35
+ *sonnet-4-20250514*)
36
+ unset ANTHROPIC_MODEL
37
+ if [ -f "$HOME/.shell_env" ]; then
38
+ sed -i '/ANTHROPIC_MODEL.*sonnet-4-20250514/d' "$HOME/.shell_env" 2>/dev/null || true
39
+ fi
40
+ ;;
41
+ esac
@@ -76,6 +76,13 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
76
76
  logger.error(f"=== Failed to update availability for {gpu_type}: {gpu_error} ===")
77
77
  # Continue with other GPU types
78
78
 
79
+ # Best-effort: delete stale rows for SKUs no longer in SUPPORTED_GPU_TYPES
80
+ # (e.g. after a GPU type rename like g7e -> rtxpro6000).
81
+ try:
82
+ cleanup_stale_availability_rows()
83
+ except Exception as cleanup_err:
84
+ logger.warning(f"Stale-row cleanup failed: {cleanup_err}")
85
+
79
86
  return {
80
87
  "statusCode": 200,
81
88
  "body": json.dumps(
@@ -594,3 +601,29 @@ def compute_size_etas(v1, gpu_type, node_label_value, resource_name, gpus_per_in
594
601
 
595
602
  return etas
596
603
 
604
+
605
def cleanup_stale_availability_rows():
    """Delete availability-table rows whose gpu_type isn't in SUPPORTED_GPU_TYPES.

    Called from the Lambda handler on every invocation as a best-effort step.
    Used to garbage-collect renamed SKUs (e.g. g7e -> rtxpro6000) that would
    otherwise linger as zero rows. Running it again after a successful pass
    finds nothing left to delete.

    The table is scanned page by page, projecting only ``gpu_type``; each row
    whose key is not a supported SKU is deleted individually.

    Raises:
        Whatever the scan/delete calls raise; the caller is expected to treat
        the cleanup as non-fatal.
    """
    valid_keys = frozenset(SUPPORTED_GPU_TYPES)
    if not valid_keys:
        # With no supported SKUs every row would look stale and the whole
        # table would be wiped; treat that as misconfiguration and do nothing.
        logger.warning(
            "SUPPORTED_GPU_TYPES is empty; skipping stale availability cleanup"
        )
        return

    table = dynamodb.Table(AVAILABILITY_TABLE)
    scan_kwargs = {"ProjectionExpression": "gpu_type"}
    deleted = []
    while True:
        resp = table.scan(**scan_kwargs)
        for item in resp.get("Items", []):
            gt = item.get("gpu_type")
            if gt and gt not in valid_keys:
                # NOTE(review): assumes gpu_type is the table's sole key
                # attribute, as the original delete call did.
                table.delete_item(Key={"gpu_type": gt})
                deleted.append(gt)
        last_key = resp.get("LastEvaluatedKey")
        if not last_key:
            break
        # Resume the scan from where the previous page stopped.
        scan_kwargs["ExclusiveStartKey"] = last_key

    if deleted:
        logger.info(f"Deleted {len(deleted)} stale availability rows: {deleted}")
@@ -4030,7 +4030,9 @@ def create_pod(
4030
4030
  client.V1Container(
4031
4031
  name="gpu-dev",
4032
4032
  image=container_image,
4033
- image_pull_policy="IfNotPresent",
4033
+ # :latest is a moving tag — always re-pull so a new docker image
4034
+ # rolled out to ECR is picked up by every fresh reservation.
4035
+ image_pull_policy="Always",
4034
4036
  **({
4035
4037
  "command": ["/bin/bash"],
4036
4038
  "args": [
@@ -4443,6 +4445,34 @@ EOF_ZSHRC_EXT
4443
4445
  done
4444
4446
  echo "[STARTUP] ✓ Shell extension sourcing configured"
4445
4447
 
4448
+ # Surgically remove the deprecated ANTHROPIC_MODEL pinning that older docker
4449
+ # images baked into shell_env. claude-sonnet-4-20250514 is being deprecated
4450
+ # by Anthropic, and an old hardcoded value lingers on persistent disks.
4451
+ # Idempotent — strips only the matching line, leaves any user-customized value alone.
4452
+ if [ -f /home/dev/.shell_env ] && grep -q "ANTHROPIC_MODEL.*sonnet-4-20250514" /home/dev/.shell_env 2>/dev/null; then
4453
+ echo "[STARTUP] Removing deprecated ANTHROPIC_MODEL=sonnet-4-20250514 line from .shell_env"
4454
+ sed -i '/ANTHROPIC_MODEL.*sonnet-4-20250514/d' /home/dev/.shell_env || true
4455
+ fi
4456
+
4457
+ # Remove the legacy npm-based Claude install (~/.npm-global/bin/claude).
4458
+ # Older docker images ran `npm install -g @anthropic-ai/claude-code` and that
4459
+ # binary still lingers on persistent disks where it shadows the system-wide
4460
+ # /usr/local/bin/claude (because $HOME/.npm-global/bin precedes /usr/local/bin
4461
+ # on PATH). The system-wide install is kept current via image rebuilds; the
4462
+ # user's own ~/.local/bin/claude (from `claude install` in their home, or
4463
+ # Claude's self-update mechanism) is left intact so users can opt into newer
4464
+ # versions ahead of our image refresh.
4465
+ if [ -e /home/dev/.npm-global/bin/claude ]; then
4466
+ echo "[STARTUP] Removing legacy npm-installed claude at /home/dev/.npm-global/bin/claude"
4467
+ rm -f /home/dev/.npm-global/bin/claude || true
4468
+ fi
4469
+ # Also drop the npm package files for @anthropic-ai/claude-code so npm doesn't
4470
+ # think it's still installed on next `npm list -g`.
4471
+ if [ -d /home/dev/.npm-global/lib/node_modules/@anthropic-ai/claude-code ]; then
4472
+ echo "[STARTUP] Removing legacy @anthropic-ai/claude-code npm package files"
4473
+ rm -rf /home/dev/.npm-global/lib/node_modules/@anthropic-ai/claude-code || true
4474
+ fi
4475
+
4446
4476
  # Fix ownership - recursive only for new disks (fast, empty disk)
4447
4477
  # For existing disks, only fix the specific files we just created/modified
4448
4478
  if [ "$CREATE_SH_ENV" = "true" ]; then
@@ -6233,7 +6263,13 @@ def should_use_persistent_disk(user_id: str, current_reservation_id: str) -> boo
6233
6263
 
6234
6264
 
6235
6265
  def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]:
6236
- """Get instance type and GPU type from the node where pod is scheduled"""
6266
+ """Get instance type and GPU type from the node where pod is scheduled.
6267
+
6268
+ For MIG slice pods, the SKU is derived from the pod's resource request
6269
+ (nvidia.com/mig-Ng.NNgb) rather than the host instance type, so the DDB
6270
+ record reflects the actual partition the user reserved instead of the
6271
+ physical card.
6272
+ """
6237
6273
  try:
6238
6274
  v1 = client.CoreV1Api(k8s_client)
6239
6275
 
@@ -6250,7 +6286,27 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
6250
6286
  "node.kubernetes.io/instance-type", "unknown"
6251
6287
  )
6252
6288
 
6253
- # Map instance type to GPU type
6289
+ # If the pod requests a MIG slice resource, the GPU type is the slice SKU.
6290
+ # Map back to the canonical SKU stored elsewhere in this code (matches CLI flag).
6291
+ mig_resource_to_sku = {
6292
+ "nvidia.com/mig-1g.10gb": "h100-mig-1g",
6293
+ "nvidia.com/mig-1g.20gb": "h100-mig-1g", # memory-doubled variant, same SKU bucket
6294
+ "nvidia.com/mig-2g.20gb": "h100-mig-2g",
6295
+ "nvidia.com/mig-3g.40gb": "h100-mig-3g",
6296
+ "nvidia.com/mig-4g.40gb": "h100-mig-4g",
6297
+ "nvidia.com/mig-7g.80gb": "h100-mig-7g",
6298
+ }
6299
+ if pod.spec.containers:
6300
+ for c in pod.spec.containers:
6301
+ reqs = (c.resources.requests if c.resources and c.resources.requests else {}) or {}
6302
+ for r_name, sku in mig_resource_to_sku.items():
6303
+ if reqs.get(r_name):
6304
+ logger.info(
6305
+ f"Pod {pod_name} on {node_name} requests {r_name} -> MIG SKU {sku}"
6306
+ )
6307
+ return instance_type, sku
6308
+
6309
+ # Map instance type to GPU type for non-MIG (full GPU) pods
6254
6310
  gpu_type_mapping = {
6255
6311
  "g4dn.4xlarge": "T4",
6256
6312
  "g4dn.8xlarge": "T4",
@@ -6261,6 +6317,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
6261
6317
  "g6.12xlarge": "L4",
6262
6318
  "g6.16xlarge": "L4",
6263
6319
  "g6.24xlarge": "L4",
6320
+ "g7e.24xlarge": "rtxpro6000",
6264
6321
  "p4d.24xlarge": "A100",
6265
6322
  "p5.48xlarge": "H100",
6266
6323
  "p5e.48xlarge": "H200",
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.6"
183
+ LAMBDA_VERSION = "0.5.7"
184
184
  MIN_CLI_VERSION = "0.5.5"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes