gpu-dev 0.5.6__tar.gz → 0.5.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/PKG-INFO +1 -1
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +26 -10
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +14 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/pyproject.toml +1 -1
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/Dockerfile +23 -4
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/bashrc_ext +14 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/zshrc_ext +14 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/availability_updater/index.py +74 -3
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/reservation_processor/index.py +60 -3
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/.gitignore +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/CLAUDE.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/PROGRESS.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/TODO.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/admin/README.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/post.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/setup.cfg +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -161,11 +161,8 @@ def _show_single_reservation(connection_info: dict) -> None:
|
|
|
161
161
|
gpu_type = connection_info.get("gpu_type", "Unknown")
|
|
162
162
|
instance_type = connection_info.get("instance_type", "unknown")
|
|
163
163
|
|
|
164
|
-
# Format GPU information
|
|
165
|
-
|
|
166
|
-
gpu_info = f"{gpu_count}x {gpu_type}"
|
|
167
|
-
else:
|
|
168
|
-
gpu_info = f"{gpu_count} GPU(s)"
|
|
164
|
+
# Format GPU information (MIG-aware)
|
|
165
|
+
gpu_info = _format_gpu_display(gpu_count, gpu_type)
|
|
169
166
|
|
|
170
167
|
# Format timestamps - only show launched_at (started time), not created time
|
|
171
168
|
launched_at = connection_info.get("launched_at", "N/A")
|
|
@@ -2042,11 +2039,8 @@ def cancel(
|
|
|
2042
2039
|
status = reservation.get("status", "unknown")
|
|
2043
2040
|
created_at = reservation.get("created_at", "N/A")
|
|
2044
2041
|
|
|
2045
|
-
# Format GPU information
|
|
2046
|
-
|
|
2047
|
-
gpu_display = f"{gpu_count}x {gpu_type.upper()}"
|
|
2048
|
-
else:
|
|
2049
|
-
gpu_display = str(gpu_count)
|
|
2042
|
+
# Format GPU information (MIG-aware)
|
|
2043
|
+
gpu_display = _format_gpu_display(gpu_count, gpu_type)
|
|
2050
2044
|
|
|
2051
2045
|
# Format created_at
|
|
2052
2046
|
created_formatted = "N/A"
|
|
@@ -2379,6 +2373,28 @@ def show(ctx: click.Context, reservation_id: Optional[str]) -> None:
|
|
|
2379
2373
|
rprint(f"[red]❌ Error: {str(e)}[/red]")
|
|
2380
2374
|
|
|
2381
2375
|
|
|
2376
|
+
|
|
2377
|
+
def _format_gpu_display(gpu_count, gpu_type):
|
|
2378
|
+
"""Render a friendly '{N}× {type}' string for reservation listings.
|
|
2379
|
+
|
|
2380
|
+
For MIG slice SKUs, surface the slice memory + the underlying physical type
|
|
2381
|
+
(e.g. '2× 10GB H100 (MIG)') instead of the raw 'H100-MIG-1G' identifier.
|
|
2382
|
+
"""
|
|
2383
|
+
if not gpu_type or str(gpu_type).lower() in ("unknown", ""):
|
|
2384
|
+
return f"{gpu_count} GPU(s)"
|
|
2385
|
+
gt_lower = str(gpu_type).lower()
|
|
2386
|
+
mig_friendly = {
|
|
2387
|
+
"h100-mig-1g": "10GB H100 (MIG)",
|
|
2388
|
+
"h100-mig-2g": "20GB H100 (MIG)",
|
|
2389
|
+
"h100-mig-3g": "40GB H100 (MIG)",
|
|
2390
|
+
"h100-mig-4g": "40GB H100 (MIG)",
|
|
2391
|
+
"h100-mig-7g": "80GB H100 (MIG)",
|
|
2392
|
+
}
|
|
2393
|
+
if gt_lower in mig_friendly:
|
|
2394
|
+
return f"{gpu_count}× {mig_friendly[gt_lower]}"
|
|
2395
|
+
return f"{gpu_count}x {str(gpu_type).upper()}"
|
|
2396
|
+
|
|
2397
|
+
|
|
2382
2398
|
def _show_availability() -> None:
|
|
2383
2399
|
"""Shared function to show GPU availability"""
|
|
2384
2400
|
try:
|
|
@@ -64,6 +64,18 @@ def select_gpu_type_interactive(
|
|
|
64
64
|
if "-mig-" not in gt
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
+
# Aggregate MIG slice availability so we can hint it on the h100 row of this picker.
|
|
68
|
+
mig_total_available = sum(
|
|
69
|
+
int(info.get("available", 0))
|
|
70
|
+
for gt, info in (availability_info or {}).items()
|
|
71
|
+
if gt.startswith("h100-mig-")
|
|
72
|
+
)
|
|
73
|
+
mig_total_capacity = sum(
|
|
74
|
+
int(info.get("total", 0))
|
|
75
|
+
for gt, info in (availability_info or {}).items()
|
|
76
|
+
if gt.startswith("h100-mig-")
|
|
77
|
+
)
|
|
78
|
+
|
|
67
79
|
# Display availability table first
|
|
68
80
|
console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
|
|
69
81
|
table = Table()
|
|
@@ -132,6 +144,8 @@ def select_gpu_type_interactive(
|
|
|
132
144
|
)
|
|
133
145
|
if queue_length > 0:
|
|
134
146
|
choice_label += f" - {queue_length} in queue"
|
|
147
|
+
if gpu_type == "h100" and mig_total_capacity > 0:
|
|
148
|
+
choice_label += f" — also {mig_total_available}/{mig_total_capacity} MIG slices"
|
|
135
149
|
|
|
136
150
|
choices.append(questionary.Choice(title=choice_label, value=gpu_type))
|
|
137
151
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.6"
|
|
7
|
+
version = "0.5.8"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -132,13 +132,32 @@ RUN mkdir -p /run/sshd /var/run/sshd && \
|
|
|
132
132
|
# Create SSH config
|
|
133
133
|
COPY ssh_config /etc/ssh/sshd_config
|
|
134
134
|
|
|
135
|
-
#
|
|
135
|
+
# Install Claude Code SYSTEM-WIDE via the official native installer (as root).
|
|
136
|
+
# Lives in /opt/claude with a /usr/local/bin/claude symlink, survives every reservation
|
|
137
|
+
# regardless of persistent-disk state, and can't be shadowed by stale npm installs on
|
|
138
|
+
# user disks. Image rebuilds are the only update path — controlled, reproducible.
|
|
139
|
+
USER root
|
|
140
|
+
RUN curl -fsSL https://claude.ai/install.sh | HOME=/opt/claude bash || echo "Claude native install failed (non-fatal at build time)"
|
|
141
|
+
RUN if [ -e /opt/claude/.local/bin/claude ]; then \
|
|
142
|
+
ln -sf /opt/claude/.local/bin/claude /usr/local/bin/claude; \
|
|
143
|
+
chmod -R a+rX /opt/claude; \
|
|
144
|
+
fi
|
|
145
|
+
|
|
146
|
+
# Set up npm global directory for dev user (kept for ad-hoc dev-installed CLIs).
|
|
136
147
|
USER dev
|
|
137
148
|
WORKDIR /home/dev
|
|
138
|
-
|
|
139
149
|
RUN mkdir -p ~/.npm-global && \
|
|
140
|
-
npm config set prefix ~/.npm-global
|
|
141
|
-
|
|
150
|
+
npm config set prefix ~/.npm-global
|
|
151
|
+
|
|
152
|
+
# OpenAI Codex CLI installed SYSTEM-WIDE as root (parallels the Claude install above).
|
|
153
|
+
# Auth is per-user: either `export OPENAI_API_KEY=sk-…` in their shell, or `codex login`
|
|
154
|
+
# for the browser/ChatGPT-account flow. AWS Bedrock now serves Codex (announced 2026-04-28
|
|
155
|
+
# at openai.com/index/openai-on-aws) — once we have early access, swap in the Bedrock-backed
|
|
156
|
+
# install + flip an env var; users won't need their own keys anymore.
|
|
157
|
+
USER root
|
|
158
|
+
RUN npm install -g --prefix /usr/local @openai/codex || echo "Codex CLI install failed (non-fatal at build time)"
|
|
159
|
+
|
|
160
|
+
USER dev
|
|
142
161
|
|
|
143
162
|
# Install oh-my-zsh for dev user
|
|
144
163
|
RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended
|
|
@@ -24,3 +24,17 @@ check_warnings() {
|
|
|
24
24
|
|
|
25
25
|
# Run warning check before every command prompt
|
|
26
26
|
PROMPT_COMMAND="check_warnings; $PROMPT_COMMAND"
|
|
27
|
+
|
|
28
|
+
# Auto-cleanup of deprecated ANTHROPIC_MODEL pinning. An older docker image
|
|
29
|
+
# hardcoded ANTHROPIC_MODEL=us.anthropic.claude-sonnet-4-20250514-v1:0 in
|
|
30
|
+
# .shell_env, which then got cached on persistent disks. We unset it in the
|
|
31
|
+
# current session and strip the line from disk so it doesn't come back.
|
|
32
|
+
# Self-healing — once cleaned, the case statement no longer matches.
|
|
33
|
+
case "${ANTHROPIC_MODEL:-}" in
|
|
34
|
+
*sonnet-4-20250514*)
|
|
35
|
+
unset ANTHROPIC_MODEL
|
|
36
|
+
if [ -f "$HOME/.shell_env" ]; then
|
|
37
|
+
sed -i '/ANTHROPIC_MODEL.*sonnet-4-20250514/d' "$HOME/.shell_env" 2>/dev/null || true
|
|
38
|
+
fi
|
|
39
|
+
;;
|
|
40
|
+
esac
|
|
@@ -25,3 +25,17 @@ check_warnings() {
|
|
|
25
25
|
|
|
26
26
|
# Run warning check before every command prompt (zsh hook)
|
|
27
27
|
precmd() { check_warnings }
|
|
28
|
+
|
|
29
|
+
# Auto-cleanup of deprecated ANTHROPIC_MODEL pinning. An older docker image
|
|
30
|
+
# hardcoded ANTHROPIC_MODEL=us.anthropic.claude-sonnet-4-20250514-v1:0 in
|
|
31
|
+
# .shell_env, which then got cached on persistent disks. We unset it in the
|
|
32
|
+
# current session and strip the line from disk so it doesn't come back.
|
|
33
|
+
# Self-healing — once cleaned, the case statement no longer matches.
|
|
34
|
+
case "${ANTHROPIC_MODEL:-}" in
|
|
35
|
+
*sonnet-4-20250514*)
|
|
36
|
+
unset ANTHROPIC_MODEL
|
|
37
|
+
if [ -f "$HOME/.shell_env" ]; then
|
|
38
|
+
sed -i '/ANTHROPIC_MODEL.*sonnet-4-20250514/d' "$HOME/.shell_env" 2>/dev/null || true
|
|
39
|
+
fi
|
|
40
|
+
;;
|
|
41
|
+
esac
|
{gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
@@ -24,6 +24,45 @@ RESERVATIONS_TABLE = os.environ.get("RESERVATIONS_TABLE", "pytorch-gpu-dev-reser
|
|
|
24
24
|
SUPPORTED_GPU_TYPES = json.loads(os.environ["SUPPORTED_GPU_TYPES"])
|
|
25
25
|
|
|
26
26
|
|
|
27
|
+
def _parse_expires_at(value):
|
|
28
|
+
"""Parse the reservations table's `expires_at` field to a unix epoch (int).
|
|
29
|
+
|
|
30
|
+
DDB stores it as either an ISO-8601 datetime string ("2026-05-02T00:12:03.674845") OR
|
|
31
|
+
occasionally a numeric epoch — handle both. Returns None if unparseable.
|
|
32
|
+
"""
|
|
33
|
+
if value is None:
|
|
34
|
+
return None
|
|
35
|
+
# Numeric (Decimal/int/float) → epoch seconds directly.
|
|
36
|
+
if not isinstance(value, str):
|
|
37
|
+
try:
|
|
38
|
+
return int(float(value))
|
|
39
|
+
except (ValueError, TypeError):
|
|
40
|
+
return None
|
|
41
|
+
s = value.strip()
|
|
42
|
+
if not s:
|
|
43
|
+
return None
|
|
44
|
+
# ISO-8601 first (the actual production format).
|
|
45
|
+
try:
|
|
46
|
+
from datetime import datetime, timezone
|
|
47
|
+
# `fromisoformat` accepts microseconds; tolerate optional 'Z' suffix.
|
|
48
|
+
if s.endswith("Z"):
|
|
49
|
+
s2 = s[:-1] + "+00:00"
|
|
50
|
+
else:
|
|
51
|
+
s2 = s
|
|
52
|
+
dt = datetime.fromisoformat(s2)
|
|
53
|
+
if dt.tzinfo is None:
|
|
54
|
+
# Convention in this codebase: timestamps written via datetime.utcnow().isoformat() are UTC.
|
|
55
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
56
|
+
return int(dt.timestamp())
|
|
57
|
+
except (ValueError, TypeError):
|
|
58
|
+
pass
|
|
59
|
+
# Numeric-as-string fallback.
|
|
60
|
+
try:
|
|
61
|
+
return int(float(s))
|
|
62
|
+
except (ValueError, TypeError):
|
|
63
|
+
return None
|
|
64
|
+
|
|
65
|
+
|
|
27
66
|
def get_gpu_resource_name(gpu_type: str) -> str:
|
|
28
67
|
return SUPPORTED_GPU_TYPES.get(gpu_type, {}).get("k8s_resource", "nvidia.com/gpu")
|
|
29
68
|
|
|
@@ -76,6 +115,13 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
|
|
|
76
115
|
logger.error(f"=== Failed to update availability for {gpu_type}: {gpu_error} ===")
|
|
77
116
|
# Continue with other GPU types
|
|
78
117
|
|
|
118
|
+
# Best-effort: delete stale rows for SKUs no longer in SUPPORTED_GPU_TYPES
|
|
119
|
+
# (e.g. after a GPU type rename like g7e -> rtxpro6000).
|
|
120
|
+
try:
|
|
121
|
+
cleanup_stale_availability_rows()
|
|
122
|
+
except Exception as cleanup_err:
|
|
123
|
+
logger.warning(f"Stale-row cleanup failed: {cleanup_err}")
|
|
124
|
+
|
|
79
125
|
return {
|
|
80
126
|
"statusCode": 200,
|
|
81
127
|
"body": json.dumps(
|
|
@@ -535,9 +581,8 @@ def compute_size_etas(v1, gpu_type, node_label_value, resource_name, gpus_per_in
|
|
|
535
581
|
continue
|
|
536
582
|
if pod_name not in pod_to_info:
|
|
537
583
|
continue
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
except (ValueError, TypeError):
|
|
584
|
+
ts = _parse_expires_at(expires_at)
|
|
585
|
+
if ts is None:
|
|
541
586
|
continue
|
|
542
587
|
node_name, gpus = pod_to_info[pod_name]
|
|
543
588
|
node_state[node_name]["expirations"].append((ts, gpus))
|
|
@@ -594,3 +639,29 @@ def compute_size_etas(v1, gpu_type, node_label_value, resource_name, gpus_per_in
|
|
|
594
639
|
|
|
595
640
|
return etas
|
|
596
641
|
|
|
642
|
+
|
|
643
|
+
def cleanup_stale_availability_rows():
|
|
644
|
+
"""Delete rows in the availability table whose gpu_type isn't in SUPPORTED_GPU_TYPES.
|
|
645
|
+
|
|
646
|
+
Triggered on every Lambda invocation. Idempotent. Used to garbage-collect renamed
|
|
647
|
+
SKUs (e.g. g7e -> rtxpro6000) that would otherwise linger as zero rows.
|
|
648
|
+
"""
|
|
649
|
+
table = dynamodb.Table(AVAILABILITY_TABLE)
|
|
650
|
+
valid_keys = set(SUPPORTED_GPU_TYPES.keys())
|
|
651
|
+
last_key = None
|
|
652
|
+
deleted = []
|
|
653
|
+
while True:
|
|
654
|
+
kwargs = {"ProjectionExpression": "gpu_type"}
|
|
655
|
+
if last_key:
|
|
656
|
+
kwargs["ExclusiveStartKey"] = last_key
|
|
657
|
+
resp = table.scan(**kwargs)
|
|
658
|
+
for item in resp.get("Items", []):
|
|
659
|
+
gt = item.get("gpu_type")
|
|
660
|
+
if gt and gt not in valid_keys:
|
|
661
|
+
table.delete_item(Key={"gpu_type": gt})
|
|
662
|
+
deleted.append(gt)
|
|
663
|
+
last_key = resp.get("LastEvaluatedKey")
|
|
664
|
+
if not last_key:
|
|
665
|
+
break
|
|
666
|
+
if deleted:
|
|
667
|
+
logger.info(f"Deleted {len(deleted)} stale availability rows: {deleted}")
|
{gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -4030,7 +4030,9 @@ def create_pod(
|
|
|
4030
4030
|
client.V1Container(
|
|
4031
4031
|
name="gpu-dev",
|
|
4032
4032
|
image=container_image,
|
|
4033
|
-
|
|
4033
|
+
# :latest is a moving tag — always re-pull so a new docker image
|
|
4034
|
+
# rolled out to ECR is picked up by every fresh reservation.
|
|
4035
|
+
image_pull_policy="Always",
|
|
4034
4036
|
**({
|
|
4035
4037
|
"command": ["/bin/bash"],
|
|
4036
4038
|
"args": [
|
|
@@ -4443,6 +4445,34 @@ EOF_ZSHRC_EXT
|
|
|
4443
4445
|
done
|
|
4444
4446
|
echo "[STARTUP] ✓ Shell extension sourcing configured"
|
|
4445
4447
|
|
|
4448
|
+
# Surgically remove the deprecated ANTHROPIC_MODEL pinning that older docker
|
|
4449
|
+
# images baked into shell_env. claude-sonnet-4-20250514 is being deprecated
|
|
4450
|
+
# by Anthropic, and an old hardcoded value lingers on persistent disks.
|
|
4451
|
+
# Idempotent — strips only the matching line, leaves any user-customized value alone.
|
|
4452
|
+
if [ -f /home/dev/.shell_env ] && grep -q "ANTHROPIC_MODEL.*sonnet-4-20250514" /home/dev/.shell_env 2>/dev/null; then
|
|
4453
|
+
echo "[STARTUP] Removing deprecated ANTHROPIC_MODEL=sonnet-4-20250514 line from .shell_env"
|
|
4454
|
+
sed -i '/ANTHROPIC_MODEL.*sonnet-4-20250514/d' /home/dev/.shell_env || true
|
|
4455
|
+
fi
|
|
4456
|
+
|
|
4457
|
+
# Remove the legacy npm-based Claude install (~/.npm-global/bin/claude).
|
|
4458
|
+
# Older docker images ran `npm install -g @anthropic-ai/claude-code` and that
|
|
4459
|
+
# binary still lingers on persistent disks where it shadows the system-wide
|
|
4460
|
+
# /usr/local/bin/claude (because $HOME/.npm-global/bin precedes /usr/local/bin
|
|
4461
|
+
# on PATH). The system-wide install is kept current via image rebuilds; the
|
|
4462
|
+
# user's own ~/.local/bin/claude (from `claude install` in their home, or
|
|
4463
|
+
# Claude's self-update mechanism) is left intact so users can opt into newer
|
|
4464
|
+
# versions ahead of our image refresh.
|
|
4465
|
+
if [ -e /home/dev/.npm-global/bin/claude ]; then
|
|
4466
|
+
echo "[STARTUP] Removing legacy npm-installed claude at /home/dev/.npm-global/bin/claude"
|
|
4467
|
+
rm -f /home/dev/.npm-global/bin/claude || true
|
|
4468
|
+
fi
|
|
4469
|
+
# Also drop the npm package files for @anthropic-ai/claude-code so npm doesn't
|
|
4470
|
+
# think it's still installed on next `npm list -g`.
|
|
4471
|
+
if [ -d /home/dev/.npm-global/lib/node_modules/@anthropic-ai/claude-code ]; then
|
|
4472
|
+
echo "[STARTUP] Removing legacy @anthropic-ai/claude-code npm package files"
|
|
4473
|
+
rm -rf /home/dev/.npm-global/lib/node_modules/@anthropic-ai/claude-code || true
|
|
4474
|
+
fi
|
|
4475
|
+
|
|
4446
4476
|
# Fix ownership - recursive only for new disks (fast, empty disk)
|
|
4447
4477
|
# For existing disks, only fix the specific files we just created/modified
|
|
4448
4478
|
if [ "$CREATE_SH_ENV" = "true" ]; then
|
|
@@ -6233,7 +6263,13 @@ def should_use_persistent_disk(user_id: str, current_reservation_id: str) -> boo
|
|
|
6233
6263
|
|
|
6234
6264
|
|
|
6235
6265
|
def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]:
|
|
6236
|
-
"""Get instance type and GPU type from the node where pod is scheduled
|
|
6266
|
+
"""Get instance type and GPU type from the node where pod is scheduled.
|
|
6267
|
+
|
|
6268
|
+
For MIG slice pods, the SKU is derived from the pod's resource request
|
|
6269
|
+
(nvidia.com/mig-Ng.NNgb) rather than the host instance type, so the DDB
|
|
6270
|
+
record reflects the actual partition the user reserved instead of the
|
|
6271
|
+
physical card.
|
|
6272
|
+
"""
|
|
6237
6273
|
try:
|
|
6238
6274
|
v1 = client.CoreV1Api(k8s_client)
|
|
6239
6275
|
|
|
@@ -6250,7 +6286,27 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
|
|
|
6250
6286
|
"node.kubernetes.io/instance-type", "unknown"
|
|
6251
6287
|
)
|
|
6252
6288
|
|
|
6253
|
-
#
|
|
6289
|
+
# If the pod requests a MIG slice resource, the GPU type is the slice SKU.
|
|
6290
|
+
# Map back to the canonical SKU stored elsewhere in this code (matches CLI flag).
|
|
6291
|
+
mig_resource_to_sku = {
|
|
6292
|
+
"nvidia.com/mig-1g.10gb": "h100-mig-1g",
|
|
6293
|
+
"nvidia.com/mig-1g.20gb": "h100-mig-1g", # memory-doubled variant, same SKU bucket
|
|
6294
|
+
"nvidia.com/mig-2g.20gb": "h100-mig-2g",
|
|
6295
|
+
"nvidia.com/mig-3g.40gb": "h100-mig-3g",
|
|
6296
|
+
"nvidia.com/mig-4g.40gb": "h100-mig-4g",
|
|
6297
|
+
"nvidia.com/mig-7g.80gb": "h100-mig-7g",
|
|
6298
|
+
}
|
|
6299
|
+
if pod.spec.containers:
|
|
6300
|
+
for c in pod.spec.containers:
|
|
6301
|
+
reqs = (c.resources.requests if c.resources and c.resources.requests else {}) or {}
|
|
6302
|
+
for r_name, sku in mig_resource_to_sku.items():
|
|
6303
|
+
if reqs.get(r_name):
|
|
6304
|
+
logger.info(
|
|
6305
|
+
f"Pod {pod_name} on {node_name} requests {r_name} -> MIG SKU {sku}"
|
|
6306
|
+
)
|
|
6307
|
+
return instance_type, sku
|
|
6308
|
+
|
|
6309
|
+
# Map instance type to GPU type for non-MIG (full GPU) pods
|
|
6254
6310
|
gpu_type_mapping = {
|
|
6255
6311
|
"g4dn.4xlarge": "T4",
|
|
6256
6312
|
"g4dn.8xlarge": "T4",
|
|
@@ -6261,6 +6317,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
|
|
|
6261
6317
|
"g6.12xlarge": "L4",
|
|
6262
6318
|
"g6.16xlarge": "L4",
|
|
6263
6319
|
"g6.24xlarge": "L4",
|
|
6320
|
+
"g7e.24xlarge": "rtxpro6000",
|
|
6264
6321
|
"p4d.24xlarge": "A100",
|
|
6265
6322
|
"p5.48xlarge": "H100",
|
|
6266
6323
|
"p5e.48xlarge": "H200",
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.8"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.5"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.6 → gpu_dev-0.5.8}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|