gpu-dev 0.3.8__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_dev-0.4.0/.github/workflows/no-gitlinks.yml +22 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/.gitignore +2 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/CLAUDE.md +27 -0
- {gpu_dev-0.3.8/cli-tools/gpu-dev-cli/gpu_dev.egg-info → gpu_dev-0.4.0}/PKG-INFO +2 -2
- {gpu_dev-0.3.8 → gpu_dev-0.4.0/cli-tools/gpu-dev-cli/gpu_dev.egg-info}/PKG-INFO +2 -2
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +1 -1
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -1
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +85 -72
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +9 -9
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +23 -10
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +103 -159
- gpu_dev-0.4.0/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +142 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/pyproject.toml +2 -2
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/alb.tf +1 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/Dockerfile +28 -64
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/build-with-efa.sh +16 -10
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/shell_env +7 -3
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/eks.tf +103 -61
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/availability_updater/index.py +27 -21
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py +539 -184
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/dns_utils.py +53 -19
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda.tf +2 -2
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/main.tf +166 -31
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/queue.tf +7 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy/proxy.py +4 -1
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy-service.tf +1 -1
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/templates/al2023-user-data.sh +53 -3
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/variables.tf +12 -11
- gpu_dev-0.3.8/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -106
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/PROGRESS.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/TODO.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/admin/README.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/admin/generate_stats.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/admin/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/post.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/setup.cfg +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.3.8 → gpu_dev-0.4.0}/terraform-gpu-devservers/templates/user-data.sh +0 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
name: Validate repository structure
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
branches:
|
|
7
|
+
- main
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
no-gitlinks:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- name: Checkout
|
|
14
|
+
uses: actions/checkout@v4
|
|
15
|
+
- name: Ensure no gitlinks are tracked
|
|
16
|
+
run: |
|
|
17
|
+
gitlinks=$(git ls-files -s | awk "$1 == 160000 {print}")
|
|
18
|
+
if [ -n "$gitlinks" ]; then
|
|
19
|
+
echo "Unexpected gitlinks found:"
|
|
20
|
+
echo "$gitlinks"
|
|
21
|
+
exit 1
|
|
22
|
+
fi
|
|
@@ -8,6 +8,8 @@ This will help both you, the agent, but also other agents down the road that sha
|
|
|
8
8
|
- NEVER run `terraform apply` or any destructive terraform commands
|
|
9
9
|
- You can run read-only terraform commands like `terraform plan`, `terraform state show`, etc.
|
|
10
10
|
- You can run AWS CLI commands for read-only resource fetching and analysis
|
|
11
|
+
- NEVER run destructive AWS CLI commands: `aws ec2 terminate-instances`, `aws ec2 stop-instances`, `aws autoscaling set-desired-capacity` (to 0), `aws ec2 delete-*`, `aws dynamodb delete-table`, etc. On 2026-03-09 an agent accidentally terminated 10 EC2 instances including 6 pet H100 instances from another team's capacity reservations. This must never happen again.
|
|
12
|
+
- NEVER run `kubectl delete node`, `kubectl drain`, `kubectl cordon`, or any command that removes/disrupts running workloads
|
|
11
13
|
- User will handle all infrastructure deployments themselves
|
|
12
14
|
- Note: We use OpenTofu, so user runs `opentofu apply` or `tf apply` locally (tf is aliased to opentofu)
|
|
13
15
|
- we use k for kubectl and have kubens configured to namespace gpu-dev
|
|
@@ -73,6 +75,31 @@ Currently we're working on a developer servers with GPUs in AWS. This means we'l
|
|
|
73
75
|
|
|
74
76
|
**K8s Decision:** EKS with GPU-optimized EC2 node groups (Fargate has no GPU support)
|
|
75
77
|
|
|
78
|
+
## Multi-Node NCCL Communication (Mar 2026)
|
|
79
|
+
|
|
80
|
+
**Working Configuration (SENDRECV protocol):**
|
|
81
|
+
- Protocol: `OFI_NCCL_PROTOCOL=SENDRECV` (host-staged EFA, avoids RDMA mr_regattr failures)
|
|
82
|
+
- GDR disabled: `FI_EFA_USE_DEVICE_RDMA=0`, `NCCL_NET_GDR_LEVEL=0`
|
|
83
|
+
- Socket interface: `NCCL_SOCKET_IFNAME=^lo,docker` (H100 nodes use enp71s0/enp72s0, NOT eth0)
|
|
84
|
+
- Algorithm: `NCCL_ALGO=ring,tree` (NCCL auto-selects tree for large messages, ~2x faster)
|
|
85
|
+
- Exclude Mellanox: `NCCL_IB_HCA=^mlx`
|
|
86
|
+
- OpenMPI lib path: `/opt/amazon/openmpi/lib` (NOT lib64 — EFA installer puts it in lib)
|
|
87
|
+
|
|
88
|
+
**Benchmark Results (2x p5.48xlarge, 16 GPUs):**
|
|
89
|
+
- Ring algorithm: ~9.5 GB/s avg bus bandwidth, ~13.4 GB/s peak
|
|
90
|
+
- Tree algorithm: ~21.4 GB/s avg bus bandwidth, ~33.6 GB/s peak
|
|
91
|
+
- Ring+tree combined: ~21.0 GB/s avg (NCCL auto-selects tree for large msgs)
|
|
92
|
+
- Single-node NVLink: ~34 GB/s (for reference)
|
|
93
|
+
|
|
94
|
+
**GDR Status (NOT working — future optimization):**
|
|
95
|
+
- EFA RDMA protocol fails: `fi_mr_regattr` returns EFAULT for flush buffer (even host memory)
|
|
96
|
+
- EFA device version: 6 (above aws-ofi-nccl blocklist threshold of 1-3)
|
|
97
|
+
- EFA kernel driver: 2.17.2a (need 2.17.3+ which has "Support P2P with NVIDIA 580 drivers")
|
|
98
|
+
- nvidia-peermem: NOT available (module not found for kernel 6.12.68)
|
|
99
|
+
- efa-nv-peermem: NOT installed (available in amzn-drivers repo, works with open NVIDIA drivers)
|
|
100
|
+
- To enable GDR in future: install efa-nv-peermem module on host nodes, or update EFA kernel driver
|
|
101
|
+
- Expected GDR improvement: ~300-370 GB/s bus bandwidth (vs ~33 GB/s current)
|
|
102
|
+
|
|
76
103
|
## Implementation Status (Jan 11, 2025)
|
|
77
104
|
|
|
78
105
|
### ✅ Completed and Working
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.3.8
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: CLI tool for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -12,7 +12,7 @@ Requires-Dist: pydantic>=2.5.0
|
|
|
12
12
|
Requires-Dist: rich>=13.7.0
|
|
13
13
|
Requires-Dist: pyyaml>=6.0.1
|
|
14
14
|
Requires-Dist: questionary>=2.1.1
|
|
15
|
-
Requires-Dist: websockets
|
|
15
|
+
Requires-Dist: websockets>=12.0
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
18
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-dev
|
|
3
|
-
Version: 0.3.8
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: CLI tool for PyTorch GPU developer server reservations
|
|
5
5
|
Author: PyTorch Team
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -12,7 +12,7 @@ Requires-Dist: pydantic>=2.5.0
|
|
|
12
12
|
Requires-Dist: rich>=13.7.0
|
|
13
13
|
Requires-Dist: pyyaml>=6.0.1
|
|
14
14
|
Requires-Dist: questionary>=2.1.1
|
|
15
|
-
Requires-Dist: websockets
|
|
15
|
+
Requires-Dist: websockets>=12.0
|
|
16
16
|
Requires-Dist: certifi>=2023.7.22
|
|
17
17
|
Requires-Dist: mcp>=1.0.0
|
|
18
18
|
|
|
@@ -95,7 +95,6 @@ def validate_ssh_key_matches_github_user(config: Config, live=None) -> Dict[str,
|
|
|
95
95
|
# Restart the spinner
|
|
96
96
|
if live:
|
|
97
97
|
live.start()
|
|
98
|
-
live.update(Spinner("dots", text="🔐 Validating SSH key..."))
|
|
99
98
|
|
|
100
99
|
# Check if we got the expected GitHub response
|
|
101
100
|
if "Hi " in ssh_output and "You've successfully authenticated" in ssh_output:
|
|
@@ -310,6 +310,12 @@ def _show_single_reservation(connection_info: dict) -> None:
|
|
|
310
310
|
oom_time_display = format_timestamp(last_oom_at) if last_oom_at else "Unknown"
|
|
311
311
|
oom_section = f"\n[red]⚠️ OOM Events:[/red] [red]{oom_count} OOM(s) detected (last: {oom_time_display})[/red]"
|
|
312
312
|
|
|
313
|
+
# Show pod internal IP for multinode reservations
|
|
314
|
+
pod_ip_info = ""
|
|
315
|
+
pod_ip = connection_info.get("pod_ip")
|
|
316
|
+
if pod_ip and connection_info.get("is_multinode"):
|
|
317
|
+
pod_ip_info = f"[blue]Internal IP:[/blue] {pod_ip}\n"
|
|
318
|
+
|
|
313
319
|
panel_content = (
|
|
314
320
|
f"[green]Reservation Details[/green]\n\n"
|
|
315
321
|
f"[blue]Quick Connect:[/blue] {connect_command}\n"
|
|
@@ -317,7 +323,8 @@ def _show_single_reservation(connection_info: dict) -> None:
|
|
|
317
323
|
+ vscode_info
|
|
318
324
|
+ jupyter_info
|
|
319
325
|
+ f"[blue]Pod Name:[/blue] {connection_info['pod_name']}\n"
|
|
320
|
-
|
|
326
|
+
+ pod_ip_info
|
|
327
|
+
+ f"[blue]GPUs:[/blue] {gpu_info}\n"
|
|
321
328
|
f"[blue]Instance Type:[/blue] {instance_type}\n"
|
|
322
329
|
+ secondary_users_info
|
|
323
330
|
+ f"[blue]Storage:[/blue] {disk_status}\n"
|
|
@@ -1408,59 +1415,42 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1408
1415
|
|
|
1409
1416
|
statuses_to_include = requested_statuses
|
|
1410
1417
|
else:
|
|
1411
|
-
# Default:
|
|
1418
|
+
# Default: active statuses only (fast path)
|
|
1419
|
+
# failed/cancelled are fetched separately and filtered to last hour
|
|
1412
1420
|
statuses_to_include = [
|
|
1413
|
-
"active", "preparing", "queued", "pending"
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1421
|
+
"active", "preparing", "queued", "pending"]
|
|
1422
|
+
|
|
1423
|
+
# For default view, fetch active statuses + recent failures in parallel
|
|
1424
|
+
if not status:
|
|
1425
|
+
from datetime import datetime, timezone, timedelta
|
|
1426
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
1427
|
+
one_hour_ago = (datetime.now(timezone.utc) - timedelta(hours=1)).isoformat()
|
|
1428
|
+
|
|
1429
|
+
def fetch_active():
|
|
1430
|
+
return reservation_mgr.list_reservations(
|
|
1431
|
+
user_filter=user_filter, statuses_to_include=statuses_to_include)
|
|
1432
|
+
|
|
1433
|
+
def fetch_recent_failures():
|
|
1434
|
+
return reservation_mgr.list_reservations(
|
|
1435
|
+
user_filter=user_filter,
|
|
1436
|
+
statuses_to_include=["failed", "cancelled"],
|
|
1437
|
+
created_after=one_hour_ago)
|
|
1438
|
+
|
|
1439
|
+
with ThreadPoolExecutor(max_workers=2) as executor:
|
|
1440
|
+
active_future = executor.submit(fetch_active)
|
|
1441
|
+
failures_future = executor.submit(fetch_recent_failures)
|
|
1442
|
+
reservations = active_future.result() + failures_future.result()
|
|
1443
|
+
else:
|
|
1444
|
+
reservations = reservation_mgr.list_reservations(
|
|
1445
|
+
user_filter=user_filter, statuses_to_include=statuses_to_include
|
|
1446
|
+
)
|
|
1418
1447
|
except RuntimeError as e:
|
|
1419
1448
|
rprint(f"[red]❌ {str(e)}[/red]")
|
|
1420
1449
|
return False
|
|
1421
1450
|
|
|
1422
1451
|
# Filter failed/cancelled reservations to only show recent ones (last hour)
|
|
1423
1452
|
if not status or "all" not in (status.split(",") if status else []):
|
|
1424
|
-
|
|
1425
|
-
from datetime import datetime, timezone, timedelta
|
|
1426
|
-
now = datetime.now(timezone.utc)
|
|
1427
|
-
one_hour_ago = now - timedelta(hours=1)
|
|
1428
|
-
|
|
1429
|
-
filtered_reservations = []
|
|
1430
|
-
for reservation in reservations:
|
|
1431
|
-
reservation_status = reservation.get("status", "unknown")
|
|
1432
|
-
if reservation_status in ["active", "preparing", "queued", "pending"]:
|
|
1433
|
-
# Always show active/pending reservations
|
|
1434
|
-
filtered_reservations.append(reservation)
|
|
1435
|
-
elif reservation_status in ["failed", "cancelled"]:
|
|
1436
|
-
# Only show failed/cancelled from last hour
|
|
1437
|
-
created_at = reservation.get("created_at")
|
|
1438
|
-
if created_at:
|
|
1439
|
-
try:
|
|
1440
|
-
if isinstance(created_at, str):
|
|
1441
|
-
if created_at.endswith("Z"):
|
|
1442
|
-
created_dt = datetime.fromisoformat(
|
|
1443
|
-
created_at.replace("Z", "+00:00"))
|
|
1444
|
-
elif "+" in created_at or created_at.endswith("00:00"):
|
|
1445
|
-
created_dt = datetime.fromisoformat(
|
|
1446
|
-
created_at)
|
|
1447
|
-
else:
|
|
1448
|
-
naive_dt = datetime.fromisoformat(
|
|
1449
|
-
created_at)
|
|
1450
|
-
created_dt = naive_dt.replace(
|
|
1451
|
-
tzinfo=timezone.utc)
|
|
1452
|
-
else:
|
|
1453
|
-
created_dt = datetime.fromtimestamp(
|
|
1454
|
-
created_at, tz=timezone.utc)
|
|
1455
|
-
|
|
1456
|
-
if created_dt >= one_hour_ago:
|
|
1457
|
-
filtered_reservations.append(reservation)
|
|
1458
|
-
except (ValueError, TypeError):
|
|
1459
|
-
# If timestamp parsing fails, include it to be safe
|
|
1460
|
-
filtered_reservations.append(reservation)
|
|
1461
|
-
else:
|
|
1462
|
-
# Include other statuses as-is
|
|
1463
|
-
filtered_reservations.append(reservation)
|
|
1453
|
+
filtered_reservations = reservations
|
|
1464
1454
|
|
|
1465
1455
|
reservations = filtered_reservations
|
|
1466
1456
|
|
|
@@ -1556,20 +1546,25 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
|
|
|
1556
1546
|
else:
|
|
1557
1547
|
queue_info = "Calculating..."
|
|
1558
1548
|
elif res_status == "active":
|
|
1559
|
-
# Show SSH
|
|
1560
|
-
|
|
1561
|
-
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
ssh_command.split("dev@")[1].split()[0]
|
|
1565
|
-
if "dev@" in ssh_command
|
|
1566
|
-
else "Ready"
|
|
1567
|
-
)
|
|
1568
|
-
queue_info = f"Ready: {node_info}"
|
|
1569
|
-
except (IndexError, AttributeError):
|
|
1570
|
-
queue_info = "Ready"
|
|
1549
|
+
# Show pod IP for multinode, SSH hint for single-node
|
|
1550
|
+
pod_ip = reservation.get("pod_ip", "")
|
|
1551
|
+
is_multinode = reservation.get("is_multinode", False)
|
|
1552
|
+
if is_multinode and pod_ip:
|
|
1553
|
+
queue_info = f"IP: {pod_ip}"
|
|
1571
1554
|
else:
|
|
1572
|
-
|
|
1555
|
+
ssh_command = reservation.get("ssh_command", "")
|
|
1556
|
+
if ssh_command and "dev@" in ssh_command:
|
|
1557
|
+
try:
|
|
1558
|
+
node_info = (
|
|
1559
|
+
ssh_command.split("dev@")[1].split()[0]
|
|
1560
|
+
if "dev@" in ssh_command
|
|
1561
|
+
else "Ready"
|
|
1562
|
+
)
|
|
1563
|
+
queue_info = f"Ready: {node_info}"
|
|
1564
|
+
except (IndexError, AttributeError):
|
|
1565
|
+
queue_info = "Ready"
|
|
1566
|
+
else:
|
|
1567
|
+
queue_info = "Ready"
|
|
1573
1568
|
|
|
1574
1569
|
# Format storage indicator - show disk name if available
|
|
1575
1570
|
disk_name = reservation.get("disk_name")
|
|
@@ -2471,11 +2466,14 @@ def _show_availability() -> None:
|
|
|
2471
2466
|
else:
|
|
2472
2467
|
wait_display = f"{hours}h {minutes}min"
|
|
2473
2468
|
|
|
2474
|
-
#
|
|
2475
|
-
|
|
2476
|
-
|
|
2477
|
-
|
|
2478
|
-
if
|
|
2469
|
+
# Check maintenance mode
|
|
2470
|
+
is_maintenance = info.get("maintenance", False)
|
|
2471
|
+
maintenance_reason = info.get("maintenance_reason", "")
|
|
2472
|
+
|
|
2473
|
+
if is_maintenance:
|
|
2474
|
+
available_display = f"[red]MAINTENANCE[/red]"
|
|
2475
|
+
wait_display = maintenance_reason or "Under maintenance"
|
|
2476
|
+
elif available == 0:
|
|
2479
2477
|
available_display = f"[red]{available}[/red]"
|
|
2480
2478
|
elif full_nodes_available > 0:
|
|
2481
2479
|
available_display = f"[green]{available}[/green]"
|
|
@@ -2485,9 +2483,9 @@ def _show_availability() -> None:
|
|
|
2485
2483
|
table.add_row(
|
|
2486
2484
|
gpu_type.upper(),
|
|
2487
2485
|
available_display,
|
|
2488
|
-
str(max_reservable),
|
|
2486
|
+
str(max_reservable) if not is_maintenance else "-",
|
|
2489
2487
|
str(total),
|
|
2490
|
-
str(queue_length),
|
|
2488
|
+
str(queue_length) if not is_maintenance else "-",
|
|
2491
2489
|
arch,
|
|
2492
2490
|
wait_display,
|
|
2493
2491
|
)
|
|
@@ -2576,6 +2574,7 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2576
2574
|
title="GPU Availability by Type (numbers are GPUs, not nodes)")
|
|
2577
2575
|
table.add_column("GPU Type", style="cyan")
|
|
2578
2576
|
table.add_column("Available", style="green")
|
|
2577
|
+
table.add_column("Max Reservable", style="blue")
|
|
2579
2578
|
table.add_column("Total", style="blue")
|
|
2580
2579
|
table.add_column("Queue Length", style="yellow")
|
|
2581
2580
|
table.add_column("Architecture", style="dim")
|
|
@@ -2588,11 +2587,13 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2588
2587
|
# Add separator before CPU section
|
|
2589
2588
|
if last_arch and not last_arch.startswith("CPU") and arch.startswith("CPU"):
|
|
2590
2589
|
table.add_row("---", "---", "---",
|
|
2591
|
-
"---", "---", "---")
|
|
2590
|
+
"---", "---", "---", "---")
|
|
2592
2591
|
|
|
2593
2592
|
last_arch = arch
|
|
2594
2593
|
available = info.get("available", 0)
|
|
2594
|
+
max_reservable = info.get("max_reservable", 0)
|
|
2595
2595
|
total = info.get("total", 0)
|
|
2596
|
+
full_nodes_available = info.get("full_nodes_available", 0)
|
|
2596
2597
|
queue_length = info.get("queue_length", 0)
|
|
2597
2598
|
est_wait = info.get("estimated_wait_minutes", 0)
|
|
2598
2599
|
|
|
@@ -2611,17 +2612,26 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2611
2612
|
else:
|
|
2612
2613
|
wait_display = f"{hours}h {minutes}min"
|
|
2613
2614
|
|
|
2614
|
-
#
|
|
2615
|
-
|
|
2615
|
+
# Check maintenance mode
|
|
2616
|
+
is_maintenance = info.get("maintenance", False)
|
|
2617
|
+
maintenance_reason = info.get("maintenance_reason", "")
|
|
2618
|
+
|
|
2619
|
+
if is_maintenance:
|
|
2620
|
+
available_display = f"[red]MAINTENANCE[/red]"
|
|
2621
|
+
wait_display = maintenance_reason or "Under maintenance"
|
|
2622
|
+
elif available == 0:
|
|
2623
|
+
available_display = f"[red]{available}[/red]"
|
|
2624
|
+
elif full_nodes_available > 0:
|
|
2616
2625
|
available_display = f"[green]{available}[/green]"
|
|
2617
2626
|
else:
|
|
2618
|
-
available_display = f"[
|
|
2627
|
+
available_display = f"[yellow]{available}[/yellow]"
|
|
2619
2628
|
|
|
2620
2629
|
table.add_row(
|
|
2621
2630
|
gpu_type.upper(),
|
|
2622
2631
|
available_display,
|
|
2632
|
+
str(max_reservable) if not is_maintenance else "-",
|
|
2623
2633
|
str(total),
|
|
2624
|
-
str(queue_length),
|
|
2634
|
+
str(queue_length) if not is_maintenance else "-",
|
|
2625
2635
|
arch,
|
|
2626
2636
|
wait_display,
|
|
2627
2637
|
)
|
|
@@ -3505,6 +3515,9 @@ def edit(
|
|
|
3505
3515
|
# Stop spinner before validation and operations
|
|
3506
3516
|
live.stop()
|
|
3507
3517
|
|
|
3518
|
+
# Use the full reservation_id from connection_info (not the user-provided prefix)
|
|
3519
|
+
reservation_id = connection_info["reservation_id"]
|
|
3520
|
+
|
|
3508
3521
|
if connection_info["status"] != "active":
|
|
3509
3522
|
rprint(
|
|
3510
3523
|
f"[red]❌ Can only edit active reservations (current status: {connection_info['status']})[/red]"
|
|
@@ -67,15 +67,15 @@ class Config:
|
|
|
67
67
|
|
|
68
68
|
def _create_aws_session(self):
|
|
69
69
|
"""Create AWS session with profile support"""
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
70
|
+
available_profiles = boto3.Session().available_profiles
|
|
71
|
+
if "gpu-dev" in available_profiles:
|
|
72
|
+
try:
|
|
73
|
+
session = boto3.Session(profile_name="gpu-dev")
|
|
74
|
+
session.get_credentials()
|
|
75
|
+
return session
|
|
76
|
+
except Exception:
|
|
77
|
+
pass
|
|
78
|
+
return boto3.Session()
|
|
79
79
|
|
|
80
80
|
@property
|
|
81
81
|
def sts_client(self):
|
|
@@ -92,8 +92,14 @@ def select_gpu_type_interactive(
|
|
|
92
92
|
wait_display = f"{hours}h {minutes}min"
|
|
93
93
|
status_indicator = "⏳"
|
|
94
94
|
|
|
95
|
-
#
|
|
96
|
-
|
|
95
|
+
# Check maintenance mode
|
|
96
|
+
is_maintenance = info.get("maintenance", False)
|
|
97
|
+
maintenance_reason = info.get("maintenance_reason", "")
|
|
98
|
+
|
|
99
|
+
if is_maintenance:
|
|
100
|
+
available_display = f"[red]MAINTENANCE[/red]"
|
|
101
|
+
wait_display = maintenance_reason or "Under maintenance"
|
|
102
|
+
elif available > 0:
|
|
97
103
|
available_display = f"[green]{available}[/green]"
|
|
98
104
|
else:
|
|
99
105
|
available_display = f"[red]{available}[/red]"
|
|
@@ -102,18 +108,25 @@ def select_gpu_type_interactive(
|
|
|
102
108
|
gpu_type.upper(),
|
|
103
109
|
available_display,
|
|
104
110
|
str(total),
|
|
105
|
-
str(queue_length),
|
|
111
|
+
str(queue_length) if not is_maintenance else "-",
|
|
106
112
|
wait_display,
|
|
107
113
|
)
|
|
108
114
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
+
if is_maintenance:
|
|
116
|
+
choices.append(questionary.Choice(
|
|
117
|
+
title=f"🔧 {gpu_type.upper()} - MAINTENANCE: {maintenance_reason}",
|
|
118
|
+
value=gpu_type,
|
|
119
|
+
disabled="Under maintenance",
|
|
120
|
+
))
|
|
121
|
+
else:
|
|
122
|
+
# Create choice label with status
|
|
123
|
+
choice_label = (
|
|
124
|
+
f"{status_indicator} {gpu_type.upper()} ({available}/{total} available)"
|
|
125
|
+
)
|
|
126
|
+
if queue_length > 0:
|
|
127
|
+
choice_label += f" - {queue_length} in queue"
|
|
115
128
|
|
|
116
|
-
|
|
129
|
+
choices.append(questionary.Choice(title=choice_label, value=gpu_type))
|
|
117
130
|
|
|
118
131
|
console.print(table)
|
|
119
132
|
console.print()
|