gpu-dev 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_dev-0.3.5/.gitignore +73 -0
- gpu_dev-0.3.5/CLAUDE.md +288 -0
- gpu_dev-0.3.5/PKG-INFO +687 -0
- gpu_dev-0.3.5/admin/README.md +50 -0
- gpu_dev-0.3.5/admin/generate_stats.py +1004 -0
- gpu_dev-0.3.5/admin/requirements.txt +5 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/README.md +669 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +87 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +687 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +104 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +1 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +4 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +10 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +1 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +9 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +158 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +3754 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +248 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +523 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +702 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +117 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +2231 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +106 -0
- gpu_dev-0.3.5/cli-tools/gpu-dev-cli/minimal-iam-policy.json +32 -0
- gpu_dev-0.3.5/cli-tools/scripts/clear_stale_disk_locks.py +215 -0
- gpu_dev-0.3.5/docs/USER_GUIDE.md +1413 -0
- gpu_dev-0.3.5/docs/devgpu-features.html +537 -0
- gpu_dev-0.3.5/docs/docker-mark-blue.svg +12 -0
- gpu_dev-0.3.5/docs/icons8-cursor-ai.svg +1 -0
- gpu_dev-0.3.5/pyproject.toml +35 -0
- gpu_dev-0.3.5/setup.cfg +4 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/.terraform.lock.hcl +108 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/README.md +333 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/alb.tf +297 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/availability.tf +264 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/backend.tf +9 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/.dockerignore +22 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/Dockerfile +186 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/backup-dotfiles +74 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/bash_profile +15 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/bashrc +22 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/bashrc_ext +26 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +8 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/list-dotfile-versions +69 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/motd_script +116 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/nproc_wrapper +32 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/profile +15 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/restore-dotfiles +38 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/restore-dotfiles-version +79 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/setup-dotfiles-persistence +83 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/shell_env +19 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/ssh_config +14 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/zprofile +10 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/zshrc +58 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker/zshrc_ext +27 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker-build.tf +150 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker-example/Dockerfile +95 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/docker-example/hello.txt +1 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/ecr.tf +169 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/efs.tf +119 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/eks.tf +559 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/expiry.tf +232 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/kubernetes.tf +592 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/availability_updater/index.py +367 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +2 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +183 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/reservation_expiry/index.py +1846 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +3 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +481 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/reservation_processor/index.py +7904 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +3 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/shared/__init__.py +8 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/shared/alb_utils.py +331 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/shared/dns_utils.py +456 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/shared/k8s_client.py +125 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +255 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/shared/requirements.txt +3 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +567 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/lambda.tf +300 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/main.tf +565 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +382 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +337 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/migrations/check_snapshots.py +62 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +454 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/migrations/run_backfill.sh +183 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/monitoring.tf +988 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/outputs.tf +82 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/pyproject.toml +43 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/queue.tf +142 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/route53.tf +203 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/s3-disk-contents.tf +69 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +140 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +111 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +145 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/scripts/inspect_user_data.sh +258 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/ssh-proxy/Dockerfile +25 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/ssh-proxy/proxy.py +231 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/ssh-proxy/requirements.txt +2 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/ssh-proxy-service.tf +383 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/ssh-proxy.tf +67 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/switch-to.sh +67 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +57 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/templates/al2023-user-data.sh +116 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/templates/user-data-self-managed.sh +55 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/templates/user-data.sh +30 -0
- gpu_dev-0.3.5/terraform-gpu-devservers/variables.tf +174 -0
gpu_dev-0.3.5/.gitignore
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
pip-wheel-metadata/
|
|
20
|
+
share/python-wheels/
|
|
21
|
+
*.egg-info/
|
|
22
|
+
.installed.cfg
|
|
23
|
+
*.egg
|
|
24
|
+
MANIFEST
|
|
25
|
+
|
|
26
|
+
# Virtual environments
|
|
27
|
+
venv/
|
|
28
|
+
env/
|
|
29
|
+
ENV/
|
|
30
|
+
env.bak/
|
|
31
|
+
venv.bak/
|
|
32
|
+
|
|
33
|
+
# Terraform
|
|
34
|
+
*.tfstate
|
|
35
|
+
*.tfstate.*
|
|
36
|
+
*.tfvars
|
|
37
|
+
.terraform/
|
|
38
|
+
.terraform.lock.hcl
|
|
39
|
+
crash.log
|
|
40
|
+
crash.*.log
|
|
41
|
+
override.tf
|
|
42
|
+
override.tf.json
|
|
43
|
+
*_override.tf
|
|
44
|
+
*_override.tf.json
|
|
45
|
+
.terraformrc
|
|
46
|
+
terraform.rc
|
|
47
|
+
|
|
48
|
+
# Lambda packages
|
|
49
|
+
*.zip
|
|
50
|
+
lambda/*/package/
|
|
51
|
+
|
|
52
|
+
# IDE
|
|
53
|
+
.vscode/
|
|
54
|
+
.idea/
|
|
55
|
+
*.swp
|
|
56
|
+
*.swo
|
|
57
|
+
*~
|
|
58
|
+
.DS_Store
|
|
59
|
+
|
|
60
|
+
# Logs
|
|
61
|
+
*.log
|
|
62
|
+
|
|
63
|
+
# Local config
|
|
64
|
+
.env
|
|
65
|
+
.env.local
|
|
66
|
+
|
|
67
|
+
# Grafana Cloud secrets
|
|
68
|
+
**/grafana-cloud.tfvars
|
|
69
|
+
**/grafana-cloud.auto.tfvars
|
|
70
|
+
**/secrets.auto.tfvars
|
|
71
|
+
|
|
72
|
+
# Admin output files
|
|
73
|
+
admin/output/
|
gpu_dev-0.3.5/CLAUDE.md
ADDED
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
# Agent notes
|
|
2
|
+
|
|
3
|
+
The first part of this doc is the dev's description of the repo. Everything under the 'AGENT SECTION' is for you, the agent, to update state, tricky things, what we're working on and more.
|
|
4
|
+
This will help both you, the agent, but also other agents down the road that share the responsibility of this repo management to navigate the repo.
|
|
5
|
+
|
|
6
|
+
## Agent restrictions
|
|
7
|
+
|
|
8
|
+
- NEVER run `terraform apply` or any destructive terraform commands
|
|
9
|
+
- You can run read-only terraform commands like `terraform plan`, `terraform state show`, etc.
|
|
10
|
+
- You can run AWS CLI commands for read-only resource fetching and analysis
|
|
11
|
+
- User will handle all infrastructure deployments themselves
|
|
12
|
+
- Note: We use OpenTofu, so user runs `opentofu apply` or `tf apply` locally (tf is aliased to opentofu)
|
|
13
|
+
- we use k for kubectl and have kubens configured to namespace gpu-dev
|
|
14
|
+
|
|
15
|
+
## Development style
|
|
16
|
+
|
|
17
|
+
We like compact code, comments when needed, but only if they add value. For example, a variable called 'number_of_threads' does not need a comment saying that it contains the number of threads.
|
|
18
|
+
We like tested code.
|
|
19
|
+
|
|
20
|
+
For frontend code we use yarn, yarn format, yarn tsc. yarn dev to run code, but leave it up to the dev to run that one.
|
|
21
|
+
For terraform, we use opentofu, don't ever run tf apply directly. You're free to run tf state/plan and other non-breaking commands though.
|
|
22
|
+
|
|
23
|
+
**Python Code Style:**
|
|
24
|
+
|
|
25
|
+
- Always put imports at the top of the file, never inside functions or methods
|
|
26
|
+
- Group imports in standard order: standard library, third-party, local imports
|
|
27
|
+
- Use absolute imports when possible
|
|
28
|
+
|
|
29
|
+
## Content
|
|
30
|
+
|
|
31
|
+
- torchci - a next.js app containing a PyTorch CI tracker
|
|
32
|
+
- aws - a bunch of lambdas & amis that are used in the tf module
|
|
33
|
+
- terraform-aws-github-runner - the definition of the repo's tofu modules. These modules are used in another repo to be deployed.
|
|
34
|
+
- cli-tools - the home of the gpu-dev cli tool that is used for creating/listing/cancelling reservations
|
|
35
|
+
|
|
36
|
+
## Current challenge and WIP
|
|
37
|
+
|
|
38
|
+
Currently we're working on developer servers with GPUs in AWS. This means we'll need:
|
|
39
|
+
|
|
40
|
+
- a CLI tool for devs to reserve a server [DONE]
|
|
41
|
+
- a queue of open requests [DONE]
|
|
42
|
+
- a reservation for 2 EC2 H100 servers
|
|
43
|
+
- a way for devs to specify if they want 1/2/4/8 GPUs of a server [DONE]
|
|
44
|
+
- later, a way for devs to specify 2x8 GPUs, so they want a connected 2 server setup reserved for X hours
|
|
45
|
+
- we care about NIC connection - NVLINK or as fast as possible in one region / subregion.
|
|
46
|
+
- a lambda to process items from the queue if servers are available [DONE]
|
|
47
|
+
- a managed k8s to reserve, start a pod, interactive, and reserve that one for X hours for the dev (configurable) [DONE]
|
|
48
|
+
- auth can be through github public keys, all devs already have those exposed. This should be for devs with commit access to pytorch/pytorch only though. And part of metamates group in Github. [DONE]
|
|
49
|
+
|
|
50
|
+
# AGENT SECTION
|
|
51
|
+
|
|
52
|
+
## Issues I found with the description above
|
|
53
|
+
|
|
54
|
+
- I am not sure terraform-aws-github-runner is correctly described. Next time I go over this code for maintenance or adding something, I'll inform the user of what I think should change. This is not an active goal though, just a sidequest.
|
|
55
|
+
- The user asked for NIC connections. I still need to figure out how fast they are and what's available @ AWS. When I do that, I'll update this section below:
|
|
56
|
+
|
|
57
|
+
## NIC explanation in AWS
|
|
58
|
+
|
|
59
|
+
**EFA (Elastic Fabric Adapter):**
|
|
60
|
+
|
|
61
|
+
- Low-latency, high-throughput networking for HPC/AI workloads
|
|
62
|
+
- 3200 Gbps bandwidth on p5.48xlarge instances
|
|
63
|
+
- RDMA support, bypasses kernel for direct hardware access
|
|
64
|
+
- Integrates with NVIDIA NCCL for multi-GPU communication
|
|
65
|
+
- **Critical limitation**: Cannot cross Availability Zones - all instances must be in same AZ
|
|
66
|
+
|
|
67
|
+
**H100 Instance Performance (p5.48xlarge):**
|
|
68
|
+
|
|
69
|
+
- 8x NVIDIA H100 GPUs (80GB each = 640GB total GPU memory)
|
|
70
|
+
- Within instance: GPUs use NVLINK for direct communication
|
|
71
|
+
- Between instances: EFA provides fastest networking option
|
|
72
|
+
- Single AZ placement group recommended for best performance
|
|
73
|
+
|
|
74
|
+
**K8s Decision:** EKS with GPU-optimized EC2 node groups (Fargate has no GPU support)
|
|
75
|
+
|
|
76
|
+
## Implementation Status (Jan 11, 2025)
|
|
77
|
+
|
|
78
|
+
### ✅ Completed and Working
|
|
79
|
+
|
|
80
|
+
- **Infrastructure**: Dual-mode EKS with managed vs self-managed node groups for faster development
|
|
81
|
+
- **Networking**: Full DNS resolution and internet access for pods (CoreDNS + security groups fixed)
|
|
82
|
+
- **SSH Access**: Complete SSH server setup with proper package installation and daemon startup
|
|
83
|
+
- **Authentication**: GitHub public key fetching (ALL user keys, not just first one)
|
|
84
|
+
- **CLI Features**: Float hours support (e.g., --hours 0.25 for 15 minutes)
|
|
85
|
+
- **Reservation Display**: CLI list command shows formatted expiration times (YYYY-MM-DD HH:MM:SS)
|
|
86
|
+
- **Security Groups**: Full connectivity - kubelet (10250), control plane (443), DNS (53), NodePort (30000-32767)
|
|
87
|
+
- **Python CLI tool**: Commands: reserve, list, config with real-time polling
|
|
88
|
+
- **SQS + Lambda**: Async queue processing system with DynamoDB state tracking
|
|
89
|
+
- **Kubernetes**: Pod creation with GPU allocation, NodePort services, init containers
|
|
90
|
+
- **Expiry System**: Timestamp-based expiration tracking with historical records (TTL disabled)
|
|
91
|
+
- **DynamoDB**: Reservations kept as historical records, not auto-deleted
|
|
92
|
+
- **SSORole + instructions for that** - Implement SSO role authentication and provide setup instructions
|
|
93
|
+
- **Rename G6 to L4** - Update G6 references to L4 (similar to T4 GPU type naming)
|
|
94
|
+
- **Add network drive (EFS)** - Implement 20TB EFS shared storage mounted at /shared with user folders
|
|
95
|
+
- **GPU Profiling Support** - Added NVIDIA profiling capabilities for all pods:
|
|
96
|
+
- Node-level: Added `options nvidia NVreg_RestrictProfilingToAdminUsers=0` to `/etc/modprobe.d/nvprof.conf` in node bootstrap script - automatically configured on ALL new GPU nodes
|
|
97
|
+
- Bootstrap: Configuration added at `terraform-gpu-devservers/templates/al2023-user-data.sh:17-19` (applied BEFORE NVIDIA driver installation to avoid auto-load issue)
|
|
98
|
+
- Pod-level: Added Linux capability `SYS_ADMIN` to all GPU pods (required for NVIDIA profiling tools like ncu/nsys)
|
|
99
|
+
- Environment: Set `NVIDIA_DRIVER_CAPABILITIES=compute,utility` (note: `profile` is NOT supported by NVIDIA device plugin)
|
|
100
|
+
- Location: `terraform-gpu-devservers/lambda/reservation_processor/index.py:4000` and `:3984`
|
|
101
|
+
- **GPU Monitoring with Grafana** - Added full GPU monitoring stack:
|
|
102
|
+
- DCGM Exporter enabled in GPU Operator with anti-affinity for profiling nodes
|
|
103
|
+
- kube-prometheus-stack deployed with 50GB persistent storage (15-day retention)
|
|
104
|
+
- Grafana accessible via NodePort 30080 on any node IP
|
|
105
|
+
- Pre-loaded NVIDIA DCGM dashboard (Grafana ID 12239) + custom GPU Overview dashboard
|
|
106
|
+
- Configuration: `terraform-gpu-devservers/monitoring.tf`
|
|
107
|
+
|
|
108
|
+
## GPU Monitoring & Profiling Node Setup (Dec 2025)
|
|
109
|
+
|
|
110
|
+
**Architecture:**
|
|
111
|
+
- DCGM Exporter runs on ALL GPU nodes EXCEPT profiling-dedicated nodes
|
|
112
|
+
- Profiling-dedicated nodes: ONE H100 and ONE B200 node reserved for Nsight profiling
|
|
113
|
+
- DCGM and Nsight conflict because both need exclusive GPU access
|
|
114
|
+
|
|
115
|
+
**Profiling Node Labeling (manual, one-time setup after `tf apply`):**
|
|
116
|
+
```bash
|
|
117
|
+
# List H100 nodes and pick ONE for profiling
|
|
118
|
+
kubectl get nodes -l gpu-type=h100
|
|
119
|
+
|
|
120
|
+
# Label one H100 node as profiling-dedicated (DCGM will NOT run on this node)
|
|
121
|
+
kubectl label node <h100-node-name> gpu.monitoring/profiling-dedicated=true
|
|
122
|
+
|
|
123
|
+
# List B200 nodes and pick ONE for profiling
|
|
124
|
+
kubectl get nodes -l gpu-type=b200
|
|
125
|
+
|
|
126
|
+
# Label one B200 node as profiling-dedicated
|
|
127
|
+
kubectl label node <b200-node-name> gpu.monitoring/profiling-dedicated=true
|
|
128
|
+
|
|
129
|
+
# Verify labels
|
|
130
|
+
kubectl get nodes -l gpu.monitoring/profiling-dedicated=true
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**Grafana Access:**
|
|
134
|
+
```bash
|
|
135
|
+
# Get any node IP
|
|
136
|
+
kubectl get nodes -o wide
|
|
137
|
+
|
|
138
|
+
# Access Grafana at: http://<node-ip>:30080
|
|
139
|
+
# Default credentials: admin / (value of grafana_admin_password variable)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**Available Dashboards:**
|
|
143
|
+
- NVIDIA DCGM Exporter Dashboard (pre-configured from Grafana community)
|
|
144
|
+
- GPU Overview (custom dashboard with utilization, memory, temp, power)
|
|
145
|
+
|
|
146
|
+
**Troubleshooting:**
|
|
147
|
+
```bash
|
|
148
|
+
# Check DCGM pods are running (should NOT be on profiling nodes)
|
|
149
|
+
kubectl get pods -n gpu-operator -l app=nvidia-dcgm-exporter -o wide
|
|
150
|
+
|
|
151
|
+
# Verify Prometheus is scraping DCGM
|
|
152
|
+
kubectl port-forward -n monitoring svc/kube-prometheus-stack-prometheus 9090:9090
|
|
153
|
+
# Then open http://localhost:9090 and query: DCGM_FI_DEV_GPU_UTIL
|
|
154
|
+
|
|
155
|
+
# Check Grafana pods
|
|
156
|
+
kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Recent Fixes (Oct 27, 2025)
|
|
160
|
+
|
|
161
|
+
**NVIDIA Profiling Bootstrap Configuration (Oct 27, 2025):**
|
|
162
|
+
- **Bug Found**: NVIDIA driver installation (`dnf install nvidia-driver`) automatically loads kernel modules during install, so config must be created BEFORE driver installation, not just before explicit modprobe
|
|
163
|
+
- **Fix**: Moved `echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" > /etc/modprobe.d/nvprof.conf` to line 19 (before driver install at line 23)
|
|
164
|
+
- **Previous Location**: Line 59-60 (after driver install) - TOO LATE, modules already loaded during dnf install
|
|
165
|
+
- **New Location**: `terraform-gpu-devservers/templates/al2023-user-data.sh:17-19` (before driver installation)
|
|
166
|
+
- **Benefit**: All new GPU nodes will have profiling enabled automatically without requiring manual configuration or reboots
|
|
167
|
+
- **Rollout**: Run `tf apply` to update launch template, then terminate existing nodes so ASG recreates them with new bootstrap script
|
|
168
|
+
|
|
169
|
+
## Recent Fixes (Oct 8, 2025)
|
|
170
|
+
|
|
171
|
+
**Kubelet Auto-Start Issue on T4 Nodes:**
|
|
172
|
+
- **Problem**: After rebooting T4 nodes to apply NVIDIA profiling config, kubelet didn't auto-start
|
|
173
|
+
- **Root Cause**: `systemctl enable kubelet` wasn't being called during node bootstrap
|
|
174
|
+
- **Temporary Fix**: Manually enabled and started kubelet on all 5 T4 nodes via SSH
|
|
175
|
+
- **Future**: Nodes should be terminated and recreated by ASG to get fresh bootstrap (user-data runs nodeadm which should enable kubelet)
|
|
176
|
+
|
|
177
|
+
**Decimal/Float Type Error in Lambda:**
|
|
178
|
+
- **Problem**: `unsupported operand type(s) for *: 'decimal.Decimal' and 'float'` error when allocating GPU resources
|
|
179
|
+
- **Root Cause**: DynamoDB returns numbers as `Decimal` type, but Lambda code was multiplying with Python floats
|
|
180
|
+
- **Fix**: Added `gpu_count = int(gpu_count)` at start of `get_pod_resource_limits()` and `get_pod_resource_requests()` functions
|
|
181
|
+
- **Location**: `terraform-gpu-devservers/lambda/reservation_processor/index.py:3034` and `:3117`
|
|
182
|
+
|
|
183
|
+
**NVIDIA Profiling Configuration:**
|
|
184
|
+
- **Problem 1**: Pods failed with "unsupported capabilities found in 'compute,profile,utility' (allowed 'compute,utility')"
|
|
185
|
+
- Fix: Removed `profile` from `NVIDIA_DRIVER_CAPABILITIES`, kept only `compute,utility`
|
|
186
|
+
- **Problem 2**: Profiling failed with "driver resource unavailable" even with `CAP_PERFMON` and `CAP_SYS_PTRACE`
|
|
187
|
+
- Fix: Changed to `CAP_SYS_ADMIN` which is required for NVIDIA GPU profiling (ncu, nsys)
|
|
188
|
+
- **Root Cause**: NVIDIA profiling tools need full SYS_ADMIN capability to access driver resources
|
|
189
|
+
- **Final Config**: `SYS_ADMIN` capability + node-level `NVreg_RestrictProfilingToAdminUsers=0`
|
|
190
|
+
- **Location**: `terraform-gpu-devservers/lambda/reservation_processor/index.py:4000` and `:3984`
|
|
191
|
+
|
|
192
|
+
**No Persistent Disk Flag (Oct 8, 2025):**
|
|
193
|
+
- **Problem**: When user created 2nd reservation and confirmed "continue without persistent disk", Lambda waited 60s for disk detachment, timed out, set status to "failed", but then CONTINUED execution and restored from snapshot anyway
|
|
194
|
+
- **Root Cause 1**: The timeout logic at line 305 raised `RuntimeError` which was caught by outer try-except block at line 2108, but `persistent_volume_id` variable remained set from earlier operations, so pod creation still used a persistent disk
|
|
195
|
+
- **Root Cause 2**: Exception handler at line 2275 only set `use_persistent_disk = False` but didn't clear `persistent_volume_id`, so any disk created/restored before the exception would still be attached to the pod
|
|
196
|
+
- **Fix Part 1 - Explicit Flag**: Added `no_persistent_disk` flag that flows from CLI through SQS to Lambda
|
|
197
|
+
- CLI: When user confirms to continue without persistent disk, sets `no_persistent_disk=True` in SQS message
|
|
198
|
+
- Lambda: Checks `no_persistent_disk` flag early (line 2087-2090) and skips ALL persistent disk logic if true
|
|
199
|
+
- Files: `cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py:914`, `reservations.py:396,450,487,544`, `lambda/reservation_processor/index.py:2087-2090`
|
|
200
|
+
- **Fix Part 2 - Exception Cleanup**: Updated exception handler at line 2275 to properly clean up state
|
|
201
|
+
- Sets `persistent_volume_id = None` to clear any volume created before the error
|
|
202
|
+
- Sets `is_new_disk = True` so EmptyDir gets proper shell environment setup
|
|
203
|
+
- Location: `lambda/reservation_processor/index.py:2279-2280`
|
|
204
|
+
- **Benefit**: No more waiting for disk detachment, no snapshot restoration, clean EmptyDir volume from the start. Even if disk operations fail mid-way, exception handler ensures no disk is attached.
|
|
205
|
+
|
|
206
|
+
### 📋 Remaining Tasks
|
|
207
|
+
|
|
208
|
+
- **FQDN for devservers** - Set up proper domain names for development server access
|
|
209
|
+
- **Automated SSH config per reservation** - ✅ DONE - Each reservation now gets `~/.devgpu/<reservation_id>-sshconfig` file, use with `ssh -F ~/.devgpu/<reservation_id>-sshconfig <pod_name>`
|
|
210
|
+
- **Custom Docker image scaffold** - Create Dockerfile with pre-installed packages (Jupyter, etc.)
|
|
211
|
+
- **Add Docker CI image run** - allow user to specify gpu-dev ci-debug <testurl> that downloads that docker-image and goes for it
|
|
212
|
+
- **Increase /dev/shm for NCCL** - Bump /dev/shm space from 64MB for NCCL requirements (https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#docker)
|
|
213
|
+
- **Add nvcuvid.so support** - Enable NCU (NVIDIA Nsight Compute) support with nvcuvid.so library
|
|
214
|
+
|
|
215
|
+
- **Make gpu-type case agnostic** - Allow case-insensitive GPU type parameters (e.g., h100, H100, HuNdred should all work)
|
|
216
|
+
- **Error on non-existing GPU type** - Error out if people ask for a non-existing GPU type
|
|
217
|
+
- **Error on too many GPUs** - Error out if people ask for more GPUs than available in node (8 for H100/B200, 4 for T4, etc.)
|
|
218
|
+
- **Fix GPU SKU validation** - Add proper error handling for non-existing/unavailable GPU types (e.g., user requesting A100 when only T4 available should get immediate error, not pending pod that will never schedule)
|
|
219
|
+
- **Set HuggingFace cache location** - Set HF_HOME or XDG_CACHE_HOME to /tmp or /workspace so HuggingFace doesn't fill up user home directories with model downloads
|
|
220
|
+
- **Add verbose CLI output** - More detailed status and progress information for debugging
|
|
221
|
+
- **Interactive CLI for cancel/edit** - Make `gpu-dev cancel` and `gpu-dev edit` interactive when no reservation ID specified - show list with up/down arrow selection
|
|
222
|
+
- **Default reservation edit/cancel** - Auto-select reservation if user only has one active
|
|
223
|
+
- **Add a command gpu-dev availability** that shows how many gpus of each type are available to reserve at the moment, and if 0, what the estimated queue time is
|
|
224
|
+
- **Production deployment** - Switch to p5.48xlarge instances when ready
|
|
225
|
+
- **Investigate NFS** - Research NFS integration for shared storage across pods
|
|
226
|
+
- **Persistent disk** - Implement persistent disk storage for user data across sessions
|
|
227
|
+
- **Validate CUDA version** - Add CUDA version validation and display in container startup
|
|
228
|
+
- **Validate NVIDIA driver version** - Display and validate NVIDIA driver version
|
|
229
|
+
- **Test wall messages** - Verify that wall message functionality works correctly
|
|
230
|
+
- **Validate if expiration works as expected** - Test and verify pod cleanup and reservation expiry process
|
|
231
|
+
- **Simplify code + clean up** - Refactor and clean up codebase for maintainability
|
|
232
|
+
- **Add Docker** - Install and configure Docker in development containers - maybe --docker at reserve, which will use dind if possible to the container (to investigate how feasible)
|
|
233
|
+
- **Add ghstack** - Install ghstack tool for GitHub stack management
|
|
234
|
+
- **Improve debugging and observability** - Add better CLI feedback for pod status, container logs, and error details. Current debugging experience is poor - users need kubectl/aws cli knowledge to debug issues. CLI should show:
|
|
235
|
+
- Real-time pod startup logs during `gpu-dev reserve`
|
|
236
|
+
- Container error messages when pods fail
|
|
237
|
+
- Image pull status and errors
|
|
238
|
+
- Resource allocation details
|
|
239
|
+
- More detailed error messages with troubleshooting hints
|
|
240
|
+
- **Add CloudWatch logs for pods** - Store pod logs in CloudWatch for better debugging and monitoring
|
|
241
|
+
- **Add tests for everything** - Implement comprehensive test suite for all components
|
|
242
|
+
- **Investigate multi node communication** - Research inter-node networking for multi-GPU setups
|
|
243
|
+
- **Switch between H100/B200 GPU types** - Add `--gpu-type=b200` CLI option with separate queues per GPU type
|
|
244
|
+
- **GPU queue status command** - Add status command to show queue length per GPU type (eg, `gpu-dev queue-status`)
|
|
245
|
+
- **Jupyter notebook integration** - Add `--jupyter` flag to enable Jupyter notebook and TensorBoard access
|
|
246
|
+
- **Add user collaboration feature** - Add `--add-user <github_name>` flag to allow users to add someone to the server
|
|
247
|
+
- **Display Bug:** - CLI shows "G6" instead of "L4" in availability table - likely resolves on prod release when Lambda functions are updated with new GPU type mappings
|
|
248
|
+
- **Fix extend command warning cleanup** - When using `--extend`, the system doesn't remove the WARN_EXPIRES_IN_5MIN.txt file and doesn't reset the expiry warning tracking in the database. Need to either clear the warning state from the table or keep warning history elsewhere for auditing purposes
|
|
249
|
+
- **Max reservation time: 48 hours** - Maximum reservation duration is 48 hours (initial 24h + one 24h extension allowed)
|
|
250
|
+
- **Scale up T4 instances** - Add 3 more T4 nodes (g4dn.12xlarge) to cluster
|
|
251
|
+
- **Scale up L4 instances** - Add 3 more L4 nodes (g6.12xlarge) to cluster
|
|
252
|
+
- **Add on-demand H100/H200/B200 capacity** - Add at least 2 nodes each of H100 (p5.48xlarge), H200 (p5e.48xlarge), and B200 (p6-b200.48xlarge) as on-demand capacity in addition to existing reserved instances
|
|
253
|
+
- **Future features**:
|
|
254
|
+
- Multi-server (16 GPU) reservations
|
|
255
|
+
- GitHub organization/team verification
|
|
256
|
+
- Reservation extensions
|
|
257
|
+
- Usage monitoring and quotas
|
|
258
|
+
|
|
259
|
+
## Current Working Architecture
|
|
260
|
+
|
|
261
|
+
**Infrastructure (us-east-2):**
|
|
262
|
+
|
|
263
|
+
- **Current**: 2x p4d.24xlarge instances (8 A100 GPUs each = 16 total GPUs)
|
|
264
|
+
- **Previous testing**: 2x g4dn.12xlarge instances (4 T4 GPUs each = 8 total GPUs)
|
|
265
|
+
- **Future**: 2x p5.48xlarge instances (8 H100 GPUs each = 16 total GPUs) when capacity available
|
|
266
|
+
- EKS cluster with GPU-optimized node groups
|
|
267
|
+
- NVIDIA device plugin for GPU resource exposure
|
|
268
|
+
- Single AZ deployment with cluster placement groups
|
|
269
|
+
|
|
270
|
+
**Reservation System:**
|
|
271
|
+
|
|
272
|
+
- SQS queue for async reservation requests
|
|
273
|
+
- Lambda functions for pod creation and expiry management
|
|
274
|
+
- DynamoDB for reservation and server state tracking
|
|
275
|
+
- Kubernetes pods with GPU resource allocation (1/2/4 GPUs)
|
|
276
|
+
- NodePort services for SSH access to pods
|
|
277
|
+
|
|
278
|
+
**Authentication & Access:**
|
|
279
|
+
|
|
280
|
+
- GitHub username configuration for SSH key fetching
|
|
281
|
+
- Public key injection into pods via init containers
|
|
282
|
+
- Copy-pasteable SSH commands with NodePort access
|
|
283
|
+
|
|
284
|
+
**CLI Tool:**
|
|
285
|
+
|
|
286
|
+
- Python CLI with config at `~/.config/gpu-dev/config.json`
|
|
287
|
+
- Commands: `reserve`, `list`, `config`
|
|
288
|
+
- Real-time polling until reservation is ready
|