@yeongjaeyou/claude-code-config 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/gpu-parallel-pipeline/SKILL.md +99 -0
- package/.claude/skills/gpu-parallel-pipeline/references/architecture.md +194 -0
- package/.claude/skills/gpu-parallel-pipeline/references/single-gpu-patterns.md +225 -0
- package/.claude/skills/gpu-parallel-pipeline/references/troubleshooting.md +247 -0
- package/.claude/skills/gpu-parallel-pipeline/scripts/check_gpu_memory.py +80 -0
- package/package.json +1 -1
package/.claude/skills/gpu-parallel-pipeline/SKILL.md
@@ -0,0 +1,99 @@
---
name: gpu-parallel-pipeline
description: Design and implement PyTorch GPU parallel processing pipelines for maximum throughput. Use when scaling workloads across multiple GPUs (ProcessPool, CUDA_VISIBLE_DEVICES isolation), optimizing single GPU utilization (CUDA Streams, async inference, model batching), or building I/O + compute pipelines (ThreadPool for loading, batch inference). Triggers on "multi-GPU", "GPU parallel", "batch inference", "CUDA isolation", "GPU utilization", "ProcessPool GPU", "PyTorch multi-GPU".
---

# GPU Parallel Pipeline

## Overview

This skill provides patterns for maximizing GPU throughput in data processing pipelines.

**Three core patterns:**
1. **Multi-GPU Distribution** - ProcessPool with GPU isolation via CUDA_VISIBLE_DEVICES
2. **Single GPU Optimization** - CUDA Streams, async inference, model batching
3. **I/O + Compute Pipeline** - ThreadPool for I/O parallelization + batch inference

## Quick Reference

| Pattern | Use Case | Speedup |
|---------|----------|---------|
| Multi-GPU ProcessPool | Large dataset, multiple GPUs | ~N x (N = GPU count) |
| ThreadPool I/O + Batch | I/O bottleneck (image loading) | 2-5x |
| CUDA Streams | Multiple models on single GPU | 1.5-3x |

## Multi-GPU Architecture

```
Main Process (Coordinator)
|
+-- GPU 0: ProcessPool Worker (CUDA_VISIBLE_DEVICES=0)
|   +-- ThreadPool (I/O)
|   +-- Model batch inference
|
+-- GPU 1: ProcessPool Worker (CUDA_VISIBLE_DEVICES=1)
|   +-- ThreadPool (I/O)
|   +-- Model batch inference
|
+-- GPU N: ...
```

### Key Implementation Steps

1. **Worker initialization with GPU isolation**
```python
def _worker_init_with_gpu(gpu_id: int) -> None:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    # Initialize model here (once per worker)
    global _model
    _model = load_model()
```

2. **Spawn context (not fork)**
```python
ctx = mp.get_context("spawn")  # Required for CUDA
with ProcessPoolExecutor(max_workers=n_gpus, mp_context=ctx) as executor:
    ...
```

3. **Chunk distribution**
```python
chunk_size = (n_total + n_gpus - 1) // n_gpus
chunks = [records[i*chunk_size:(i+1)*chunk_size] for i in range(n_gpus)]
```

## I/O + Compute Pipeline

Separate I/O (disk read) from compute (GPU inference) using ThreadPool:

```python
def _load_images_parallel(paths: list[str], max_workers: int = 8) -> dict:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(cv2.imread, p): p for p in paths}
        return {futures[f]: f.result() for f in as_completed(futures)}

def process_batch_hybrid(batch: list[dict]) -> list[dict]:
    # 1. ThreadPool I/O
    images = _load_images_parallel([r["path"] for r in batch])
    # 2. GPU batch inference (index by path so outputs stay aligned with the batch order)
    results = model.predict_batch([images[r["path"]] for r in batch])
    return results
```
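
A minimal driver sketch for the hybrid pattern; `records` and `model` are assumed to exist already, as in the snippet above:

```python
batch_size = 128
all_results = []
for i in range(0, len(records), batch_size):
    all_results.extend(process_batch_hybrid(records[i:i + batch_size]))
```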

## Detailed References

- **[architecture.md](references/architecture.md)**: Multi-GPU ProcessPool design, worker lifecycle, error handling
- **[single-gpu-patterns.md](references/single-gpu-patterns.md)**: CUDA Streams, async inference, model parallelism
- **[troubleshooting.md](references/troubleshooting.md)**: spawn vs fork, OOM, CUDA_VISIBLE_DEVICES issues

## Memory Planning

Before implementation, check GPU memory:
```bash
python scripts/check_gpu_memory.py
```

**Rule of thumb:**
- Workers per GPU = GPU_Memory / Model_Memory
- Example: 24GB GPU, 5GB model = 4 workers/GPU max
- Leave 2-3GB headroom for CUDA overhead

package/.claude/skills/gpu-parallel-pipeline/references/architecture.md
@@ -0,0 +1,194 @@
# Multi-GPU Architecture

## Table of Contents
- [ProcessPool with GPU Isolation](#processpool-with-gpu-isolation)
- [Chunk Distribution Pattern](#chunk-distribution-pattern)
- [Complete Multi-GPU Orchestration](#complete-multi-gpu-orchestration)
- [Worker Lifecycle](#worker-lifecycle)
- [Error Handling Strategy](#error-handling-strategy)
- [Progress Tracking](#progress-tracking)
- [Performance Considerations](#performance-considerations)

## ProcessPool with GPU Isolation

### Why ProcessPool over ThreadPool for GPU?

Python's GIL doesn't affect GPU operations, but CUDA context state and `CUDA_VISIBLE_DEVICES` are per-process, so process isolation is the reliable way to give each worker exactly one GPU. Each process should own exactly one GPU.

### CUDA_VISIBLE_DEVICES Isolation

```python
import os
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, as_completed

# Process-local state
_model = None
_gpu_id = None

def _worker_init_with_gpu(gpu_id: int) -> None:
    """Initialize worker with GPU isolation.

    Must be called at the start of each worker process.
    CUDA_VISIBLE_DEVICES makes this GPU appear as device:0 to PyTorch/TF.
    """
    global _model, _gpu_id

    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    _gpu_id = gpu_id

    # Import ML framework AFTER setting CUDA_VISIBLE_DEVICES
    import torch
    _model = YourModel().cuda()  # Now on device:0 (the isolated GPU)
```

### Chunk Distribution Pattern

```python
def distribute_to_gpus(records: list, n_gpus: int) -> list[tuple]:
    """Distribute records evenly across GPUs.

    Returns list of (chunk, gpu_id, position) tuples.
    """
    if n_gpus < 1:
        raise ValueError(f"n_gpus must be >= 1, got {n_gpus}")

    n_total = len(records)
    chunk_size = (n_total + n_gpus - 1) // n_gpus  # ceiling division

    chunks = []
    for i in range(n_gpus):
        start = i * chunk_size
        end = min(start + chunk_size, n_total)
        if start < n_total:
            chunks.append((records[start:end], i, i))  # (data, gpu_id, tqdm_position)

    return chunks
```
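
For example, 10 records over 3 GPUs yield ceiling-sized chunks of 4, 4, and 2 (a quick sketch; the record values are illustrative):

```python
records = list(range(10))
for chunk, gpu_id, position in distribute_to_gpus(records, n_gpus=3):
    print(gpu_id, len(chunk))   # -> 0 4 / 1 4 / 2 2
```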

### Complete Multi-GPU Orchestration

```python
def run_multi_gpu(
    records: list[dict],
    n_gpus: int = 4,
    batch_size: int = 128,
) -> list[dict]:
    """Orchestrate multi-GPU parallel processing.

    Args:
        records: Data records to process
        n_gpus: Number of GPUs to use
        batch_size: Batch size per GPU

    Returns:
        Processed records with results
    """
    if not records:
        return []

    # Distribute data
    chunks = distribute_to_gpus(records, n_gpus)
    print(f"Distributing {len(records):,} items across {len(chunks)} GPUs")

    # CRITICAL: Use spawn context for CUDA
    ctx = mp.get_context("spawn")

    # Track GPU assignments for error recovery
    gpu_to_chunk = {gpu_id: chunk for chunk, gpu_id, _ in chunks}

    all_results = []
    failed_chunks = []

    with ProcessPoolExecutor(max_workers=len(chunks), mp_context=ctx) as executor:
        futures = {
            executor.submit(_process_gpu_chunk, chunk, gpu_id, batch_size, pos): gpu_id
            for chunk, gpu_id, pos in chunks
        }

        for future in as_completed(futures):
            gpu_id = futures[future]
            try:
                results = future.result()
                all_results.extend(results)
            except Exception as e:
                print(f"[ERROR] GPU {gpu_id} failed: {e}")
                failed_chunks.append((gpu_id, gpu_to_chunk[gpu_id]))

    # Handle failures gracefully (don't lose data)
    if failed_chunks:
        for gpu_id, chunk in failed_chunks:
            for record in chunk:
                record["_error"] = f"GPU {gpu_id} failed"
                all_results.append(record)

    return all_results
```
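
Because the pool uses the spawn start method, the caller must live under an `if __name__ == "__main__"` guard so workers can re-import the module cleanly. A minimal caller sketch (`load_records` is a hypothetical loader):

```python
import torch

def load_records() -> list[dict]:
    ...  # hypothetical: return [{"path": "..."}, ...]

if __name__ == "__main__":
    records = load_records()
    results = run_multi_gpu(records, n_gpus=torch.cuda.device_count(), batch_size=128)
    print(f"Processed {len(results):,} records")
```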

## Worker Lifecycle

```
spawn context creates new process
    |
    v
_worker_init_with_gpu(gpu_id)
    - Set CUDA_VISIBLE_DEVICES
    - Import ML framework
    - Load model to GPU
    |
    v
Process batches in loop
    |
    v
ProcessPool cleanup (model freed)
```

### Error Handling Strategy

1. **Per-GPU failure isolation**: One GPU failure shouldn't crash others
2. **Data preservation**: Failed chunks get marked, not dropped
3. **Graceful degradation**: Continue with remaining GPUs

```python
# Track failures
failed_chunks: list[tuple[int, list]] = []

try:
    results = future.result(timeout=300)  # 5-min timeout
except Exception as e:
    failed_chunks.append((gpu_id, original_chunk))

# After all futures complete
if failed_chunks:
    print(f"[WARN] {len(failed_chunks)} GPU(s) failed")
    # Add failed records with error markers
```

## Progress Tracking

Use tqdm with the position parameter for multi-bar display:

```python
from tqdm import tqdm

def _process_gpu_chunk(records, gpu_id, batch_size, position):
    _worker_init_with_gpu(gpu_id)

    batches = [records[i:i+batch_size] for i in range(0, len(records), batch_size)]
    results = []

    for batch in tqdm(batches, desc=f"GPU {gpu_id}", position=position, leave=False):
        batch_results = process_batch(batch)
        results.extend(batch_results)

    return results
```

## Performance Considerations

| Factor | Recommendation |
|--------|---------------|
| Batch size | Start with 64-128, tune based on GPU memory |
| Workers per GPU | Usually 1 for large models, 2-4 for small models |
| I/O workers | 4-8 ThreadPool workers per GPU worker |
| Chunk size | Balanced across GPUs (ceiling division) |

package/.claude/skills/gpu-parallel-pipeline/references/single-gpu-patterns.md
@@ -0,0 +1,225 @@
# Single GPU Optimization Patterns

## Table of Contents
- [CUDA Streams for Concurrent Operations](#cuda-streams-for-concurrent-operations)
- [Async Inference Pattern](#async-inference-pattern)
- [Model Batching for Multiple Small Models](#model-batching-for-multiple-small-models)
- [Dynamic Batching](#dynamic-batching)
- [Memory Optimization](#memory-optimization)
- [Throughput Measurement](#throughput-measurement)
- [Best Practices Summary](#best-practices-summary)

## CUDA Streams for Concurrent Operations

CUDA streams allow overlapping data transfer and computation:

```python
import torch

def process_with_streams(batches: list, model):
    """Process batches using CUDA streams for overlap."""
    streams = [torch.cuda.Stream() for _ in range(2)]
    results = []

    for i, batch in enumerate(batches):
        stream = streams[i % 2]

        with torch.cuda.stream(stream):
            # Transfer to GPU
            data = batch.cuda(non_blocking=True)
            # Compute
            output = model(data)
            results.append(output)

    # Synchronize all streams
    torch.cuda.synchronize()
    return results
```
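
A practical caveat: `non_blocking=True` host-to-device copies only overlap with compute when the source tensors sit in pinned (page-locked) host memory. A minimal sketch, assuming `batches` holds CPU tensors:

```python
# Pin host memory so the async copies above can actually overlap with compute.
pinned_batches = [b.pin_memory() for b in batches]   # or DataLoader(..., pin_memory=True)
outputs = process_with_streams(pinned_batches, model)
```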

## Async Inference Pattern

For pipelines with I/O and compute stages:

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor

import torch

class AsyncInferencePipeline:
    def __init__(self, model, io_workers: int = 4):
        self.model = model
        self.io_executor = ThreadPoolExecutor(max_workers=io_workers)
        self.batch_queue = asyncio.Queue(maxsize=2)  # Prefetch 2 batches

    async def load_batch(self, paths: list[str]):
        """Load batch in thread pool (non-blocking)."""
        loop = asyncio.get_running_loop()
        images = await loop.run_in_executor(
            self.io_executor,
            lambda: [load_image(p) for p in paths]
        )
        return torch.stack(images)

    async def producer(self, all_paths: list[str], batch_size: int):
        """Continuously load batches."""
        for i in range(0, len(all_paths), batch_size):
            batch_paths = all_paths[i:i+batch_size]
            batch = await self.load_batch(batch_paths)
            await self.batch_queue.put(batch)
        await self.batch_queue.put(None)  # Signal end

    async def consumer(self):
        """Process batches as they arrive."""
        results = []
        while True:
            batch = await self.batch_queue.get()
            if batch is None:
                break
            with torch.no_grad():
                output = self.model(batch.cuda())
            results.append(output.cpu())
        return results

    async def run(self, paths: list[str], batch_size: int = 32):
        producer_task = asyncio.create_task(self.producer(paths, batch_size))
        results = await self.consumer()
        await producer_task
        return results
```
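
A minimal usage sketch, assuming `model` is a CUDA module, `paths` is a list of image paths, and `load_image` is defined elsewhere as in the class above:

```python
pipeline = AsyncInferencePipeline(model, io_workers=4)
batch_outputs = asyncio.run(pipeline.run(paths, batch_size=32))
print(f"{len(batch_outputs)} batches processed")
```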

## Model Batching for Multiple Small Models

Run multiple small models on a single GPU:

```python
import torch

class MultiModelPipeline:
    """Run multiple models efficiently on a single GPU."""

    def __init__(self, models: list):
        self.models = [m.cuda() for m in models]
        self.streams = [torch.cuda.Stream() for _ in models]

    def forward_all(self, inputs: list[torch.Tensor]) -> list[torch.Tensor]:
        """Run all models concurrently using streams."""
        outputs = [None] * len(self.models)

        # Launch all models
        for i, (model, stream, x) in enumerate(zip(self.models, self.streams, inputs)):
            with torch.cuda.stream(stream):
                outputs[i] = model(x.cuda(non_blocking=True))

        # Wait for all
        torch.cuda.synchronize()
        return outputs
```
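
A usage sketch with three hypothetical small models and matching CPU input tensors:

```python
pipeline = MultiModelPipeline([detector, classifier, segmenter])   # hypothetical models
det_out, cls_out, seg_out = pipeline.forward_all([x_det, x_cls, x_seg])
```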

## Dynamic Batching

Maximize GPU utilization with variable batch sizes:

```python
import time

import torch

class DynamicBatcher:
    """Accumulate inputs until batch is full or timeout."""

    def __init__(self, model, max_batch: int = 64, timeout_ms: int = 10):
        self.model = model
        self.max_batch = max_batch
        self.timeout_ms = timeout_ms
        self.pending = []
        self.last_submit = time.time()

    def add(self, item):
        self.pending.append(item)

        should_process = (
            len(self.pending) >= self.max_batch or
            (time.time() - self.last_submit) * 1000 > self.timeout_ms
        )

        if should_process and self.pending:
            return self._process_batch()
        return None

    def _process_batch(self):
        batch = torch.stack(self.pending[:self.max_batch])
        self.pending = self.pending[self.max_batch:]
        self.last_submit = time.time()

        with torch.no_grad():
            return self.model(batch.cuda())
```
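
A usage sketch: feed items as they arrive and treat any non-`None` return value as the output of one completed batch (`incoming_items` is a placeholder stream of CPU tensors):

```python
batcher = DynamicBatcher(model, max_batch=64, timeout_ms=10)
outputs = []
for item in incoming_items:
    out = batcher.add(item)
    if out is not None:
        outputs.append(out)
if batcher.pending:                 # flush the tail at end of stream
    outputs.append(batcher._process_batch())
```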

## Memory Optimization

### Gradient Checkpointing (Training)

```python
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class EfficientModel(nn.Module):
    def forward(self, x):
        # Checkpoint intermediate layers to save memory
        x = checkpoint(self.layer1, x)
        x = checkpoint(self.layer2, x)
        x = self.head(x)
        return x
```

### Mixed Precision Inference

```python
with torch.cuda.amp.autocast():
    output = model(input)  # Uses FP16 automatically
```

### Memory-Efficient Attention (for transformers)

```python
# Use torch.nn.functional.scaled_dot_product_attention (PyTorch 2.0+)
# Automatically uses FlashAttention when available
from torch.nn.functional import scaled_dot_product_attention

attn_output = scaled_dot_product_attention(q, k, v, is_causal=True)
```

## Throughput Measurement

```python
import time
import torch

def benchmark_throughput(model, input_shape, n_iterations=100, warmup=10):
    """Measure model throughput in samples/second."""
    model.eval()
    dummy_input = torch.randn(*input_shape).cuda()

    # Warmup
    for _ in range(warmup):
        with torch.no_grad():
            _ = model(dummy_input)

    torch.cuda.synchronize()
    start = time.perf_counter()

    for _ in range(n_iterations):
        with torch.no_grad():
            _ = model(dummy_input)

    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    batch_size = input_shape[0]
    throughput = (n_iterations * batch_size) / elapsed
    print(f"Throughput: {throughput:.1f} samples/sec")
    return throughput
```
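
For example, with a toy CUDA module and a batch of 64 RGB 224x224 inputs (the model and shapes are illustrative):

```python
import torch.nn as nn

toy_model = nn.Sequential(
    nn.Conv2d(3, 16, 3), nn.ReLU(), nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 10)
).cuda()
benchmark_throughput(toy_model, input_shape=(64, 3, 224, 224))
```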

## Best Practices Summary

| Technique | When to Use | Memory Impact |
|-----------|-------------|---------------|
| CUDA Streams | Multiple independent ops | Minimal |
| Async I/O | I/O bottleneck | Minimal |
| Multi-model | Multiple small models | +1 model per stream |
| Dynamic batching | Variable input rate | Configurable |
| Mixed precision | Large models, Ampere+ GPU | -50% |
| Checkpointing | Training large models | -60% (slower) |

package/.claude/skills/gpu-parallel-pipeline/references/troubleshooting.md
@@ -0,0 +1,247 @@
# Troubleshooting Guide

## Table of Contents
- [spawn vs fork Context](#spawn-vs-fork-context)
- [CUDA_VISIBLE_DEVICES Issues](#cuda_visible_devices-issues)
- [GPU Memory OOM](#gpu-memory-oom)
- [Pickling Errors](#pickling-errors)
- [Process Hangs](#process-hangs)
- [Debugging Checklist](#debugging-checklist)
- [Quick Fixes](#quick-fixes)

## spawn vs fork Context

### Problem: Silent Failures with fork

When using the `fork` context with CUDA:
- Worker processes inherit the CUDA context from the parent
- Functions may fail to pickle correctly
- Workers might return None silently instead of crashing

### Symptom
```
# Processing completes in seconds instead of hours
# All results are None
# No error messages
```

### Solution: Always Use spawn

```python
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor

# WRONG
with ProcessPoolExecutor(max_workers=4) as executor:
    ...

# CORRECT
ctx = mp.get_context("spawn")
with ProcessPoolExecutor(max_workers=4, mp_context=ctx) as executor:
    ...
```

### Why spawn works

| Context | Behavior | CUDA Safe |
|---------|----------|-----------|
| fork | Copy parent process memory | No |
| spawn | Start fresh process | Yes |
| forkserver | Fork from server process | Partial |

## CUDA_VISIBLE_DEVICES Issues

### Problem: All Workers Use Same GPU

Workers share the parent's CUDA context if not isolated.

### Solution: Set Early in Worker

```python
def _worker_init(gpu_id: int):
    # MUST be first line - before any CUDA import
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    # NOW import PyTorch
    import torch

    # device:0 is now the isolated GPU
    model = Model().to("cuda:0")
```

### Verification

```python
def _worker_init(gpu_id: int):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    import torch

    # Should print only 1 device
    print(f"Worker {gpu_id}: {torch.cuda.device_count()} device(s)")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
```

### Common Mistake

```python
# WRONG: Setting after import
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)  # Too late!

# WRONG: Using device index directly
model.to(f"cuda:{gpu_id}")  # Sees all GPUs, doesn't isolate
```

## GPU Memory OOM

### Symptom
```
RuntimeError: CUDA out of memory. Tried to allocate X MiB
```

### Diagnosis

```python
def check_memory():
    import torch
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        total = props.total_memory / 1e9
        reserved = torch.cuda.memory_reserved(i) / 1e9
        allocated = torch.cuda.memory_allocated(i) / 1e9
        print(f"GPU {i}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved, {total:.1f}GB total")
```

### Solutions

1. **Reduce batch size**
```python
batch_size = 64  # Start small, increase until OOM
```

2. **Enable mixed precision**
```python
with torch.cuda.amp.autocast():
    output = model(input)
```

3. **Clear cache between batches**
```python
torch.cuda.empty_cache()  # Use sparingly, has overhead
```

4. **Reduce workers per GPU**
```python
# If model uses 8GB on 24GB GPU
workers_per_gpu = 24 // 8 - 1  # Leave headroom = 2 workers
```

### Memory Planning Formula

```
available_memory = total_gpu_memory - cuda_overhead (2-3GB)
model_memory = model_size * precision_multiplier
  - FP32: model_params * 4 bytes
  - FP16: model_params * 2 bytes
  - INT8: model_params * 1 byte

workers_per_gpu = floor(available_memory / model_memory)
```
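
The same arithmetic as a small sketch, assuming FP16 weights by default and ignoring activation memory (function and argument names are illustrative):

```python
import math

def plan_workers_per_gpu(total_gpu_memory_gb: float, model_params: int,
                         bytes_per_param: float = 2, cuda_overhead_gb: float = 2.5) -> int:
    """floor(available_memory / model_memory), as in the formula above."""
    available_gb = total_gpu_memory_gb - cuda_overhead_gb
    model_memory_gb = model_params * bytes_per_param / 1e9
    return max(0, math.floor(available_gb / model_memory_gb))

# Example: 24 GB GPU, 2.5B-parameter model in FP16 (~5 GB) -> 4 workers
print(plan_workers_per_gpu(24, 2_500_000_000))
```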

## Pickling Errors

### Symptom
```
_pickle.PicklingError: Can't pickle <local object>
```

### Common Causes

1. **Lambda functions**
```python
# WRONG
executor.submit(lambda x: process(x), data)

# CORRECT
def process_wrapper(data):
    return process(data)
executor.submit(process_wrapper, data)
```

2. **Nested functions**
```python
# WRONG
def outer():
    def inner(x):
        return x * 2
    executor.submit(inner, data)

# CORRECT: Define at module level
def inner(x):
    return x * 2
```

3. **CUDA tensors**
```python
# WRONG: Passing CUDA tensor to worker
executor.submit(process, tensor.cuda())

# CORRECT: Pass CPU tensor, move to GPU in worker
executor.submit(process, tensor.cpu())
```

## Process Hangs

### Symptom
- Workers never complete
- No progress bar updates
- CPU/GPU utilization drops to 0

### Diagnosis

```python
# Add timeout to futures
for future in as_completed(futures, timeout=300):
    try:
        result = future.result(timeout=60)
    except TimeoutError:
        print("Worker timed out")
```

### Common Causes

1. **Deadlock in worker**
   - Check for locks that never release
   - Ensure thread-safe data structures

2. **CUDA synchronization hang**
```python
# Add sync points for debugging
torch.cuda.synchronize()
print("Sync point reached")
```

3. **I/O blocking**
```python
# Set timeouts on I/O operations
img = cv2.imread(path)  # Can hang on network storage
```

## Debugging Checklist

1. [ ] Using spawn context?
2. [ ] CUDA_VISIBLE_DEVICES set before imports?
3. [ ] Functions defined at module level (not nested)?
4. [ ] No CUDA tensors passed between processes?
5. [ ] Sufficient GPU memory for batch size?
6. [ ] Timeouts set for futures?
7. [ ] Progress tracking (tqdm) enabled?

## Quick Fixes

| Issue | Quick Fix |
|-------|-----------|
| Silent None returns | Add spawn context |
| All workers on GPU 0 | Set CUDA_VISIBLE_DEVICES first |
| OOM | Reduce batch_size by 50% |
| Pickle error | Move function to module level |
| Process hangs | Add timeout, check I/O |

package/.claude/skills/gpu-parallel-pipeline/scripts/check_gpu_memory.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python
"""GPU memory check utility for parallel pipeline planning.

Reports available GPU memory and recommends workers per GPU based on model size.

Usage:
    python check_gpu_memory.py
    python check_gpu_memory.py --model-memory 5.0  # Specify model memory in GB
"""

from __future__ import annotations

import argparse
import sys


def check_gpu_memory(model_memory_gb: float | None = None) -> None:
    """Check GPU memory and recommend worker count.

    Args:
        model_memory_gb: Estimated model memory usage in GB (optional)
    """
    try:
        import torch
    except ImportError:
        print("PyTorch not installed. Install with: pip install torch")
        sys.exit(1)

    if not torch.cuda.is_available():
        print("CUDA not available")
        sys.exit(1)

    n_gpus = torch.cuda.device_count()
    print(f"Found {n_gpus} GPU(s)\n")
    print("=" * 60)

    total_available = 0
    cuda_overhead_gb = 2.5  # Reserved for CUDA context

    for i in range(n_gpus):
        props = torch.cuda.get_device_properties(i)
        total_gb = props.total_memory / 1e9
        available_gb = total_gb - cuda_overhead_gb

        print(f"GPU {i}: {props.name}")
        print(f"  Total memory: {total_gb:.1f} GB")
        print(f"  Available (after CUDA overhead): {available_gb:.1f} GB")

        if model_memory_gb:
            workers = int(available_gb / model_memory_gb)
            print(f"  Recommended workers (for {model_memory_gb}GB model): {workers}")

        total_available += available_gb
        print()

    print("=" * 60)
    print(f"Total available memory: {total_available:.1f} GB")

    if model_memory_gb:
        total_workers = int(total_available / model_memory_gb)
        print(f"Total recommended workers: {total_workers}")
        print(f"\nSuggested command:")
        print(f"  --n-gpus {n_gpus} --batch-size 64")


def main():
    parser = argparse.ArgumentParser(description="Check GPU memory for parallel pipeline")
    parser.add_argument(
        "--model-memory",
        type=float,
        default=None,
        help="Estimated model memory usage in GB",
    )
    args = parser.parse_args()

    check_gpu_memory(args.model_memory)


if __name__ == "__main__":
    main()