EvoScientist 0.1.0rc1-py3-none-any.whl → 0.1.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- EvoScientist/EvoScientist.py +1 -1
- EvoScientist/cli.py +450 -178
- EvoScientist/middleware.py +5 -1
- EvoScientist/skills/accelerate/SKILL.md +332 -0
- EvoScientist/skills/accelerate/references/custom-plugins.md +453 -0
- EvoScientist/skills/accelerate/references/megatron-integration.md +489 -0
- EvoScientist/skills/accelerate/references/performance.md +525 -0
- EvoScientist/skills/bitsandbytes/SKILL.md +411 -0
- EvoScientist/skills/bitsandbytes/references/memory-optimization.md +521 -0
- EvoScientist/skills/bitsandbytes/references/qlora-training.md +521 -0
- EvoScientist/skills/bitsandbytes/references/quantization-formats.md +447 -0
- EvoScientist/skills/clip/SKILL.md +253 -0
- EvoScientist/skills/clip/references/applications.md +207 -0
- EvoScientist/skills/find-skills/SKILL.md +133 -0
- EvoScientist/skills/find-skills/scripts/install_skill.py +211 -0
- EvoScientist/skills/flash-attention/SKILL.md +367 -0
- EvoScientist/skills/flash-attention/references/benchmarks.md +215 -0
- EvoScientist/skills/flash-attention/references/transformers-integration.md +293 -0
- EvoScientist/skills/langgraph-docs/SKILL.md +36 -0
- EvoScientist/skills/llama-cpp/SKILL.md +258 -0
- EvoScientist/skills/llama-cpp/references/optimization.md +89 -0
- EvoScientist/skills/llama-cpp/references/quantization.md +213 -0
- EvoScientist/skills/llama-cpp/references/server.md +125 -0
- EvoScientist/skills/lm-evaluation-harness/SKILL.md +490 -0
- EvoScientist/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- EvoScientist/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- EvoScientist/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- EvoScientist/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- EvoScientist/skills/ml-paper-writing/SKILL.md +937 -0
- EvoScientist/skills/ml-paper-writing/references/checklists.md +361 -0
- EvoScientist/skills/ml-paper-writing/references/citation-workflow.md +562 -0
- EvoScientist/skills/ml-paper-writing/references/reviewer-guidelines.md +367 -0
- EvoScientist/skills/ml-paper-writing/references/sources.md +159 -0
- EvoScientist/skills/ml-paper-writing/references/writing-guide.md +476 -0
- EvoScientist/skills/ml-paper-writing/templates/README.md +251 -0
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/README.md +534 -0
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex +144 -0
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex +952 -0
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bib +111 -0
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bst +1493 -0
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.sty +315 -0
- EvoScientist/skills/ml-paper-writing/templates/acl/README.md +50 -0
- EvoScientist/skills/ml-paper-writing/templates/acl/acl.sty +312 -0
- EvoScientist/skills/ml-paper-writing/templates/acl/acl_latex.tex +377 -0
- EvoScientist/skills/ml-paper-writing/templates/acl/acl_lualatex.tex +101 -0
- EvoScientist/skills/ml-paper-writing/templates/acl/acl_natbib.bst +1940 -0
- EvoScientist/skills/ml-paper-writing/templates/acl/anthology.bib.txt +26 -0
- EvoScientist/skills/ml-paper-writing/templates/acl/custom.bib +70 -0
- EvoScientist/skills/ml-paper-writing/templates/acl/formatting.md +326 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/README.md +3 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bib +11 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bst +1440 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.pdf +0 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.sty +218 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.tex +305 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/fancyhdr.sty +485 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/math_commands.tex +508 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/natbib.sty +1246 -0
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/fancyhdr.sty +485 -0
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib +24 -0
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst +1440 -0
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf +0 -0
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty +246 -0
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex +414 -0
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/math_commands.tex +508 -0
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/natbib.sty +1246 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithm.sty +79 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithmic.sty +201 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.bib +75 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.pdf +0 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.tex +662 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/fancyhdr.sty +864 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.bst +1443 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.sty +767 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/icml_numpapers.pdf +0 -0
- EvoScientist/skills/ml-paper-writing/templates/neurips2025/Makefile +36 -0
- EvoScientist/skills/ml-paper-writing/templates/neurips2025/extra_pkgs.tex +53 -0
- EvoScientist/skills/ml-paper-writing/templates/neurips2025/main.tex +38 -0
- EvoScientist/skills/ml-paper-writing/templates/neurips2025/neurips.sty +382 -0
- EvoScientist/skills/peft/SKILL.md +431 -0
- EvoScientist/skills/peft/references/advanced-usage.md +514 -0
- EvoScientist/skills/peft/references/troubleshooting.md +480 -0
- EvoScientist/skills/ray-data/SKILL.md +326 -0
- EvoScientist/skills/ray-data/references/integration.md +82 -0
- EvoScientist/skills/ray-data/references/transformations.md +83 -0
- EvoScientist/skills/skill-creator/LICENSE.txt +202 -0
- EvoScientist/skills/skill-creator/SKILL.md +356 -0
- EvoScientist/skills/skill-creator/references/output-patterns.md +82 -0
- EvoScientist/skills/skill-creator/references/workflows.md +28 -0
- EvoScientist/skills/skill-creator/scripts/init_skill.py +303 -0
- EvoScientist/skills/skill-creator/scripts/package_skill.py +110 -0
- EvoScientist/skills/skill-creator/scripts/quick_validate.py +95 -0
- EvoScientist/skills/tensorboard/SKILL.md +629 -0
- EvoScientist/skills/tensorboard/references/integrations.md +638 -0
- EvoScientist/skills/tensorboard/references/profiling.md +545 -0
- EvoScientist/skills/tensorboard/references/visualization.md +620 -0
- EvoScientist/skills/vllm/SKILL.md +364 -0
- EvoScientist/skills/vllm/references/optimization.md +226 -0
- EvoScientist/skills/vllm/references/quantization.md +284 -0
- EvoScientist/skills/vllm/references/server-deployment.md +255 -0
- EvoScientist/skills/vllm/references/troubleshooting.md +447 -0
- {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/METADATA +26 -3
- evoscientist-0.1.0rc2.dist-info/RECORD +119 -0
- evoscientist-0.1.0rc1.dist-info/RECORD +0 -21
- {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/WHEEL +0 -0
- {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/entry_points.txt +0 -0
- {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/licenses/LICENSE +0 -0
- {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,545 @@
# Performance Profiling Guide

Complete guide to profiling and optimizing ML models with TensorBoard.

## Table of Contents
- PyTorch Profiler
- TensorFlow Profiler
- GPU Utilization
- Memory Profiling
- Bottleneck Detection
- Optimization Strategies

## PyTorch Profiler

### Basic Profiling

```python
import torch
import torch.nn.functional as F
import torch.profiler as profiler

model = MyModel().cuda()
optimizer = torch.optim.Adam(model.parameters())

# Profile training loop
with profiler.profile(
    activities=[
        profiler.ProfilerActivity.CPU,
        profiler.ProfilerActivity.CUDA,
    ],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/profiler'),
    record_shapes=True,
    with_stack=True
) as prof:
    for step, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data.cuda())
        loss = F.cross_entropy(output, target.cuda())
        loss.backward()
        optimizer.step()

        # Mark step for profiler
        prof.step()

        if step >= 10:  # Profile first 10 steps
            break
```

### Profiler Configuration

```python
with profiler.profile(
    activities=[
        profiler.ProfilerActivity.CPU,   # Profile CPU ops
        profiler.ProfilerActivity.CUDA,  # Profile GPU ops
    ],
    schedule=profiler.schedule(
        wait=1,    # Idle steps before each cycle (profiler off)
        warmup=1,  # Warmup steps (profiling on, results discarded)
        active=3,  # Steps to actively profile
        repeat=2   # Repeat the cycle 2 times
    ),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/profiler'),
    record_shapes=True,   # Record tensor shapes
    profile_memory=True,  # Track memory allocation
    with_stack=True,      # Record source code stack traces
    with_flops=True       # Estimate FLOPs
) as prof:
    for step, batch in enumerate(train_loader):
        train_step(batch)
        prof.step()
```

With this schedule, one cycle is wait + warmup + active = 5 steps, so the loop must call `prof.step()` at least 10 times for both cycles to produce traces.

### Profile Inference

```python
model.eval()

with profiler.profile(
    activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/inference_profiler')
) as prof:
    with torch.no_grad():
        for i in range(100):
            data = torch.randn(1, 3, 224, 224).cuda()
            output = model(data)
            prof.step()
```

### Analyze Profile Data

```python
# Print profiler summary
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Export Chrome trace (for chrome://tracing)
prof.export_chrome_trace("trace.json")

# View in TensorBoard
# tensorboard --logdir=runs/profiler
```
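
`key_averages()` can also group and re-sort the summary. A minimal sketch using standard `torch.profiler` options on the same `prof` object as above; note `group_by_input_shape` only works if `record_shapes=True` was passed to `profile()`:

```python
# Group identical ops by input shape and sort by self time
# (self time excludes time spent in child ops)
print(prof.key_averages(group_by_input_shape=True).table(
    sort_by="self_cuda_time_total",
    row_limit=10
))
```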

**TensorBoard Profile Tab shows:**
- Overview: GPU utilization, step time breakdown
- Operator view: Time spent in each operation
- Kernel view: GPU kernel execution
- Trace view: Timeline of operations
- Memory view: Memory allocation over time

## TensorFlow Profiler

### Profile with Callback

```python
import tensorflow as tf

# Create profiler callback
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='logs/profiler',
    profile_batch='10,20'  # Profile batches 10-20
)

# Train with profiling
model.fit(
    x_train, y_train,
    epochs=5,
    callbacks=[tensorboard_callback]
)

# Launch TensorBoard
# tensorboard --logdir=logs/profiler
```

### Programmatic Profiling

```python
import tensorflow as tf

# Start and stop the profiler around a specific window of steps.
# Only one profiler session can be active at a time, so bracket
# exactly the window you want rather than nesting start() calls.
for epoch in range(5):
    for step, (x, y) in enumerate(train_dataset):
        # Profile steps 10-20 of epoch 2
        if epoch == 2 and step == 10:
            tf.profiler.experimental.start('logs/profiler')

        with tf.GradientTape() as tape:
            predictions = model(x, training=True)
            loss = loss_fn(y, predictions)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        if epoch == 2 and step == 20:
            tf.profiler.experimental.stop()
```

### Profile Custom Training Loop

```python
# Profile with context manager
with tf.profiler.experimental.Profile('logs/profiler'):
    for epoch in range(3):
        for step, (x, y) in enumerate(train_dataset):
            train_step(x, y)
```
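
Adding step markers makes the resulting trace much easier to read. A minimal sketch using `tf.profiler.experimental.Trace` (a standard TF API; `train_step` and `train_dataset` are assumed from the example above):

```python
with tf.profiler.experimental.Profile('logs/profiler'):
    for step, (x, y) in enumerate(train_dataset):
        # Mark each step so the trace viewer can group ops per training step
        with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
            train_step(x, y)
```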

## GPU Utilization

### Monitor GPU Usage

```python
import torch
import torch.profiler as profiler

with profiler.profile(
    activities=[profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/gpu_profile'),
    with_stack=True
) as prof:
    for step, (batch, target) in enumerate(train_loader):
        # Your training step
        output = model(batch.cuda())
        loss = criterion(output, target.cuda())
        loss.backward()
        optimizer.step()

        prof.step()

# View in TensorBoard > Profile > Overview
# Shows: GPU utilization %, kernel efficiency, memory bandwidth
```
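
For a quick sanity check without a full profile, utilization and memory counters can be polled directly. A minimal sketch; `torch.cuda.utilization()` is a standard PyTorch call but requires the `pynvml` package to be installed:

```python
import torch

# GPU busy percentage over the last sample period (needs pynvml)
print(f"GPU util: {torch.cuda.utilization()}%")

# Memory currently held by tensors vs. the peak so far, in MiB
print(f"allocated: {torch.cuda.memory_allocated() / 2**20:.0f} MiB, "
      f"peak: {torch.cuda.max_memory_allocated() / 2**20:.0f} MiB")
```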

### Optimize GPU Utilization

```python
# ✅ Good: Keep GPU busy
def train_step(batch):
    # Overlap data transfer with computation
    # (non_blocking=True needs pin_memory=True in the DataLoader)
    data = batch.cuda(non_blocking=True)  # Async transfer

    # Mixed precision for faster computation
    with torch.cuda.amp.autocast():
        output = model(data)
        loss = criterion(output, target)

    return loss

# ❌ Bad: GPU idle during data transfer
def train_step_slow(batch):
    data = batch.cuda()  # Blocking transfer
    output = model(data)
    loss = criterion(output, target)
    return loss
```

### Reduce CPU-GPU Synchronization

```python
# ✅ Good: Minimize synchronization
for epoch in range(100):
    total_loss = torch.zeros(1, device='cuda')
    for batch in train_loader:
        loss = train_step(batch)

        # Accumulate losses on the GPU (no sync; .item() would sync)
        total_loss += loss.detach()

    # Synchronize once per epoch
    avg_loss = (total_loss / len(train_loader)).item()

# ❌ Bad: Frequent synchronization
for batch in train_loader:
    loss = train_step(batch)
    print(f"Loss: {loss.item()}")  # Syncs every batch!
```

## Memory Profiling

### Track Memory Allocation

```python
import torch
import torch.profiler as profiler

with profiler.profile(
    activities=[profiler.ProfilerActivity.CUDA],
    profile_memory=True,
    record_shapes=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/memory_profile')
) as prof:
    for step, batch in enumerate(train_loader):
        train_step(batch)
        prof.step()

# View in TensorBoard > Profile > Memory View
# Shows: Memory allocation over time, peak memory, allocation stack traces
```

### Find Memory Leaks

```python
import torch

# Record memory snapshots (private API, subject to change between releases)
torch.cuda.memory._record_memory_history(
    enabled=True,
    max_entries=100000
)

# Training
for batch in train_loader:
    train_step(batch)

# Save memory snapshot
snapshot = torch.cuda.memory._snapshot()  # in-memory dict, if you want to inspect it
torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")

# Analyze with:
# python -m torch.cuda._memory_viz trace_plot memory_snapshot.pickle -o memory_trace.html
```
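
A cheaper first check for leaks is to watch the allocator counters across epochs. A minimal sketch with standard `torch.cuda` calls (`train_step` and `train_loader` assumed from above); numbers that grow epoch over epoch usually mean tensors are being kept alive, e.g. by accumulating `loss` instead of `loss.item()`:

```python
import torch

for epoch in range(5):
    for batch in train_loader:
        train_step(batch)
    # Memory held by tensors now vs. the peak so far
    print(f"epoch {epoch}: "
          f"allocated={torch.cuda.memory_allocated() / 2**20:.0f} MiB, "
          f"peak={torch.cuda.max_memory_allocated() / 2**20:.0f} MiB")
```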

### Optimize Memory Usage

```python
# ✅ Good: Gradient accumulation for large effective batch sizes
accumulation_steps = 4

for i, batch in enumerate(train_loader):
    # Forward
    output = model(batch)
    loss = criterion(output, target) / accumulation_steps

    # Backward
    loss.backward()

    # Step optimizer every accumulation_steps
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

# ✅ Good: Release memory explicitly
del intermediate_tensor
torch.cuda.empty_cache()

# ✅ Good: Use gradient checkpointing
# (trades compute for memory: activations are recomputed in backward)
from torch.utils.checkpoint import checkpoint

def custom_forward(module, inputs):
    return checkpoint(module, inputs)
```
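
For an `nn.Sequential` model, PyTorch ships a convenience wrapper. A minimal sketch using the standard `torch.utils.checkpoint.checkpoint_sequential`; the model and segment count here are illustrative:

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint_sequential

model = nn.Sequential(*[nn.Linear(1024, 1024) for _ in range(16)]).cuda()
x = torch.randn(32, 1024, device='cuda', requires_grad=True)

# Split into 4 segments: only segment-boundary activations are stored,
# everything in between is recomputed during backward
out = checkpoint_sequential(model, 4, x, use_reentrant=False)
out.sum().backward()
```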

## Bottleneck Detection

### Identify Slow Operations

```python
with profiler.profile(
    activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/bottleneck_profile'),
    with_stack=True
) as prof:
    for step, batch in enumerate(train_loader):
        train_step(batch)
        prof.step()

# Print slowest operations
print(prof.key_averages().table(
    sort_by="cuda_time_total",
    row_limit=20
))

# Expected output:
# Name             | CPU time | CUDA time | Calls
# aten::conv2d     | 5.2 ms   | 45.3 ms   | 32
# aten::batch_norm | 1.1 ms   | 8.7 ms    | 32
# aten::relu       | 0.3 ms   | 2.1 ms    | 32
```
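
Because `with_stack=True` was set, the profiler can also attribute GPU time to source lines and export flamegraph input. A minimal sketch using the standard `export_stacks` API on the same `prof` object:

```python
# Dump aggregated call stacks weighted by self CUDA time
prof.export_stacks("profiler_stacks.txt", "self_cuda_time_total")

# Render with a flamegraph tool, e.g.:
#   flamegraph.pl --title "CUDA time" --countname us profiler_stacks.txt > flame.svg
```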

### Optimize Data Loading

```python
# ✅ Good: Efficient data loading
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    num_workers=4,           # Parallel data loading
    pin_memory=True,         # Faster GPU transfer
    prefetch_factor=2,       # Batches prefetched per worker
    persistent_workers=True  # Reuse workers across epochs
)

# Profile data loading
import time

start = time.time()
for batch in train_loader:
    pass
print(f"Data loading time: {time.time() - start:.2f}s")

# ❌ Bad: Single worker, no pinning
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    num_workers=0  # Slow!
)
```
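
To tell whether the input pipeline or the model is the bottleneck, time the two separately. A minimal sketch (`train_step` and `train_loader` assumed from earlier; `torch.cuda.synchronize()` flushes queued GPU work so the compute timing is honest):

```python
import time
import torch

data_time, compute_time = 0.0, 0.0
end = time.perf_counter()
for batch in train_loader:
    t_fetch = time.perf_counter()
    data_time += t_fetch - end     # time spent waiting on the loader

    train_step(batch)
    torch.cuda.synchronize()       # wait for queued GPU work
    end = time.perf_counter()
    compute_time += end - t_fetch  # model compute for this step

print(f"data: {data_time:.1f}s, compute: {compute_time:.1f}s")
```

If data time dominates, raise `num_workers` or simplify preprocessing; if compute dominates, the optimizations below matter more.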

### Profile Specific Operations

```python
# Context manager for specific code blocks
with profiler.record_function("data_preprocessing"):
    data = preprocess(batch)

with profiler.record_function("forward_pass"):
    output = model(data)

with profiler.record_function("loss_computation"):
    loss = criterion(output, target)

# View in TensorBoard > Profile > Trace View
```

## Optimization Strategies

### Mixed Precision Training

```python
import torch
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for batch in train_loader:
    optimizer.zero_grad()

    # Mixed precision forward pass
    with autocast():
        output = model(batch.cuda())
        loss = criterion(output, target.cuda())

    # Scaled backward pass
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

# Profile to verify the speedup
with profiler.profile(
    activities=[profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/mixed_precision')
) as prof:
    train_with_mixed_precision()
    prof.step()
```

Recent PyTorch releases expose the same functionality as `torch.amp.autocast('cuda')` and `torch.amp.GradScaler('cuda')`; the `torch.cuda.amp` spellings still work but are deprecated there.

### Kernel Fusion

```python
import math
import torch
import torch.nn.functional as F

# ✅ Good: Fused operation
# torch.nn.functional.gelu() runs as a single fused kernel
output = F.gelu(x)

# ❌ Bad: Separate operations
# Manual GELU (slower due to multiple kernel launches)
output = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x**3)))

# Use torch.jit to fuse custom operations
@torch.jit.script
def fused_gelu(x):
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x**3)))
```

### Reduce Host-Device Transfers

```python
# ✅ Good: Keep data on GPU
data = data.cuda()  # Transfer once
for epoch in range(100):
    output = model(data)  # No transfer
    loss = criterion(output, target)

# ❌ Bad: Frequent transfers
for epoch in range(100):
    output = model(data.cuda())  # Transfer every epoch!
    loss = criterion(output.cpu(), target.cpu())  # Transfer back!
```

### Batch Size Optimization

```python
# Find the optimal batch size with profiling
for batch_size in [16, 32, 64, 128, 256]:
    train_loader = DataLoader(dataset, batch_size=batch_size)

    with profiler.profile(
        activities=[profiler.ProfilerActivity.CUDA],
        profile_memory=True,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./runs/bs{batch_size}')
    ) as prof:
        for step, batch in enumerate(train_loader):
            train_step(batch)
            prof.step()

            if step >= 10:
                break

# Compare in TensorBoard:
# - GPU utilization
# - Memory usage
# - Throughput (samples/sec)
```
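
Throughput itself is simple to measure directly. A minimal sketch (`train_step` assumed from earlier); it synchronizes so queued GPU work is counted, and it assumes each batch is a single tensor:

```python
import time
import torch

def measure_throughput(loader, n_steps=50):
    torch.cuda.synchronize()
    start = time.perf_counter()
    samples = 0
    for step, batch in enumerate(loader):
        train_step(batch)
        samples += batch.shape[0]  # adjust for (data, target) tuples
        if step + 1 >= n_steps:
            break
    torch.cuda.synchronize()
    return samples / (time.perf_counter() - start)

print(f"{measure_throughput(train_loader):.1f} samples/sec")
```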

## Best Practices

### 1. Profile Representative Workloads

```python
# ✅ Good: Profile a realistic training scenario
with profiler.profile(...) as prof:
    for epoch in range(3):  # Profile multiple epochs
        for step, batch in enumerate(train_loader):
            train_step(batch)
            prof.step()

# ❌ Bad: Profile a single step
with profiler.profile(...) as prof:
    train_step(single_batch)
```

### 2. Profile Periodically

```python
# Profile every N epochs
if epoch % 10 == 0:
    with profiler.profile(
        activities=[profiler.ProfilerActivity.CUDA],
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./runs/epoch{epoch}')
    ) as prof:
        train_epoch()
```

### 3. Compare Before/After Optimizations

```python
# Baseline
with profiler.profile(...) as prof:
    baseline_train()
    prof.step()

# After optimization
with profiler.profile(...) as prof:
    optimized_train()
    prof.step()

# Compare the two runs side by side in TensorBoard
```

### 4. Profile Inference

```python
# Production inference profiling
model.eval()

with profiler.profile(
    activities=[profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/inference')
) as prof:
    with torch.no_grad():
        for i in range(1000):  # Realistic load
            data = get_production_request()
            output = model(data)
            prof.step()

# Analyze latency percentiles in TensorBoard
```
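
For hard latency numbers it is often easier to time each request directly and compute percentiles in Python. A minimal sketch reusing `model` and the hypothetical `get_production_request()` from above:

```python
import time
import torch

latencies = []
with torch.no_grad():
    for _ in range(1000):
        data = get_production_request()
        torch.cuda.synchronize()
        start = time.perf_counter()
        model(data)
        torch.cuda.synchronize()  # include all queued GPU work
        latencies.append((time.perf_counter() - start) * 1000)

latencies.sort()
p50, p95, p99 = (latencies[int(len(latencies) * q)] for q in (0.50, 0.95, 0.99))
print(f"p50={p50:.1f} ms, p95={p95:.1f} ms, p99={p99:.1f} ms")
```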

## Resources

- **PyTorch Profiler**: https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html
- **TensorFlow Profiler**: https://www.tensorflow.org/guide/profiler
- **NVIDIA Nsight**: https://developer.nvidia.com/nsight-systems
- **PyTorch Bottleneck**: https://pytorch.org/docs/stable/bottleneck.html