EvoScientist 0.1.0rc1__py3-none-any.whl → 0.1.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. EvoScientist/EvoScientist.py +1 -1
  2. EvoScientist/cli.py +450 -178
  3. EvoScientist/middleware.py +5 -1
  4. EvoScientist/skills/accelerate/SKILL.md +332 -0
  5. EvoScientist/skills/accelerate/references/custom-plugins.md +453 -0
  6. EvoScientist/skills/accelerate/references/megatron-integration.md +489 -0
  7. EvoScientist/skills/accelerate/references/performance.md +525 -0
  8. EvoScientist/skills/bitsandbytes/SKILL.md +411 -0
  9. EvoScientist/skills/bitsandbytes/references/memory-optimization.md +521 -0
  10. EvoScientist/skills/bitsandbytes/references/qlora-training.md +521 -0
  11. EvoScientist/skills/bitsandbytes/references/quantization-formats.md +447 -0
  12. EvoScientist/skills/clip/SKILL.md +253 -0
  13. EvoScientist/skills/clip/references/applications.md +207 -0
  14. EvoScientist/skills/find-skills/SKILL.md +133 -0
  15. EvoScientist/skills/find-skills/scripts/install_skill.py +211 -0
  16. EvoScientist/skills/flash-attention/SKILL.md +367 -0
  17. EvoScientist/skills/flash-attention/references/benchmarks.md +215 -0
  18. EvoScientist/skills/flash-attention/references/transformers-integration.md +293 -0
  19. EvoScientist/skills/langgraph-docs/SKILL.md +36 -0
  20. EvoScientist/skills/llama-cpp/SKILL.md +258 -0
  21. EvoScientist/skills/llama-cpp/references/optimization.md +89 -0
  22. EvoScientist/skills/llama-cpp/references/quantization.md +213 -0
  23. EvoScientist/skills/llama-cpp/references/server.md +125 -0
  24. EvoScientist/skills/lm-evaluation-harness/SKILL.md +490 -0
  25. EvoScientist/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  26. EvoScientist/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  27. EvoScientist/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  28. EvoScientist/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  29. EvoScientist/skills/ml-paper-writing/SKILL.md +937 -0
  30. EvoScientist/skills/ml-paper-writing/references/checklists.md +361 -0
  31. EvoScientist/skills/ml-paper-writing/references/citation-workflow.md +562 -0
  32. EvoScientist/skills/ml-paper-writing/references/reviewer-guidelines.md +367 -0
  33. EvoScientist/skills/ml-paper-writing/references/sources.md +159 -0
  34. EvoScientist/skills/ml-paper-writing/references/writing-guide.md +476 -0
  35. EvoScientist/skills/ml-paper-writing/templates/README.md +251 -0
  36. EvoScientist/skills/ml-paper-writing/templates/aaai2026/README.md +534 -0
  37. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex +144 -0
  38. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex +952 -0
  39. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bib +111 -0
  40. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bst +1493 -0
  41. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.sty +315 -0
  42. EvoScientist/skills/ml-paper-writing/templates/acl/README.md +50 -0
  43. EvoScientist/skills/ml-paper-writing/templates/acl/acl.sty +312 -0
  44. EvoScientist/skills/ml-paper-writing/templates/acl/acl_latex.tex +377 -0
  45. EvoScientist/skills/ml-paper-writing/templates/acl/acl_lualatex.tex +101 -0
  46. EvoScientist/skills/ml-paper-writing/templates/acl/acl_natbib.bst +1940 -0
  47. EvoScientist/skills/ml-paper-writing/templates/acl/anthology.bib.txt +26 -0
  48. EvoScientist/skills/ml-paper-writing/templates/acl/custom.bib +70 -0
  49. EvoScientist/skills/ml-paper-writing/templates/acl/formatting.md +326 -0
  50. EvoScientist/skills/ml-paper-writing/templates/colm2025/README.md +3 -0
  51. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bib +11 -0
  52. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bst +1440 -0
  53. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.pdf +0 -0
  54. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.sty +218 -0
  55. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.tex +305 -0
  56. EvoScientist/skills/ml-paper-writing/templates/colm2025/fancyhdr.sty +485 -0
  57. EvoScientist/skills/ml-paper-writing/templates/colm2025/math_commands.tex +508 -0
  58. EvoScientist/skills/ml-paper-writing/templates/colm2025/natbib.sty +1246 -0
  59. EvoScientist/skills/ml-paper-writing/templates/iclr2026/fancyhdr.sty +485 -0
  60. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib +24 -0
  61. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst +1440 -0
  62. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf +0 -0
  63. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty +246 -0
  64. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex +414 -0
  65. EvoScientist/skills/ml-paper-writing/templates/iclr2026/math_commands.tex +508 -0
  66. EvoScientist/skills/ml-paper-writing/templates/iclr2026/natbib.sty +1246 -0
  67. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithm.sty +79 -0
  68. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithmic.sty +201 -0
  69. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.bib +75 -0
  70. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.pdf +0 -0
  71. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.tex +662 -0
  72. EvoScientist/skills/ml-paper-writing/templates/icml2026/fancyhdr.sty +864 -0
  73. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.bst +1443 -0
  74. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.sty +767 -0
  75. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml_numpapers.pdf +0 -0
  76. EvoScientist/skills/ml-paper-writing/templates/neurips2025/Makefile +36 -0
  77. EvoScientist/skills/ml-paper-writing/templates/neurips2025/extra_pkgs.tex +53 -0
  78. EvoScientist/skills/ml-paper-writing/templates/neurips2025/main.tex +38 -0
  79. EvoScientist/skills/ml-paper-writing/templates/neurips2025/neurips.sty +382 -0
  80. EvoScientist/skills/peft/SKILL.md +431 -0
  81. EvoScientist/skills/peft/references/advanced-usage.md +514 -0
  82. EvoScientist/skills/peft/references/troubleshooting.md +480 -0
  83. EvoScientist/skills/ray-data/SKILL.md +326 -0
  84. EvoScientist/skills/ray-data/references/integration.md +82 -0
  85. EvoScientist/skills/ray-data/references/transformations.md +83 -0
  86. EvoScientist/skills/skill-creator/LICENSE.txt +202 -0
  87. EvoScientist/skills/skill-creator/SKILL.md +356 -0
  88. EvoScientist/skills/skill-creator/references/output-patterns.md +82 -0
  89. EvoScientist/skills/skill-creator/references/workflows.md +28 -0
  90. EvoScientist/skills/skill-creator/scripts/init_skill.py +303 -0
  91. EvoScientist/skills/skill-creator/scripts/package_skill.py +110 -0
  92. EvoScientist/skills/skill-creator/scripts/quick_validate.py +95 -0
  93. EvoScientist/skills/tensorboard/SKILL.md +629 -0
  94. EvoScientist/skills/tensorboard/references/integrations.md +638 -0
  95. EvoScientist/skills/tensorboard/references/profiling.md +545 -0
  96. EvoScientist/skills/tensorboard/references/visualization.md +620 -0
  97. EvoScientist/skills/vllm/SKILL.md +364 -0
  98. EvoScientist/skills/vllm/references/optimization.md +226 -0
  99. EvoScientist/skills/vllm/references/quantization.md +284 -0
  100. EvoScientist/skills/vllm/references/server-deployment.md +255 -0
  101. EvoScientist/skills/vllm/references/troubleshooting.md +447 -0
  102. {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/METADATA +26 -3
  103. evoscientist-0.1.0rc2.dist-info/RECORD +119 -0
  104. evoscientist-0.1.0rc1.dist-info/RECORD +0 -21
  105. {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/WHEEL +0 -0
  106. {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/entry_points.txt +0 -0
  107. {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/licenses/LICENSE +0 -0
  108. {evoscientist-0.1.0rc1.dist-info → evoscientist-0.1.0rc2.dist-info}/top_level.txt +0 -0
EvoScientist/skills/tensorboard/references/profiling.md
@@ -0,0 +1,545 @@
# Performance Profiling Guide

Complete guide to profiling and optimizing ML models with TensorBoard.

## Table of Contents
- PyTorch Profiler
- TensorFlow Profiler
- GPU Utilization
- Memory Profiling
- Bottleneck Detection
- Optimization Strategies

## PyTorch Profiler

### Basic Profiling

```python
import torch
import torch.nn.functional as F
import torch.profiler as profiler

model = MyModel().cuda()
optimizer = torch.optim.Adam(model.parameters())

# Profile the training loop
with profiler.profile(
    activities=[
        profiler.ProfilerActivity.CPU,
        profiler.ProfilerActivity.CUDA,
    ],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/profiler'),
    record_shapes=True,
    with_stack=True
) as prof:
    for step, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data.cuda())
        loss = F.cross_entropy(output, target.cuda())
        loss.backward()
        optimizer.step()

        # Mark the step boundary for the profiler
        prof.step()

        if step >= 10:  # Profile only the first steps
            break
```

### Profiler Configuration

```python
with profiler.profile(
    activities=[
        profiler.ProfilerActivity.CPU,   # Profile CPU ops
        profiler.ProfilerActivity.CUDA,  # Profile GPU ops
    ],
    schedule=profiler.schedule(
        wait=1,    # Idle steps at the start of each cycle (no profiling)
        warmup=1,  # Warmup steps (profiling on, results discarded)
        active=3,  # Steps that are actually recorded
        repeat=2   # Repeat the wait/warmup/active cycle twice
    ),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/profiler'),
    record_shapes=True,    # Record tensor shapes
    profile_memory=True,   # Track memory allocation
    with_stack=True,       # Record source code stack traces
    with_flops=True        # Estimate FLOPs
) as prof:
    for step, batch in enumerate(train_loader):
        train_step(batch)
        prof.step()
```

### Profile Inference

```python
model.eval()

with profiler.profile(
    activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/inference_profiler')
) as prof:
    with torch.no_grad():
        for i in range(100):
            data = torch.randn(1, 3, 224, 224).cuda()
            output = model(data)
            prof.step()
```

### Analyze Profile Data

```python
# Print a profiler summary sorted by total GPU time
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Export a Chrome trace (open in chrome://tracing)
prof.export_chrome_trace("trace.json")

# View in TensorBoard
# tensorboard --logdir=runs/profiler
```

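If you prefer to stay in Python, the profiler result object supports a couple of further views; a minimal sketch (these calls assume `record_shapes=True` and `with_stack=True` were set when profiling):

```python
# Break the summary down by operator input shapes
print(prof.key_averages(group_by_input_shape=True).table(
    sort_by="cuda_time_total", row_limit=10
))

# Export stack traces weighted by self GPU time (e.g. for flame graphs)
prof.export_stacks("profiler_stacks.txt", "self_cuda_time_total")
```
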
**TensorBoard Profile Tab shows:**
- Overview: GPU utilization, step time breakdown
- Operator view: time spent in each operation
- Kernel view: GPU kernel execution
- Trace view: timeline of operations
- Memory view: memory allocation over time

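Note that for PyTorch traces the Profile tab is provided by the PyTorch profiler TensorBoard plugin, which ships separately from TensorBoard itself; a quick setup sketch (package name as published on PyPI):

```python
# pip install torch-tb-profiler
# tensorboard --logdir=./runs/profiler
# The Profile tab (shown as PYTORCH_PROFILER in recent versions) appears
# once trace files exist under the log directory.
```
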
## TensorFlow Profiler

### Profile with Callback

```python
import tensorflow as tf

# Create a TensorBoard callback that also captures a profile
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='logs/profiler',
    profile_batch='10,20'  # Profile batches 10 through 20
)

# Train with profiling
model.fit(
    x_train, y_train,
    epochs=5,
    callbacks=[tensorboard_callback]
)

# Launch TensorBoard
# tensorboard --logdir=logs/profiler
```

### Programmatic Profiling

Only one profiling session can be active at a time, so either profile the whole run or capture a specific window of steps:

```python
import tensorflow as tf

# Option 1: profile the entire training run
tf.profiler.experimental.start('logs/profiler')

for epoch in range(5):
    for step, (x, y) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            predictions = model(x, training=True)
            loss = loss_fn(y, predictions)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

tf.profiler.experimental.stop()

# Option 2: profile only steps 10-20 of a chosen epoch
for epoch in range(5):
    for step, (x, y) in enumerate(train_dataset):
        if epoch == 2 and step == 10:
            tf.profiler.experimental.start('logs/profiler_step10')

        train_step(x, y)

        if epoch == 2 and step == 20:
            tf.profiler.experimental.stop()
```

### Profile Custom Training Loop

```python
# Profile with a context manager
with tf.profiler.experimental.Profile('logs/profiler'):
    for epoch in range(3):
        for step, (x, y) in enumerate(train_dataset):
            train_step(x, y)
```

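In a custom loop the profiler has no built-in notion of a training step; TensorFlow provides `tf.profiler.experimental.Trace` to annotate step boundaries so the Profile tab can compute a per-step breakdown. A minimal sketch:

```python
with tf.profiler.experimental.Profile('logs/profiler'):
    for step, (x, y) in enumerate(train_dataset):
        # Mark each iteration as one "train" step in the trace
        with tf.profiler.experimental.Trace('train', step_num=step, _r=1):
            train_step(x, y)
```
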
## GPU Utilization

### Monitor GPU Usage

```python
import torch
import torch.profiler as profiler

with profiler.profile(
    activities=[profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/gpu_profile'),
    with_stack=True
) as prof:
    for step, (data, target) in enumerate(train_loader):
        # Your training step
        optimizer.zero_grad()
        output = model(data.cuda())
        loss = criterion(output, target.cuda())
        loss.backward()
        optimizer.step()

        prof.step()

# View in TensorBoard > Profile > Overview
# Shows: GPU utilization %, kernel efficiency, memory bandwidth
```

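For a coarse live reading outside the profiler, utilization can also be polled directly from Python; a minimal sketch (assumes the `nvidia-ml-py` bindings that `torch.cuda.utilization` relies on are installed):

```python
import torch

# Percent of time the GPU executed kernels over the last sample period
print(f"GPU utilization: {torch.cuda.utilization()}%")
# Current tensor allocations on the device
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
```
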
### Optimize GPU Utilization

```python
# ✅ Good: Keep the GPU busy
def train_step(batch):
    data, target = batch

    # Overlap data transfer with computation
    # (async transfer; requires pin_memory=True in the DataLoader)
    data = data.cuda(non_blocking=True)
    target = target.cuda(non_blocking=True)

    # Mixed precision for faster computation
    with torch.cuda.amp.autocast():
        output = model(data)
        loss = criterion(output, target)

    return loss

# ❌ Bad: GPU idle during blocking data transfer
def train_step_slow(batch):
    data, target = batch
    data = data.cuda()  # Blocking transfer
    output = model(data)
    loss = criterion(output, target.cuda())
    return loss
```

### Reduce CPU-GPU Synchronization

```python
# ✅ Good: Minimize synchronization
for epoch in range(100):
    total_loss = 0.0
    for batch in train_loader:
        loss = train_step(batch)

        # Accumulate losses (each .item() still syncs once per batch;
        # see the GPU-side variant below for a single sync per epoch)
        total_loss += loss.item()

    # Average once per epoch
    avg_loss = total_loss / len(train_loader)

# ❌ Bad: Frequent synchronization
for batch in train_loader:
    loss = train_step(batch)
    print(f"Loss: {loss.item()}")  # Forces a sync and a print every batch!
```

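To truly defer synchronization to once per epoch, keep the running loss on the GPU and call `.item()` only at the end; a minimal sketch:

```python
import torch

for epoch in range(100):
    # Scalar accumulator that lives on the GPU
    running_loss = torch.zeros((), device='cuda')
    for batch in train_loader:
        loss = train_step(batch)
        running_loss += loss.detach()  # stays on device, no sync

    # Single host-device sync per epoch
    avg_loss = (running_loss / len(train_loader)).item()
```
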
## Memory Profiling

### Track Memory Allocation

```python
import torch
import torch.profiler as profiler

with profiler.profile(
    activities=[profiler.ProfilerActivity.CUDA],
    profile_memory=True,
    record_shapes=True,
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/memory_profile')
) as prof:
    for step, batch in enumerate(train_loader):
        train_step(batch)
        prof.step()

# View in TensorBoard > Profile > Memory View
# Shows: memory allocation over time, peak memory, allocation stack traces
```

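For a quick peak-memory number without a full profile, the CUDA memory-stats counters are often enough; a minimal sketch:

```python
import torch

torch.cuda.reset_peak_memory_stats()
train_step(batch)
torch.cuda.synchronize()

# High-water mark of tensor allocations since the reset
print(f"Peak allocated: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
```
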
### Find Memory Leaks

```python
import torch

# Record an allocation history (note: private, experimental API)
torch.cuda.memory._record_memory_history(max_entries=100000)

# Training
for batch in train_loader:
    train_step(batch)

# Write a memory snapshot to disk
torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")

# Analyze with:
# python -m torch.cuda._memory_viz trace_plot memory_snapshot.pickle -o memory_trace.html
# (or drag the pickle into https://pytorch.org/memory_viz)
```

### Optimize Memory Usage

```python
# ✅ Good: Gradient accumulation to simulate large batches
accumulation_steps = 4

for i, (data, target) in enumerate(train_loader):
    # Forward
    output = model(data.cuda())
    loss = criterion(output, target.cuda()) / accumulation_steps

    # Backward (gradients accumulate across micro-batches)
    loss.backward()

    # Step the optimizer every accumulation_steps micro-batches
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

# ✅ Good: Release memory explicitly
del intermediate_tensor
torch.cuda.empty_cache()

# ✅ Good: Use gradient checkpointing (recompute activations in backward)
from torch.utils.checkpoint import checkpoint

def custom_forward(module, x):
    return checkpoint(module, x, use_reentrant=False)
```

## Bottleneck Detection

### Identify Slow Operations

```python
with profiler.profile(
    activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/bottleneck_profile'),
    with_stack=True
) as prof:
    for step, batch in enumerate(train_loader):
        train_step(batch)
        prof.step()

# Print the slowest operations
print(prof.key_averages().table(
    sort_by="cuda_time_total",
    row_limit=20
))

# Example output:
# Name             | CPU time | CUDA time | Calls
# aten::conv2d     | 5.2 ms   | 45.3 ms   | 32
# aten::batch_norm | 1.1 ms   | 8.7 ms    | 32
# aten::relu       | 0.3 ms   | 2.1 ms    | 32
```

### Optimize Data Loading

```python
# ✅ Good: Efficient data loading
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    num_workers=4,            # Parallel data loading
    pin_memory=True,          # Faster (and async-capable) GPU transfer
    prefetch_factor=2,        # Batches prefetched per worker
    persistent_workers=True   # Reuse workers across epochs
)

# Time a full pass over the loader
import time

start = time.time()
for batch in train_loader:
    pass
print(f"Data loading time: {time.time() - start:.2f}s")

# ❌ Bad: Single-process loading, no pinned memory
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    num_workers=0  # Loads in the main process (slow!)
)
```

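To see whether the loader or the model is the bottleneck, it helps to time the wait for each batch separately from the compute; a minimal sketch:

```python
import time
import torch

t0 = time.time()
for step, batch in enumerate(train_loader):
    data_time = time.time() - t0   # time spent waiting on the DataLoader
    train_step(batch)
    torch.cuda.synchronize()       # make async GPU work visible to the timer
    step_time = time.time() - t0
    print(f"step {step}: data {data_time:.3f}s / total {step_time:.3f}s")
    t0 = time.time()
```
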
### Profile Specific Operations

```python
# Label specific code blocks so they show up by name in the trace
with profiler.record_function("data_preprocessing"):
    data = preprocess(batch)

with profiler.record_function("forward_pass"):
    output = model(data)

with profiler.record_function("loss_computation"):
    loss = criterion(output, target)

# View in TensorBoard > Profile > Trace View
```

## Optimization Strategies

### Mixed Precision Training

```python
import torch
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for data, target in train_loader:
    optimizer.zero_grad()

    # Mixed precision forward pass
    with autocast():
        output = model(data.cuda())
        loss = criterion(output, target.cuda())

    # Scaled backward pass
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

# Profile to verify the speedup
with profiler.profile(
    activities=[profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/mixed_precision')
) as prof:
    train_with_mixed_precision()
    prof.step()
```

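On GPUs with native bfloat16 support, an alternative sketch is to autocast to `bfloat16`, which keeps the fp32 dynamic range and so does not need a `GradScaler` (assumption: Ampere-or-newer hardware):

```python
import torch

for data, target in train_loader:
    optimizer.zero_grad()

    with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        output = model(data.cuda())
        loss = criterion(output, target.cuda())

    loss.backward()   # no loss scaling needed with bf16
    optimizer.step()
```
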
### Kernel Fusion

```python
import math
import torch

# ✅ Good: Fused operation
# torch.nn.functional.gelu() runs as a single kernel
output = F.gelu(x)

# ❌ Bad: Manual GELU (slower: each op launches its own kernel)
output = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x**3)))

# Use torch.jit to fuse custom elementwise chains
@torch.jit.script
def fused_gelu(x):
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x**3)))
```

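In PyTorch 2.x, `torch.compile` is the more general route to fusion: it captures the model graph and fuses elementwise chains automatically, with no hand-written scripted functions. A minimal sketch:

```python
import torch

# Compile once; subsequent calls run the optimized, fused kernels
compiled_model = torch.compile(model)
output = compiled_model(x)
```
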
### Reduce Host-Device Transfers

```python
# ✅ Good: Keep data on the GPU
data = data.cuda()      # Transfer once
target = target.cuda()
for epoch in range(100):
    output = model(data)              # No transfer
    loss = criterion(output, target)

# ❌ Bad: Frequent transfers
for epoch in range(100):
    output = model(data.cuda())                   # Transfer every epoch!
    loss = criterion(output.cpu(), target.cpu())  # And transfer back!
```

### Batch Size Optimization

```python
# Find the optimal batch size by profiling each candidate
for batch_size in [16, 32, 64, 128, 256]:
    train_loader = DataLoader(dataset, batch_size=batch_size)

    with profiler.profile(
        activities=[profiler.ProfilerActivity.CUDA],
        profile_memory=True,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./runs/bs{batch_size}')
    ) as prof:
        for step, batch in enumerate(train_loader):
            train_step(batch)
            prof.step()

            if step >= 10:
                break

# Compare in TensorBoard:
# - GPU utilization
# - Memory usage
# - Throughput (samples/sec)
```

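Throughput itself is easy to compute alongside each profiled run; a minimal timing sketch (placed inside the batch-size loop above, so `batch_size` and `train_loader` refer to the current candidate):

```python
import time
import torch

n_steps = 10
torch.cuda.synchronize()
start = time.time()
for step, batch in enumerate(train_loader):
    train_step(batch)
    if step + 1 >= n_steps:
        break
torch.cuda.synchronize()  # wait for queued GPU work before stopping the clock

elapsed = time.time() - start
print(f"bs={batch_size}: {n_steps * batch_size / elapsed:.1f} samples/sec")
```
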
## Best Practices

### 1. Profile Representative Workloads

```python
# ✅ Good: Profile a realistic training scenario
with profiler.profile(...) as prof:
    for epoch in range(3):  # Profile multiple epochs
        for step, batch in enumerate(train_loader):
            train_step(batch)
            prof.step()

# ❌ Bad: Profile a single step
with profiler.profile(...) as prof:
    train_step(single_batch)
```

### 2. Profile Periodically

```python
# Profile every N epochs
if epoch % 10 == 0:
    with profiler.profile(
        activities=[profiler.ProfilerActivity.CUDA],
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./runs/epoch{epoch}')
    ) as prof:
        train_epoch()
```

### 3. Compare Before/After Optimizations

```python
# Baseline
with profiler.profile(...) as prof:
    baseline_train()
    prof.step()

# After optimization
with profiler.profile(...) as prof:
    optimized_train()
    prof.step()

# Compare the two runs side by side in TensorBoard
```

### 4. Profile Inference

```python
# Production inference profiling
model.eval()

with profiler.profile(
    activities=[profiler.ProfilerActivity.CUDA],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./runs/inference')
) as prof:
    with torch.no_grad():
        for i in range(1000):  # Realistic load
            data = get_production_request()
            output = model(data)
            prof.step()

# Analyze per-request latency in the TensorBoard trace view
```

## Resources

- **PyTorch Profiler**: https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html
- **TensorFlow Profiler**: https://www.tensorflow.org/guide/profiler
- **NVIDIA Nsight Systems**: https://developer.nvidia.com/nsight-systems
- **PyTorch Bottleneck**: https://pytorch.org/docs/stable/bottleneck.html