EvoScientist 0.0.1.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. EvoScientist/EvoScientist.py +157 -0
  2. EvoScientist/__init__.py +24 -0
  3. EvoScientist/__main__.py +4 -0
  4. EvoScientist/backends.py +392 -0
  5. EvoScientist/cli.py +1553 -0
  6. EvoScientist/middleware.py +35 -0
  7. EvoScientist/prompts.py +277 -0
  8. EvoScientist/skills/accelerate/SKILL.md +332 -0
  9. EvoScientist/skills/accelerate/references/custom-plugins.md +453 -0
  10. EvoScientist/skills/accelerate/references/megatron-integration.md +489 -0
  11. EvoScientist/skills/accelerate/references/performance.md +525 -0
  12. EvoScientist/skills/bitsandbytes/SKILL.md +411 -0
  13. EvoScientist/skills/bitsandbytes/references/memory-optimization.md +521 -0
  14. EvoScientist/skills/bitsandbytes/references/qlora-training.md +521 -0
  15. EvoScientist/skills/bitsandbytes/references/quantization-formats.md +447 -0
  16. EvoScientist/skills/find-skills/SKILL.md +133 -0
  17. EvoScientist/skills/find-skills/scripts/install_skill.py +211 -0
  18. EvoScientist/skills/flash-attention/SKILL.md +367 -0
  19. EvoScientist/skills/flash-attention/references/benchmarks.md +215 -0
  20. EvoScientist/skills/flash-attention/references/transformers-integration.md +293 -0
  21. EvoScientist/skills/llama-cpp/SKILL.md +258 -0
  22. EvoScientist/skills/llama-cpp/references/optimization.md +89 -0
  23. EvoScientist/skills/llama-cpp/references/quantization.md +213 -0
  24. EvoScientist/skills/llama-cpp/references/server.md +125 -0
  25. EvoScientist/skills/lm-evaluation-harness/SKILL.md +490 -0
  26. EvoScientist/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  27. EvoScientist/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  28. EvoScientist/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  29. EvoScientist/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  30. EvoScientist/skills/ml-paper-writing/SKILL.md +937 -0
  31. EvoScientist/skills/ml-paper-writing/references/checklists.md +361 -0
  32. EvoScientist/skills/ml-paper-writing/references/citation-workflow.md +562 -0
  33. EvoScientist/skills/ml-paper-writing/references/reviewer-guidelines.md +367 -0
  34. EvoScientist/skills/ml-paper-writing/references/sources.md +159 -0
  35. EvoScientist/skills/ml-paper-writing/references/writing-guide.md +476 -0
  36. EvoScientist/skills/ml-paper-writing/templates/README.md +251 -0
  37. EvoScientist/skills/ml-paper-writing/templates/aaai2026/README.md +534 -0
  38. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex +144 -0
  39. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex +952 -0
  40. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bib +111 -0
  41. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bst +1493 -0
  42. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.sty +315 -0
  43. EvoScientist/skills/ml-paper-writing/templates/acl/README.md +50 -0
  44. EvoScientist/skills/ml-paper-writing/templates/acl/acl.sty +312 -0
  45. EvoScientist/skills/ml-paper-writing/templates/acl/acl_latex.tex +377 -0
  46. EvoScientist/skills/ml-paper-writing/templates/acl/acl_lualatex.tex +101 -0
  47. EvoScientist/skills/ml-paper-writing/templates/acl/acl_natbib.bst +1940 -0
  48. EvoScientist/skills/ml-paper-writing/templates/acl/anthology.bib.txt +26 -0
  49. EvoScientist/skills/ml-paper-writing/templates/acl/custom.bib +70 -0
  50. EvoScientist/skills/ml-paper-writing/templates/acl/formatting.md +326 -0
  51. EvoScientist/skills/ml-paper-writing/templates/colm2025/README.md +3 -0
  52. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bib +11 -0
  53. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bst +1440 -0
  54. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.pdf +0 -0
  55. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.sty +218 -0
  56. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.tex +305 -0
  57. EvoScientist/skills/ml-paper-writing/templates/colm2025/fancyhdr.sty +485 -0
  58. EvoScientist/skills/ml-paper-writing/templates/colm2025/math_commands.tex +508 -0
  59. EvoScientist/skills/ml-paper-writing/templates/colm2025/natbib.sty +1246 -0
  60. EvoScientist/skills/ml-paper-writing/templates/iclr2026/fancyhdr.sty +485 -0
  61. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib +24 -0
  62. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst +1440 -0
  63. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf +0 -0
  64. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty +246 -0
  65. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex +414 -0
  66. EvoScientist/skills/ml-paper-writing/templates/iclr2026/math_commands.tex +508 -0
  67. EvoScientist/skills/ml-paper-writing/templates/iclr2026/natbib.sty +1246 -0
  68. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithm.sty +79 -0
  69. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithmic.sty +201 -0
  70. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.bib +75 -0
  71. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.pdf +0 -0
  72. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.tex +662 -0
  73. EvoScientist/skills/ml-paper-writing/templates/icml2026/fancyhdr.sty +864 -0
  74. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.bst +1443 -0
  75. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.sty +767 -0
  76. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml_numpapers.pdf +0 -0
  77. EvoScientist/skills/ml-paper-writing/templates/neurips2025/Makefile +36 -0
  78. EvoScientist/skills/ml-paper-writing/templates/neurips2025/extra_pkgs.tex +53 -0
  79. EvoScientist/skills/ml-paper-writing/templates/neurips2025/main.tex +38 -0
  80. EvoScientist/skills/ml-paper-writing/templates/neurips2025/neurips.sty +382 -0
  81. EvoScientist/skills/peft/SKILL.md +431 -0
  82. EvoScientist/skills/peft/references/advanced-usage.md +514 -0
  83. EvoScientist/skills/peft/references/troubleshooting.md +480 -0
  84. EvoScientist/skills/ray-data/SKILL.md +326 -0
  85. EvoScientist/skills/ray-data/references/integration.md +82 -0
  86. EvoScientist/skills/ray-data/references/transformations.md +83 -0
  87. EvoScientist/skills/skill-creator/LICENSE.txt +202 -0
  88. EvoScientist/skills/skill-creator/SKILL.md +356 -0
  89. EvoScientist/skills/skill-creator/references/output-patterns.md +82 -0
  90. EvoScientist/skills/skill-creator/references/workflows.md +28 -0
  91. EvoScientist/skills/skill-creator/scripts/init_skill.py +303 -0
  92. EvoScientist/skills/skill-creator/scripts/package_skill.py +110 -0
  93. EvoScientist/skills/skill-creator/scripts/quick_validate.py +95 -0
  94. EvoScientist/stream/__init__.py +53 -0
  95. EvoScientist/stream/emitter.py +94 -0
  96. EvoScientist/stream/formatter.py +168 -0
  97. EvoScientist/stream/tracker.py +115 -0
  98. EvoScientist/stream/utils.py +255 -0
  99. EvoScientist/subagent.yaml +147 -0
  100. EvoScientist/tools.py +135 -0
  101. EvoScientist/utils.py +207 -0
  102. evoscientist-0.0.1.dev1.dist-info/METADATA +222 -0
  103. evoscientist-0.0.1.dev1.dist-info/RECORD +107 -0
  104. evoscientist-0.0.1.dev1.dist-info/WHEEL +5 -0
  105. evoscientist-0.0.1.dev1.dist-info/entry_points.txt +2 -0
  106. evoscientist-0.0.1.dev1.dist-info/licenses/LICENSE +21 -0
  107. evoscientist-0.0.1.dev1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,519 @@
1
+ # Distributed Evaluation
2
+
3
+ Guide to running evaluation across multiple GPUs using data parallelism and tensor/pipeline parallelism.
4
+
5
+ ## Overview
6
+
7
+ Distributed evaluation speeds up benchmarking by:
8
+ - **Data Parallelism**: Split evaluation samples across GPUs (each GPU has full model copy)
9
+ - **Tensor Parallelism**: Split model weights across GPUs (for large models)
10
+ - **Pipeline Parallelism**: Split model layers across GPUs (for very large models)
11
+
12
+ **When to use**:
13
+ - Data Parallel: Model fits on single GPU, want faster evaluation
14
+ - Tensor/Pipeline Parallel: Model too large for single GPU
15
+
16
+ ## HuggingFace Models (`hf`)
17
+
18
+ ### Data Parallelism (Recommended)
19
+
20
+ Each GPU loads a full copy of the model and processes a subset of evaluation data.
21
+
22
+ **Single Node (8 GPUs)**:
23
+ ```bash
24
+ accelerate launch --multi_gpu --num_processes 8 \
25
+ -m lm_eval --model hf \
26
+ --model_args pretrained=meta-llama/Llama-2-7b-hf,dtype=bfloat16 \
27
+ --tasks mmlu,gsm8k,hellaswag \
28
+ --batch_size 16
29
+ ```
30
+
31
+ **Speedup**: Near-linear (8 GPUs = ~8× faster)
32
+
33
+ **Memory**: Each GPU needs full model (7B model ≈ 14GB × 8 = 112GB total)
34
+
35
+ ### Tensor Parallelism (Model Sharding)
36
+
37
+ Split model weights across GPUs for models too large for single GPU.
38
+
39
+ **Without accelerate launcher**:
40
+ ```bash
41
+ lm_eval --model hf \
42
+ --model_args \
43
+ pretrained=meta-llama/Llama-2-70b-hf,\
44
+ parallelize=True,\
45
+ dtype=bfloat16 \
46
+ --tasks mmlu,gsm8k \
47
+ --batch_size 8
48
+ ```
49
+
50
+ **With 8 GPUs**: 70B model (140GB) / 8 = 17.5GB per GPU ✅
51
+
52
+ **Advanced sharding**:
53
+ ```bash
54
+ lm_eval --model hf \
55
+ --model_args \
56
+ pretrained=meta-llama/Llama-2-70b-hf,\
57
+ parallelize=True,\
58
+ device_map_option=auto,\
59
+ max_memory_per_gpu=40GB,\
60
+ max_cpu_memory=100GB,\
61
+ dtype=bfloat16 \
62
+ --tasks mmlu
63
+ ```
64
+
65
+ **Options**:
66
+ - `device_map_option`: `"auto"` (default), `"balanced"`, `"balanced_low_0"`
67
+ - `max_memory_per_gpu`: Max memory per GPU (e.g., `"40GB"`)
68
+ - `max_cpu_memory`: Max CPU memory for offloading
69
+ - `offload_folder`: Disk offloading directory
70
+
71
+ ### Combined Data + Tensor Parallelism
72
+
73
+ Use both for very large models.
74
+
75
+ **Example: 70B model on 16 GPUs (2 copies, 8 GPUs each)**:
76
+ ```bash
77
+ accelerate launch --multi_gpu --num_processes 2 \
78
+ -m lm_eval --model hf \
79
+ --model_args \
80
+ pretrained=meta-llama/Llama-2-70b-hf,\
81
+ parallelize=True,\
82
+ dtype=bfloat16 \
83
+ --tasks mmlu \
84
+ --batch_size 8
85
+ ```
86
+
87
+ **Result**: 2× speedup from data parallelism, 70B model fits via tensor parallelism
88
+
89
+ ### Configuration with `accelerate config`
90
+
91
+ Create `~/.cache/huggingface/accelerate/default_config.yaml`:
92
+ ```yaml
93
+ compute_environment: LOCAL_MACHINE
94
+ distributed_type: MULTI_GPU
95
+ num_machines: 1
96
+ num_processes: 8
97
+ gpu_ids: all
98
+ mixed_precision: bf16
99
+ ```
100
+
101
+ **Then run**:
102
+ ```bash
103
+ accelerate launch -m lm_eval --model hf \
104
+ --model_args pretrained=meta-llama/Llama-2-7b-hf \
105
+ --tasks mmlu
106
+ ```
107
+
108
+ ## vLLM Models (`vllm`)
109
+
110
+ vLLM provides highly optimized distributed inference.
111
+
112
+ ### Tensor Parallelism
113
+
114
+ **Single Node (4 GPUs)**:
115
+ ```bash
116
+ lm_eval --model vllm \
117
+ --model_args \
118
+ pretrained=meta-llama/Llama-2-70b-hf,\
119
+ tensor_parallel_size=4,\
120
+ dtype=auto,\
121
+ gpu_memory_utilization=0.9 \
122
+ --tasks mmlu,gsm8k \
123
+ --batch_size auto
124
+ ```
125
+
126
+ **Memory**: 70B model split across 4 GPUs = ~35GB per GPU
127
+
128
+ ### Data Parallelism
129
+
130
+ **Multiple model replicas**:
131
+ ```bash
132
+ lm_eval --model vllm \
133
+ --model_args \
134
+ pretrained=meta-llama/Llama-2-7b-hf,\
135
+ data_parallel_size=4,\
136
+ dtype=auto,\
137
+ gpu_memory_utilization=0.8 \
138
+ --tasks hellaswag,arc_challenge \
139
+ --batch_size auto
140
+ ```
141
+
142
+ **Result**: 4 model replicas = 4× throughput
143
+
144
+ ### Combined Tensor + Data Parallelism
145
+
146
+ **Example: 8 GPUs = 4 TP × 2 DP**:
147
+ ```bash
148
+ lm_eval --model vllm \
149
+ --model_args \
150
+ pretrained=meta-llama/Llama-2-70b-hf,\
151
+ tensor_parallel_size=4,\
152
+ data_parallel_size=2,\
153
+ dtype=auto,\
154
+ gpu_memory_utilization=0.85 \
155
+ --tasks mmlu \
156
+ --batch_size auto
157
+ ```
158
+
159
+ **Result**: 70B model fits (TP=4), 2× speedup (DP=2)
160
+
161
+ ### Multi-Node vLLM
162
+
163
+ vLLM supports multi-node inference only through a Ray cluster — start Ray on the nodes first:
164
+
165
+ ```bash
166
+ # Start Ray cluster
167
+ ray start --head --port=6379
168
+
169
+ # Run evaluation
170
+ lm_eval --model vllm \
171
+ --model_args \
172
+ pretrained=meta-llama/Llama-2-70b-hf,\
173
+ tensor_parallel_size=8,\
174
+ dtype=auto \
175
+ --tasks mmlu
176
+ ```
177
+
178
+ ## NVIDIA NeMo Models (`nemo_lm`)
179
+
180
+ ### Data Replication
181
+
182
+ **8 replicas on 8 GPUs**:
183
+ ```bash
184
+ torchrun --nproc-per-node=8 --no-python \
185
+ lm_eval --model nemo_lm \
186
+ --model_args \
187
+ path=/path/to/model.nemo,\
188
+ devices=8 \
189
+ --tasks hellaswag,arc_challenge \
190
+ --batch_size 32
191
+ ```
192
+
193
+ **Speedup**: Near-linear (8× faster)
194
+
195
+ ### Tensor Parallelism
196
+
197
+ **4-way tensor parallelism**:
198
+ ```bash
199
+ torchrun --nproc-per-node=4 --no-python \
200
+ lm_eval --model nemo_lm \
201
+ --model_args \
202
+ path=/path/to/70b_model.nemo,\
203
+ devices=4,\
204
+ tensor_model_parallel_size=4 \
205
+ --tasks mmlu,gsm8k \
206
+ --batch_size 16
207
+ ```
208
+
209
+ ### Pipeline Parallelism
210
+
211
+ **2 TP × 2 PP on 4 GPUs**:
212
+ ```bash
213
+ torchrun --nproc-per-node=4 --no-python \
214
+ lm_eval --model nemo_lm \
215
+ --model_args \
216
+ path=/path/to/model.nemo,\
217
+ devices=4,\
218
+ tensor_model_parallel_size=2,\
219
+ pipeline_model_parallel_size=2 \
220
+ --tasks mmlu \
221
+ --batch_size 8
222
+ ```
223
+
224
+ **Constraint**: `devices = TP × PP`
225
+
226
+ ### Multi-Node NeMo
227
+
228
+ Currently not supported by lm-evaluation-harness.
229
+
230
+ ## SGLang Models (`sglang`)
231
+
232
+ ### Tensor Parallelism
233
+
234
+ ```bash
235
+ lm_eval --model sglang \
236
+ --model_args \
237
+ pretrained=meta-llama/Llama-2-70b-hf,\
238
+ tp_size=4,\
239
+ dtype=auto \
240
+ --tasks gsm8k \
241
+ --batch_size auto
242
+ ```
243
+
244
+ ### Data Parallelism (Deprecated)
245
+
246
+ **Note**: SGLang is deprecating data parallelism. Use tensor parallelism instead.
247
+
248
+ ```bash
249
+ lm_eval --model sglang \
250
+ --model_args \
251
+ pretrained=meta-llama/Llama-2-7b-hf,\
252
+ dp_size=4,\
253
+ dtype=auto \
254
+ --tasks mmlu
255
+ ```
256
+
257
+ ## Performance Comparison
258
+
259
+ ### 70B Model Evaluation (MMLU, 5-shot)
260
+
261
+ | Method | GPUs | Time | Memory/GPU | Notes |
262
+ |--------|------|------|------------|-------|
263
+ | HF (no parallel) | 1 | 8 hours | 140GB (OOM) | Won't fit |
264
+ | HF (TP=8) | 8 | 2 hours | 17.5GB | Slower, fits |
265
+ | HF (DP=8) | 8 | 1 hour | 140GB (OOM) | Won't fit |
266
+ | vLLM (TP=4) | 4 | 30 min | 35GB | Fast! |
267
+ | vLLM (TP=4, DP=2) | 8 | 15 min | 35GB | Fastest |
268
+
269
+ ### 7B Model Evaluation (Multiple Tasks)
270
+
271
+ | Method | GPUs | Time | Speedup |
272
+ |--------|------|------|---------|
273
+ | HF (single) | 1 | 4 hours | 1× |
274
+ | HF (DP=4) | 4 | 1 hour | 4× |
275
+ | HF (DP=8) | 8 | 30 min | 8× |
276
+ | vLLM (DP=8) | 8 | 15 min | 16× |
277
+
278
+ **Takeaway**: vLLM is significantly faster than HuggingFace for inference.
279
+
280
+ ## Choosing Parallelism Strategy
281
+
282
+ ### Decision Tree
283
+
284
+ ```
285
+ Model fits on single GPU?
286
+ ├─ YES: Use data parallelism
287
+ │ ├─ HF: accelerate launch --multi_gpu --num_processes N
288
+ │ └─ vLLM: data_parallel_size=N (fastest)
289
+
290
+ └─ NO: Use tensor/pipeline parallelism
291
+ ├─ Model < 70B:
292
+ │ └─ vLLM: tensor_parallel_size=4
293
+ ├─ Model 70-175B:
294
+ │ ├─ vLLM: tensor_parallel_size=8
295
+ │ └─ Or HF: parallelize=True
296
+ └─ Model > 175B:
297
+ └─ Contact framework authors
298
+ ```
299
+
300
+ ### Memory Estimation
301
+
302
+ **Rule of thumb**:
303
+ ```
304
+ Memory (GB) = Parameters (B) × Precision (bytes) × 1.2 (overhead)
305
+ ```
306
+
307
+ **Examples**:
308
+ - 7B FP16: 7 × 2 × 1.2 = 16.8GB ✅ Fits A100 40GB
309
+ - 13B FP16: 13 × 2 × 1.2 = 31.2GB ✅ Fits A100 40GB
310
+ - 70B FP16: 70 × 2 × 1.2 = 168GB ❌ Need TP=4 or TP=8
311
+ - 70B BF16: 70 × 2 × 1.2 = 168GB (same as FP16)
312
+
313
+ **With tensor parallelism**:
314
+ ```
315
+ Memory per GPU = Total Memory / TP
316
+ ```
317
+
318
+ - 70B on 4 GPUs: 168GB / 4 = 42GB per GPU ✅
319
+ - 70B on 8 GPUs: 168GB / 8 = 21GB per GPU ✅
320
+
321
+ ## Multi-Node Evaluation
322
+
323
+ ### HuggingFace with SLURM
324
+
325
+ **Submit job**:
326
+ ```bash
327
+ #!/bin/bash
328
+ #SBATCH --nodes=4
329
+ #SBATCH --gpus-per-node=8
330
+ #SBATCH --ntasks-per-node=1
331
+
332
+ srun accelerate launch --multi_gpu \
333
+ --num_processes $((SLURM_NNODES * 8)) \
334
+ -m lm_eval --model hf \
335
+ --model_args pretrained=meta-llama/Llama-2-7b-hf \
336
+ --tasks mmlu,gsm8k,hellaswag \
337
+ --batch_size 16
338
+ ```
339
+
340
+ **Submit**:
341
+ ```bash
342
+ sbatch eval_job.sh
343
+ ```
344
+
345
+ ### Manual Multi-Node Setup
346
+
347
+ **On each node, run**:
348
+ ```bash
349
+ accelerate launch \
350
+ --multi_gpu \
351
+ --num_machines 4 \
352
+ --num_processes 32 \
353
+ --main_process_ip $MASTER_IP \
354
+ --main_process_port 29500 \
355
+ --machine_rank $NODE_RANK \
356
+ -m lm_eval --model hf \
357
+ --model_args pretrained=meta-llama/Llama-2-7b-hf \
358
+ --tasks mmlu
359
+ ```
360
+
361
+ **Environment variables**:
362
+ - `MASTER_IP`: IP of rank 0 node
363
+ - `NODE_RANK`: 0, 1, 2, 3 for each node
364
+
365
+ ## Best Practices
366
+
367
+ ### 1. Start Small
368
+
369
+ Test on small sample first:
370
+ ```bash
371
+ lm_eval --model hf \
372
+ --model_args pretrained=meta-llama/Llama-2-70b-hf,parallelize=True \
373
+ --tasks mmlu \
374
+ --limit 100 # Just 100 samples
375
+ ```
376
+
377
+ ### 2. Monitor GPU Usage
378
+
379
+ ```bash
380
+ # Terminal 1: Run evaluation
381
+ lm_eval --model hf ...
382
+
383
+ # Terminal 2: Monitor
384
+ watch -n 1 nvidia-smi
385
+ ```
386
+
387
+ Look for:
388
+ - GPU utilization > 90%
389
+ - Memory usage stable
390
+ - All GPUs active
391
+
392
+ ### 3. Optimize Batch Size
393
+
394
+ ```bash
395
+ # Auto batch size (recommended)
396
+ --batch_size auto
397
+
398
+ # Or tune manually
399
+ --batch_size 16 # Start here
400
+ --batch_size 32 # Increase if memory allows
401
+ ```
402
+
403
+ ### 4. Use Mixed Precision
404
+
405
+ ```bash
406
+ --model_args dtype=bfloat16 # Faster, less memory
407
+ ```
408
+
409
+ ### 5. Check Communication
410
+
411
+ For data parallelism, check network bandwidth:
412
+ ```bash
413
+ # Should see InfiniBand or high-speed network
414
+ nvidia-smi topo -m
415
+ ```
416
+
417
+ ## Troubleshooting
418
+
419
+ ### "CUDA out of memory"
420
+
421
+ **Solutions**:
422
+ 1. Increase tensor parallelism:
423
+ ```bash
424
+ --model_args tensor_parallel_size=8 # Was 4
425
+ ```
426
+
427
+ 2. Reduce batch size:
428
+ ```bash
429
+ --batch_size 4 # Was 16
430
+ ```
431
+
432
+ 3. Lower precision:
433
+ ```bash
434
+ --model_args load_in_8bit=True  # 8-bit quantization (bitsandbytes)
435
+ ```
436
+
437
+ ### "NCCL error" or Hanging
438
+
439
+ **Check**:
440
+ 1. All GPUs visible: `nvidia-smi`
441
+ 2. NCCL installed: `python -c "import torch; print(torch.cuda.nccl.version())"`
442
+ 3. Network connectivity between nodes
443
+
444
+ **Fix**:
445
+ ```bash
446
+ export NCCL_DEBUG=INFO # Enable debug logging
447
+ export NCCL_IB_DISABLE=0 # Use InfiniBand if available
448
+ ```
449
+
450
+ ### Slow Evaluation
451
+
452
+ **Possible causes**:
453
+ 1. **Data loading bottleneck**: Preprocess dataset
454
+ 2. **Low GPU utilization**: Increase batch size
455
+ 3. **Communication overhead**: Reduce parallelism degree
456
+
457
+ **Profile**:
458
+ ```bash
459
+ lm_eval --model hf \
460
+ --model_args pretrained=meta-llama/Llama-2-7b-hf \
461
+ --tasks mmlu \
462
+ --limit 100 \
463
+ --log_samples # Check timing
464
+ ```
465
+
466
+ ### GPUs Imbalanced
467
+
468
+ **Symptom**: GPU 0 at 100%, others at 50%
469
+
470
+ **Solution**: Use `device_map_option=balanced`:
471
+ ```bash
472
+ --model_args parallelize=True,device_map_option=balanced
473
+ ```
474
+
475
+ ## Example Configurations
476
+
477
+ ### Small Model (7B) - Fast Evaluation
478
+
479
+ ```bash
480
+ # 8 A100s, data parallel
481
+ accelerate launch --multi_gpu --num_processes 8 \
482
+ -m lm_eval --model hf \
483
+ --model_args \
484
+ pretrained=meta-llama/Llama-2-7b-hf,\
485
+ dtype=bfloat16 \
486
+ --tasks mmlu,gsm8k,hellaswag,arc_challenge \
487
+ --num_fewshot 5 \
488
+ --batch_size 32
489
+
490
+ # Time: ~30 minutes
491
+ ```
492
+
493
+ ### Large Model (70B) - vLLM
494
+
495
+ ```bash
496
+ # 8 H100s, tensor parallel
497
+ lm_eval --model vllm \
498
+ --model_args \
499
+ pretrained=meta-llama/Llama-2-70b-hf,\
500
+ tensor_parallel_size=8,\
501
+ dtype=auto,\
502
+ gpu_memory_utilization=0.9 \
503
+ --tasks mmlu,gsm8k,humaneval \
504
+ --num_fewshot 5 \
505
+ --batch_size auto
506
+
507
+ # Time: ~1 hour
508
+ ```
509
+
510
+ ### Very Large Model (175B+)
511
+
512
+ **Requires specialized setup - contact framework maintainers**
513
+
514
+ ## References
515
+
516
+ - HuggingFace Accelerate: https://huggingface.co/docs/accelerate/
517
+ - vLLM docs: https://docs.vllm.ai/
518
+ - NeMo docs: https://docs.nvidia.com/nemo-framework/
519
+ - lm-eval distributed guide: `docs/model_guide.md`