EvoScientist 0.0.1.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. EvoScientist/EvoScientist.py +157 -0
  2. EvoScientist/__init__.py +24 -0
  3. EvoScientist/__main__.py +4 -0
  4. EvoScientist/backends.py +392 -0
  5. EvoScientist/cli.py +1553 -0
  6. EvoScientist/middleware.py +35 -0
  7. EvoScientist/prompts.py +277 -0
  8. EvoScientist/skills/accelerate/SKILL.md +332 -0
  9. EvoScientist/skills/accelerate/references/custom-plugins.md +453 -0
  10. EvoScientist/skills/accelerate/references/megatron-integration.md +489 -0
  11. EvoScientist/skills/accelerate/references/performance.md +525 -0
  12. EvoScientist/skills/bitsandbytes/SKILL.md +411 -0
  13. EvoScientist/skills/bitsandbytes/references/memory-optimization.md +521 -0
  14. EvoScientist/skills/bitsandbytes/references/qlora-training.md +521 -0
  15. EvoScientist/skills/bitsandbytes/references/quantization-formats.md +447 -0
  16. EvoScientist/skills/find-skills/SKILL.md +133 -0
  17. EvoScientist/skills/find-skills/scripts/install_skill.py +211 -0
  18. EvoScientist/skills/flash-attention/SKILL.md +367 -0
  19. EvoScientist/skills/flash-attention/references/benchmarks.md +215 -0
  20. EvoScientist/skills/flash-attention/references/transformers-integration.md +293 -0
  21. EvoScientist/skills/llama-cpp/SKILL.md +258 -0
  22. EvoScientist/skills/llama-cpp/references/optimization.md +89 -0
  23. EvoScientist/skills/llama-cpp/references/quantization.md +213 -0
  24. EvoScientist/skills/llama-cpp/references/server.md +125 -0
  25. EvoScientist/skills/lm-evaluation-harness/SKILL.md +490 -0
  26. EvoScientist/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  27. EvoScientist/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  28. EvoScientist/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  29. EvoScientist/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  30. EvoScientist/skills/ml-paper-writing/SKILL.md +937 -0
  31. EvoScientist/skills/ml-paper-writing/references/checklists.md +361 -0
  32. EvoScientist/skills/ml-paper-writing/references/citation-workflow.md +562 -0
  33. EvoScientist/skills/ml-paper-writing/references/reviewer-guidelines.md +367 -0
  34. EvoScientist/skills/ml-paper-writing/references/sources.md +159 -0
  35. EvoScientist/skills/ml-paper-writing/references/writing-guide.md +476 -0
  36. EvoScientist/skills/ml-paper-writing/templates/README.md +251 -0
  37. EvoScientist/skills/ml-paper-writing/templates/aaai2026/README.md +534 -0
  38. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex +144 -0
  39. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex +952 -0
  40. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bib +111 -0
  41. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bst +1493 -0
  42. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.sty +315 -0
  43. EvoScientist/skills/ml-paper-writing/templates/acl/README.md +50 -0
  44. EvoScientist/skills/ml-paper-writing/templates/acl/acl.sty +312 -0
  45. EvoScientist/skills/ml-paper-writing/templates/acl/acl_latex.tex +377 -0
  46. EvoScientist/skills/ml-paper-writing/templates/acl/acl_lualatex.tex +101 -0
  47. EvoScientist/skills/ml-paper-writing/templates/acl/acl_natbib.bst +1940 -0
  48. EvoScientist/skills/ml-paper-writing/templates/acl/anthology.bib.txt +26 -0
  49. EvoScientist/skills/ml-paper-writing/templates/acl/custom.bib +70 -0
  50. EvoScientist/skills/ml-paper-writing/templates/acl/formatting.md +326 -0
  51. EvoScientist/skills/ml-paper-writing/templates/colm2025/README.md +3 -0
  52. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bib +11 -0
  53. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bst +1440 -0
  54. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.pdf +0 -0
  55. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.sty +218 -0
  56. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.tex +305 -0
  57. EvoScientist/skills/ml-paper-writing/templates/colm2025/fancyhdr.sty +485 -0
  58. EvoScientist/skills/ml-paper-writing/templates/colm2025/math_commands.tex +508 -0
  59. EvoScientist/skills/ml-paper-writing/templates/colm2025/natbib.sty +1246 -0
  60. EvoScientist/skills/ml-paper-writing/templates/iclr2026/fancyhdr.sty +485 -0
  61. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib +24 -0
  62. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst +1440 -0
  63. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf +0 -0
  64. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty +246 -0
  65. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex +414 -0
  66. EvoScientist/skills/ml-paper-writing/templates/iclr2026/math_commands.tex +508 -0
  67. EvoScientist/skills/ml-paper-writing/templates/iclr2026/natbib.sty +1246 -0
  68. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithm.sty +79 -0
  69. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithmic.sty +201 -0
  70. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.bib +75 -0
  71. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.pdf +0 -0
  72. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.tex +662 -0
  73. EvoScientist/skills/ml-paper-writing/templates/icml2026/fancyhdr.sty +864 -0
  74. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.bst +1443 -0
  75. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.sty +767 -0
  76. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml_numpapers.pdf +0 -0
  77. EvoScientist/skills/ml-paper-writing/templates/neurips2025/Makefile +36 -0
  78. EvoScientist/skills/ml-paper-writing/templates/neurips2025/extra_pkgs.tex +53 -0
  79. EvoScientist/skills/ml-paper-writing/templates/neurips2025/main.tex +38 -0
  80. EvoScientist/skills/ml-paper-writing/templates/neurips2025/neurips.sty +382 -0
  81. EvoScientist/skills/peft/SKILL.md +431 -0
  82. EvoScientist/skills/peft/references/advanced-usage.md +514 -0
  83. EvoScientist/skills/peft/references/troubleshooting.md +480 -0
  84. EvoScientist/skills/ray-data/SKILL.md +326 -0
  85. EvoScientist/skills/ray-data/references/integration.md +82 -0
  86. EvoScientist/skills/ray-data/references/transformations.md +83 -0
  87. EvoScientist/skills/skill-creator/LICENSE.txt +202 -0
  88. EvoScientist/skills/skill-creator/SKILL.md +356 -0
  89. EvoScientist/skills/skill-creator/references/output-patterns.md +82 -0
  90. EvoScientist/skills/skill-creator/references/workflows.md +28 -0
  91. EvoScientist/skills/skill-creator/scripts/init_skill.py +303 -0
  92. EvoScientist/skills/skill-creator/scripts/package_skill.py +110 -0
  93. EvoScientist/skills/skill-creator/scripts/quick_validate.py +95 -0
  94. EvoScientist/stream/__init__.py +53 -0
  95. EvoScientist/stream/emitter.py +94 -0
  96. EvoScientist/stream/formatter.py +168 -0
  97. EvoScientist/stream/tracker.py +115 -0
  98. EvoScientist/stream/utils.py +255 -0
  99. EvoScientist/subagent.yaml +147 -0
  100. EvoScientist/tools.py +135 -0
  101. EvoScientist/utils.py +207 -0
  102. evoscientist-0.0.1.dev1.dist-info/METADATA +222 -0
  103. evoscientist-0.0.1.dev1.dist-info/RECORD +107 -0
  104. evoscientist-0.0.1.dev1.dist-info/WHEEL +5 -0
  105. evoscientist-0.0.1.dev1.dist-info/entry_points.txt +2 -0
  106. evoscientist-0.0.1.dev1.dist-info/licenses/LICENSE +21 -0
  107. evoscientist-0.0.1.dev1.dist-info/top_level.txt +1 -0
EvoScientist/skills/lm-evaluation-harness/references/api-evaluation.md
@@ -0,0 +1,490 @@
# API Evaluation

Guide to evaluating OpenAI, Anthropic, and other API-based language models.

## Overview

The lm-evaluation-harness supports evaluating API-based models through a unified `TemplateAPI` interface. This allows benchmarking of:
- OpenAI models (GPT-4, GPT-3.5, etc.)
- Anthropic models (Claude 3, Claude 2, etc.)
- Local OpenAI-compatible APIs
- Custom API endpoints

**Why evaluate API models**:
- Benchmark closed-source models
- Compare API models to open models
- Validate API performance
- Track model updates over time

## Supported API Models

| Provider | Model Type | Request Types | Logprobs |
|----------|------------|---------------|----------|
| OpenAI (completions) | `openai-completions` | All | ✅ Yes |
| OpenAI (chat) | `openai-chat-completions` | `generate_until` only | ❌ No |
| Anthropic (completions) | `anthropic-completions` | All | ❌ No |
| Anthropic (chat) | `anthropic-chat` | `generate_until` only | ❌ No |
| Local (OpenAI-compatible) | `local-completions` | Depends on server | Varies |

**Note**: Models without logprobs can only be evaluated on generation tasks, not perplexity or loglikelihood tasks.

## OpenAI Models

### Setup

```bash
export OPENAI_API_KEY=sk-...
```

### Completion Models (Legacy)

**Available models**: `davinci-002`, `babbage-002`

```bash
lm_eval --model openai-completions \
--model_args model=davinci-002 \
--tasks lambada_openai,hellaswag \
--batch_size auto
```

**Supports**:
- `generate_until`: ✅
- `loglikelihood`: ✅
- `loglikelihood_rolling`: ✅

### Chat Models

**Available models**: `gpt-4`, `gpt-4-turbo`, `gpt-3.5-turbo`

```bash
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu,gsm8k,humaneval \
--num_fewshot 5 \
--batch_size auto
```

**Supports**:
- `generate_until`: ✅
- `loglikelihood`: ❌ (no logprobs)
- `loglikelihood_rolling`: ❌

**Important**: Chat models don't provide logprobs, so they can only be used with generation tasks (MMLU, GSM8K, HumanEval), not perplexity tasks.

### Configuration Options

```bash
lm_eval --model openai-chat-completions \
--model_args \
model=gpt-4-turbo,\
base_url=https://api.openai.com/v1,\
num_concurrent=5,\
max_retries=3,\
timeout=60,\
batch_size=auto
```

**Parameters**:
- `model`: Model identifier (required)
- `base_url`: API endpoint (default: OpenAI)
- `num_concurrent`: Concurrent requests (default: 5)
- `max_retries`: Retry failed requests (default: 3)
- `timeout`: Request timeout in seconds (default: 60)
- `tokenizer`: Tokenizer to use (default: matches model)
- `tokenizer_backend`: `"tiktoken"` or `"huggingface"`

### Cost Management

OpenAI charges per token. Estimate costs before running:

```python
# Rough estimate
num_samples = 1000
avg_tokens_per_sample = 500  # input + output
cost_per_1k_tokens = 0.01  # GPT-3.5 Turbo

total_cost = (num_samples * avg_tokens_per_sample / 1000) * cost_per_1k_tokens
print(f"Estimated cost: ${total_cost:.2f}")
```

**Cost-saving tips**:
- Use `--limit N` for testing
- Start with `gpt-3.5-turbo` before `gpt-4`
- Set `max_gen_toks` to minimum needed
- Use `num_fewshot=0` for zero-shot when possible

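Pricing also usually differs between input and output tokens, so a per-direction estimate is often more accurate. A minimal sketch of that refinement; the prices below are illustrative placeholders, not current rates:

```python
# Hypothetical USD prices per 1M tokens -- substitute your provider's current rates.
PRICES = {
    "gpt-4-turbo": {"input": 10.00, "output": 30.00},
    "gpt-3.5-turbo": {"input": 0.50, "output": 1.50},
}

def estimate_cost(model, num_samples, input_tokens, output_tokens):
    """Estimate the total cost of a run, pricing input and output tokens separately."""
    price = PRICES[model]
    return num_samples * (
        input_tokens / 1e6 * price["input"] + output_tokens / 1e6 * price["output"]
    )

# Example: 1,000 samples with ~400 input and ~100 output tokens each.
print(f"Estimated cost: ${estimate_cost('gpt-4-turbo', 1000, 400, 100):.2f}")
```
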
## Anthropic Models

### Setup

```bash
export ANTHROPIC_API_KEY=sk-ant-...
```

### Completion Models (Legacy)

```bash
lm_eval --model anthropic-completions \
--model_args model=claude-2.1 \
--tasks lambada_openai,hellaswag \
--batch_size auto
```

### Chat Models (Recommended)

**Available models**: `claude-3-5-sonnet-20241022`, `claude-3-opus-20240229`, `claude-3-sonnet-20240229`, `claude-3-haiku-20240307`

```bash
lm_eval --model anthropic-chat \
--model_args model=claude-3-5-sonnet-20241022 \
--tasks mmlu,gsm8k,humaneval \
--num_fewshot 5 \
--batch_size auto
```

**Aliases**: `anthropic-chat-completions` (same as `anthropic-chat`)

### Configuration Options

```bash
lm_eval --model anthropic-chat \
--model_args \
model=claude-3-5-sonnet-20241022,\
base_url=https://api.anthropic.com,\
num_concurrent=5,\
max_retries=3,\
timeout=60
```

### Cost Management

Anthropic pricing (as of 2024):
- Claude 3.5 Sonnet: $3.00 / 1M input, $15.00 / 1M output
- Claude 3 Opus: $15.00 / 1M input, $75.00 / 1M output
- Claude 3 Haiku: $0.25 / 1M input, $1.25 / 1M output

**Budget-friendly strategy**:
```bash
# Test on small sample first
lm_eval --model anthropic-chat \
--model_args model=claude-3-haiku-20240307 \
--tasks mmlu \
--limit 100

# Then run full eval on best model
lm_eval --model anthropic-chat \
--model_args model=claude-3-5-sonnet-20241022 \
--tasks mmlu \
--num_fewshot 5
```

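Using the prices above, a rough comparison of what the same run costs on different Claude models; the workload numbers (roughly MMLU-sized at 14,000 questions, with assumed token counts per question) are illustrative assumptions, not measurements:

```python
# USD per 1M tokens, taken from the pricing list above (as of 2024).
PRICING = {
    "claude-3-5-sonnet": (3.00, 15.00),
    "claude-3-haiku": (0.25, 1.25),
}

# Assumed workload: ~400 input tokens and ~10 output tokens per question.
num_questions, input_tokens, output_tokens = 14_000, 400, 10

for model, (input_price, output_price) in PRICING.items():
    cost = (num_questions * input_tokens / 1e6) * input_price \
         + (num_questions * output_tokens / 1e6) * output_price
    print(f"{model}: ${cost:.2f}")
```
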
## Local OpenAI-Compatible APIs

Many local inference servers expose OpenAI-compatible APIs (vLLM, Text Generation Inference, llama.cpp, Ollama).

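Before pointing `lm_eval` at a local endpoint, it is worth confirming that the server actually speaks the OpenAI protocol. A minimal connectivity check using the official `openai` Python client; the URL and model ID are placeholders for whatever your server exposes:

```python
from openai import OpenAI

# Most local servers accept any API key; the base_url should end in /v1.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

# The `model=` value passed to lm_eval must match one of these IDs.
for model in client.models.list():
    print(model.id)

# One tiny completion to confirm generation works end to end.
response = client.completions.create(
    model="meta-llama/Llama-2-7b-hf",  # placeholder: use an ID printed above
    prompt="Hello",
    max_tokens=5,
)
print(response.choices[0].text)
```
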
### vLLM Local Server

**Start server**:
```bash
vllm serve meta-llama/Llama-2-7b-hf \
--host 0.0.0.0 \
--port 8000
```

**Evaluate**:
```bash
lm_eval --model local-completions \
--model_args \
model=meta-llama/Llama-2-7b-hf,\
base_url=http://localhost:8000/v1,\
num_concurrent=1 \
--tasks mmlu,gsm8k \
--batch_size auto
```

### Text Generation Inference (TGI)

**Start server**:
```bash
docker run --gpus all --shm-size 1g -p 8080:80 \
ghcr.io/huggingface/text-generation-inference:latest \
--model-id meta-llama/Llama-2-7b-hf
```

**Evaluate**:
```bash
lm_eval --model local-completions \
--model_args \
model=meta-llama/Llama-2-7b-hf,\
base_url=http://localhost:8080/v1 \
--tasks hellaswag,arc_challenge
```

### Ollama

**Start server**:
```bash
ollama serve
ollama pull llama2:7b
```

**Evaluate**:
```bash
lm_eval --model local-completions \
--model_args \
model=llama2:7b,\
base_url=http://localhost:11434/v1 \
--tasks mmlu
```

### llama.cpp Server

**Start server**:
```bash
./server -m models/llama-2-7b.gguf --host 0.0.0.0 --port 8080
```

**Evaluate**:
```bash
lm_eval --model local-completions \
--model_args \
model=llama2,\
base_url=http://localhost:8080/v1 \
--tasks gsm8k
```

## Custom API Implementation

For custom API endpoints, subclass `TemplateAPI`:

### Create `my_api.py`

```python
from lm_eval.models.api_models import TemplateAPI

class MyCustomAPI(TemplateAPI):
    """Custom API model."""

    def __init__(self, base_url, api_key, **kwargs):
        super().__init__(base_url=base_url, **kwargs)
        self.api_key = api_key

    def _create_payload(self, messages, gen_kwargs):
        """Create API request payload."""
        return {
            "messages": messages,
            "api_key": self.api_key,
            **gen_kwargs
        }

    def parse_generations(self, response):
        """Parse generation response."""
        return response.json()["choices"][0]["text"]

    def parse_logprobs(self, response):
        """Parse logprobs (if available)."""
        # Return None if the API doesn't provide logprobs
        logprobs = response.json().get("logprobs")
        if logprobs:
            return logprobs["token_logprobs"]
        return None
```

### Register and Use

```python
from lm_eval import evaluator
from my_api import MyCustomAPI

model = MyCustomAPI(
    base_url="https://api.example.com/v1",
    api_key="your-key"
)

results = evaluator.simple_evaluate(
    model=model,
    tasks=["mmlu", "gsm8k"],
    num_fewshot=5,
    batch_size="auto"
)
```

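`simple_evaluate` returns a dict whose `"results"` key maps each task to its metrics; a quick way to inspect the scores from the run above:

```python
import json

# Pretty-print per-task metrics from the evaluation above.
print(json.dumps(results["results"], indent=2))
```
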
## Comparing API and Open Models

### Side-by-Side Evaluation

```bash
# Evaluate OpenAI GPT-4
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu,gsm8k,hellaswag \
--num_fewshot 5 \
--output_path results/gpt4.json

# Evaluate open Llama 2 70B
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-70b-hf,dtype=bfloat16 \
--tasks mmlu,gsm8k,hellaswag \
--num_fewshot 5 \
--output_path results/llama2-70b.json

# Compare results
python scripts/compare_results.py \
results/gpt4.json \
results/llama2-70b.json
```

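The `scripts/compare_results.py` invocation above assumes a small helper script. If you need a stand-in, a minimal sketch that reads the `"results"` section the harness writes into its output JSON might look like this (the script name and layout are assumptions):

```python
import json
import sys

def load_results(path):
    """Read a harness output file and return its task -> metrics mapping."""
    with open(path) as f:
        return json.load(f)["results"]

def main():
    results_a, results_b = load_results(sys.argv[1]), load_results(sys.argv[2])

    # Compare every task present in both result files.
    for task in sorted(set(results_a) & set(results_b)):
        print(f"== {task} ==")
        for metric, value_a in results_a[task].items():
            value_b = results_b[task].get(metric)
            # Skip non-numeric entries such as alias strings.
            if isinstance(value_a, (int, float)) and isinstance(value_b, (int, float)):
                print(f"  {metric}: {value_a:.4f} vs {value_b:.4f} (delta {value_b - value_a:+.4f})")

if __name__ == "__main__":
    main()
```
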
### Typical Comparisons

| Model | MMLU | GSM8K | HumanEval | Cost |
|-------|------|-------|-----------|------|
| GPT-4 Turbo | 86.4% | 92.0% | 67.0% | $$$$ |
| Claude 3 Opus | 86.8% | 95.0% | 84.9% | $$$$ |
| GPT-3.5 Turbo | 70.0% | 57.1% | 48.1% | $$ |
| Llama 2 70B | 68.9% | 56.8% | 29.9% | Free (self-host) |
| Mixtral 8x7B | 70.6% | 58.4% | 40.2% | Free (self-host) |

## Best Practices

### Rate Limiting

Respect API rate limits by lowering concurrency and allowing a longer timeout:
```bash
lm_eval --model openai-chat-completions \
--model_args \
model=gpt-4-turbo,\
num_concurrent=3,\
timeout=120 \
--tasks mmlu
```

### Reproducibility

Set temperature to 0 for deterministic results:
```bash
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu \
--gen_kwargs temperature=0.0
```

Or pin a `seed` when sampling with providers that support it (OpenAI's chat API accepts a best-effort `seed` parameter):
```bash
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks gsm8k \
--gen_kwargs temperature=0.7,seed=42
```

### Caching

Caching is opt-in: pass `--use_cache` with a path to cache responses and avoid paying for redundant calls:
```bash
# First run: makes API calls and stores the responses
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu \
--limit 100 \
--use_cache ~/.cache/lm_eval/gpt4-turbo

# Second run: served from the cache (fast, free)
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu \
--limit 100 \
--use_cache ~/.cache/lm_eval/gpt4-turbo
```

Cache location: the path passed to `--use_cache` (here under `~/.cache/lm_eval/`)

### Error Handling

APIs can fail. Use retries:
```bash
lm_eval --model openai-chat-completions \
--model_args \
model=gpt-4-turbo,\
max_retries=5,\
timeout=120 \
--tasks mmlu
```

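When you control the client code yourself (for example in a custom `TemplateAPI` subclass), the same resilience can be added in Python. A generic retry-with-exponential-backoff sketch using the `tenacity` library; this is a common pattern, not something the harness itself requires:

```python
import requests
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=1, max=60))
def call_api(url, payload):
    """POST to an endpoint, retrying transient failures with exponential backoff."""
    response = requests.post(url, json=payload, timeout=120)
    response.raise_for_status()  # 429s and 5xx raise here, which triggers a retry
    return response.json()
```
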
## Troubleshooting

### "Authentication failed"

Check API key:
```bash
echo $OPENAI_API_KEY # Should print sk-...
echo $ANTHROPIC_API_KEY # Should print sk-ant-...
```

### "Rate limit exceeded"

Reduce concurrency:
```bash
--model_args num_concurrent=1
```

Or add delays between requests.

### "Timeout error"

Increase timeout:
```bash
--model_args timeout=180
```

### "Model not found"

For local APIs, verify the server is running:
```bash
curl http://localhost:8000/v1/models
```

### Cost Runaway

Use `--limit` for testing:
```bash
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu \
--limit 50 # Only 50 samples
```

## Advanced Features

### Custom Headers

```bash
lm_eval --model local-completions \
--model_args \
base_url=http://api.example.com/v1,\
header="Authorization: Bearer token,X-Custom: value"
```

### Disable SSL Verification (Development Only)

```bash
lm_eval --model local-completions \
--model_args \
base_url=https://localhost:8000/v1,\
verify_certificate=false
```

### Custom Tokenizer

```bash
lm_eval --model openai-chat-completions \
--model_args \
model=gpt-4-turbo,\
tokenizer=gpt2,\
tokenizer_backend=huggingface
```

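Token counts drive both context-window limits and billing, so it can help to check how a prompt tokenizes under the default `tiktoken` backend. A small sketch; the prompt is arbitrary:

```python
import tiktoken

# Resolve the tokenizer that tiktoken associates with a given OpenAI model.
encoding = tiktoken.encoding_for_model("gpt-4")

prompt = "Answer the following question: What is the capital of France?"
tokens = encoding.encode(prompt)
print(f"{len(tokens)} tokens")  # Useful for context-window checks and cost estimates
```
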
## References

- OpenAI API: https://platform.openai.com/docs/api-reference
- Anthropic API: https://docs.anthropic.com/claude/reference
- TemplateAPI: `lm_eval/models/api_models.py`
- OpenAI models: `lm_eval/models/openai_completions.py`
- Anthropic models: `lm_eval/models/anthropic_llms.py`