EvoScientist 0.0.1.dev2__py3-none-any.whl

Files changed (107)
  1. EvoScientist/EvoScientist.py +157 -0
  2. EvoScientist/__init__.py +24 -0
  3. EvoScientist/__main__.py +4 -0
  4. EvoScientist/backends.py +392 -0
  5. EvoScientist/cli.py +1553 -0
  6. EvoScientist/middleware.py +35 -0
  7. EvoScientist/prompts.py +277 -0
  8. EvoScientist/skills/accelerate/SKILL.md +332 -0
  9. EvoScientist/skills/accelerate/references/custom-plugins.md +453 -0
  10. EvoScientist/skills/accelerate/references/megatron-integration.md +489 -0
  11. EvoScientist/skills/accelerate/references/performance.md +525 -0
  12. EvoScientist/skills/bitsandbytes/SKILL.md +411 -0
  13. EvoScientist/skills/bitsandbytes/references/memory-optimization.md +521 -0
  14. EvoScientist/skills/bitsandbytes/references/qlora-training.md +521 -0
  15. EvoScientist/skills/bitsandbytes/references/quantization-formats.md +447 -0
  16. EvoScientist/skills/find-skills/SKILL.md +133 -0
  17. EvoScientist/skills/find-skills/scripts/install_skill.py +211 -0
  18. EvoScientist/skills/flash-attention/SKILL.md +367 -0
  19. EvoScientist/skills/flash-attention/references/benchmarks.md +215 -0
  20. EvoScientist/skills/flash-attention/references/transformers-integration.md +293 -0
  21. EvoScientist/skills/llama-cpp/SKILL.md +258 -0
  22. EvoScientist/skills/llama-cpp/references/optimization.md +89 -0
  23. EvoScientist/skills/llama-cpp/references/quantization.md +213 -0
  24. EvoScientist/skills/llama-cpp/references/server.md +125 -0
  25. EvoScientist/skills/lm-evaluation-harness/SKILL.md +490 -0
  26. EvoScientist/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  27. EvoScientist/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  28. EvoScientist/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  29. EvoScientist/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  30. EvoScientist/skills/ml-paper-writing/SKILL.md +937 -0
  31. EvoScientist/skills/ml-paper-writing/references/checklists.md +361 -0
  32. EvoScientist/skills/ml-paper-writing/references/citation-workflow.md +562 -0
  33. EvoScientist/skills/ml-paper-writing/references/reviewer-guidelines.md +367 -0
  34. EvoScientist/skills/ml-paper-writing/references/sources.md +159 -0
  35. EvoScientist/skills/ml-paper-writing/references/writing-guide.md +476 -0
  36. EvoScientist/skills/ml-paper-writing/templates/README.md +251 -0
  37. EvoScientist/skills/ml-paper-writing/templates/aaai2026/README.md +534 -0
  38. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex +144 -0
  39. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex +952 -0
  40. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bib +111 -0
  41. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bst +1493 -0
  42. EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.sty +315 -0
  43. EvoScientist/skills/ml-paper-writing/templates/acl/README.md +50 -0
  44. EvoScientist/skills/ml-paper-writing/templates/acl/acl.sty +312 -0
  45. EvoScientist/skills/ml-paper-writing/templates/acl/acl_latex.tex +377 -0
  46. EvoScientist/skills/ml-paper-writing/templates/acl/acl_lualatex.tex +101 -0
  47. EvoScientist/skills/ml-paper-writing/templates/acl/acl_natbib.bst +1940 -0
  48. EvoScientist/skills/ml-paper-writing/templates/acl/anthology.bib.txt +26 -0
  49. EvoScientist/skills/ml-paper-writing/templates/acl/custom.bib +70 -0
  50. EvoScientist/skills/ml-paper-writing/templates/acl/formatting.md +326 -0
  51. EvoScientist/skills/ml-paper-writing/templates/colm2025/README.md +3 -0
  52. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bib +11 -0
  53. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bst +1440 -0
  54. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.pdf +0 -0
  55. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.sty +218 -0
  56. EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.tex +305 -0
  57. EvoScientist/skills/ml-paper-writing/templates/colm2025/fancyhdr.sty +485 -0
  58. EvoScientist/skills/ml-paper-writing/templates/colm2025/math_commands.tex +508 -0
  59. EvoScientist/skills/ml-paper-writing/templates/colm2025/natbib.sty +1246 -0
  60. EvoScientist/skills/ml-paper-writing/templates/iclr2026/fancyhdr.sty +485 -0
  61. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib +24 -0
  62. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst +1440 -0
  63. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf +0 -0
  64. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty +246 -0
  65. EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex +414 -0
  66. EvoScientist/skills/ml-paper-writing/templates/iclr2026/math_commands.tex +508 -0
  67. EvoScientist/skills/ml-paper-writing/templates/iclr2026/natbib.sty +1246 -0
  68. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithm.sty +79 -0
  69. EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithmic.sty +201 -0
  70. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.bib +75 -0
  71. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.pdf +0 -0
  72. EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.tex +662 -0
  73. EvoScientist/skills/ml-paper-writing/templates/icml2026/fancyhdr.sty +864 -0
  74. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.bst +1443 -0
  75. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.sty +767 -0
  76. EvoScientist/skills/ml-paper-writing/templates/icml2026/icml_numpapers.pdf +0 -0
  77. EvoScientist/skills/ml-paper-writing/templates/neurips2025/Makefile +36 -0
  78. EvoScientist/skills/ml-paper-writing/templates/neurips2025/extra_pkgs.tex +53 -0
  79. EvoScientist/skills/ml-paper-writing/templates/neurips2025/main.tex +38 -0
  80. EvoScientist/skills/ml-paper-writing/templates/neurips2025/neurips.sty +382 -0
  81. EvoScientist/skills/peft/SKILL.md +431 -0
  82. EvoScientist/skills/peft/references/advanced-usage.md +514 -0
  83. EvoScientist/skills/peft/references/troubleshooting.md +480 -0
  84. EvoScientist/skills/ray-data/SKILL.md +326 -0
  85. EvoScientist/skills/ray-data/references/integration.md +82 -0
  86. EvoScientist/skills/ray-data/references/transformations.md +83 -0
  87. EvoScientist/skills/skill-creator/LICENSE.txt +202 -0
  88. EvoScientist/skills/skill-creator/SKILL.md +356 -0
  89. EvoScientist/skills/skill-creator/references/output-patterns.md +82 -0
  90. EvoScientist/skills/skill-creator/references/workflows.md +28 -0
  91. EvoScientist/skills/skill-creator/scripts/init_skill.py +303 -0
  92. EvoScientist/skills/skill-creator/scripts/package_skill.py +110 -0
  93. EvoScientist/skills/skill-creator/scripts/quick_validate.py +95 -0
  94. EvoScientist/stream/__init__.py +53 -0
  95. EvoScientist/stream/emitter.py +94 -0
  96. EvoScientist/stream/formatter.py +168 -0
  97. EvoScientist/stream/tracker.py +115 -0
  98. EvoScientist/stream/utils.py +255 -0
  99. EvoScientist/subagent.yaml +147 -0
  100. EvoScientist/tools.py +135 -0
  101. EvoScientist/utils.py +207 -0
  102. evoscientist-0.0.1.dev2.dist-info/METADATA +227 -0
  103. evoscientist-0.0.1.dev2.dist-info/RECORD +107 -0
  104. evoscientist-0.0.1.dev2.dist-info/WHEEL +5 -0
  105. evoscientist-0.0.1.dev2.dist-info/entry_points.txt +5 -0
  106. evoscientist-0.0.1.dev2.dist-info/licenses/LICENSE +21 -0
  107. evoscientist-0.0.1.dev2.dist-info/top_level.txt +1 -0
EvoScientist/skills/lm-evaluation-harness/references/custom-tasks.md
@@ -0,0 +1,602 @@
# Custom Tasks

Complete guide to creating domain-specific evaluation tasks in lm-evaluation-harness.

## Overview

Custom tasks allow you to evaluate models on your own datasets and metrics. Tasks are defined using YAML configuration files with optional Python utilities for complex logic.

**Why create custom tasks**:
- Evaluate on proprietary/domain-specific data
- Test specific capabilities not covered by existing benchmarks
- Create evaluation pipelines for internal models
- Reproduce research experiments

## Quick Start

### Minimal Custom Task

Create `my_tasks/simple_qa.yaml`:

```yaml
task: simple_qa
dataset_path: data/simple_qa.jsonl
output_type: generate_until
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
```

**Run it**:
```bash
lm_eval --model hf \
    --model_args pretrained=meta-llama/Llama-2-7b-hf \
    --tasks simple_qa \
    --include_path my_tasks/
```
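
The same run can also be driven from Python. A minimal sketch, assuming a recent harness version that exposes `lm_eval.simple_evaluate` and `lm_eval.tasks.TaskManager` (whose `include_path` argument plays the role of `--include_path`):

```python
# Sketch: run the custom task programmatically instead of via the CLI.
# Assumes lm_eval.simple_evaluate and TaskManager(include_path=...) are
# available in the installed lm-evaluation-harness version.
import lm_eval
from lm_eval.tasks import TaskManager

task_manager = TaskManager(include_path="my_tasks/")

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-2-7b-hf",
    tasks=["simple_qa"],
    task_manager=task_manager,
    limit=10,  # keep the sample small while iterating on the config
)
print(results["results"]["simple_qa"])
```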

## Task Configuration Reference

### Essential Fields

```yaml
# Task identification
task: my_custom_task        # Unique task name (required)
task_alias: "My Task"       # Display name
tag:                        # Tags for grouping
  - custom
  - domain_specific

# Dataset configuration
dataset_path: data/my_data.jsonl   # HuggingFace dataset or local path
dataset_name: default              # Subset name (if applicable)
training_split: train
validation_split: validation
test_split: test

# Evaluation configuration
output_type: generate_until   # or loglikelihood, multiple_choice
num_fewshot: 5                # Number of few-shot examples
batch_size: auto              # Batch size

# Prompt templates (Jinja2)
doc_to_text: "Question: {{question}}"
doc_to_target: "{{answer}}"

# Metrics
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true

# Metadata
metadata:
  version: 1.0
```

### Output Types

**`generate_until`**: Free-form generation
```yaml
output_type: generate_until
generation_kwargs:
  max_gen_toks: 256
  until:
    - "\n"
    - "."
  temperature: 0.0
```

**`loglikelihood`**: Compute log probability of targets
```yaml
output_type: loglikelihood
# Used for perplexity, classification
```

**`multiple_choice`**: Choose from options
```yaml
output_type: multiple_choice
doc_to_choice: "{{choices}}"   # List of choices
```

## Data Formats

### Local JSONL File

`data/my_data.jsonl`:
```json
{"question": "What is 2+2?", "answer": "4"}
{"question": "Capital of France?", "answer": "Paris"}
```

**Task config**:
```yaml
dataset_path: data/my_data.jsonl
dataset_kwargs:
  data_files:
    test: data/my_data.jsonl
```
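
If the dataset lives in memory (for example, exported from an internal system), one JSON object per line is all the harness needs. A small sketch using only the standard library; the records and path mirror the example above, and the field names must match the placeholders used in `doc_to_text` / `doc_to_target`:

```python
# Sketch: write an in-memory list of QA pairs to the JSONL file
# referenced by dataset_path above.
import json
from pathlib import Path

records = [
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "Capital of France?", "answer": "Paris"},
]

path = Path("data/my_data.jsonl")
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
```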

### HuggingFace Dataset

```yaml
dataset_path: squad
dataset_name: plain_text
test_split: validation
```

### CSV File

`data/my_data.csv`:
```csv
question,answer,category
What is 2+2?,4,math
Capital of France?,Paris,geography
```

**Task config**:
```yaml
dataset_path: data/my_data.csv
dataset_kwargs:
  data_files:
    test: data/my_data.csv
```

## Prompt Engineering

### Simple Template

```yaml
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}"
```

### Conditional Logic

```yaml
doc_to_text: |
  {% if context %}
  Context: {{context}}
  {% endif %}
  Question: {{question}}
  Answer:
```

### Multiple Choice

```yaml
doc_to_text: |
  Question: {{question}}
  A. {{choices[0]}}
  B. {{choices[1]}}
  C. {{choices[2]}}
  D. {{choices[3]}}
  Answer:

doc_to_target: "{{ 'ABCD'[answer_idx] }}"
doc_to_choice: ["A", "B", "C", "D"]
```

### Few-Shot Formatting

```yaml
fewshot_delimiter: "\n\n"   # Between examples
target_delimiter: " "       # Between question and answer
doc_to_text: "Q: {{question}}"
doc_to_target: "A: {{answer}}"
```
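
To make the two delimiters concrete, the following plain-Python sketch assembles the kind of few-shot prompt these settings produce. It is illustrative only; the harness builds this string internally, and the exact trailing whitespace may differ by version:

```python
# Sketch: how fewshot_delimiter and target_delimiter combine examples.
fewshot_delimiter = "\n\n"   # between complete examples
target_delimiter = " "       # between rendered doc_to_text and doc_to_target

examples = [
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "Capital of France?", "answer": "Paris"},
]
test_doc = {"question": "Largest planet?"}

rendered = [
    f"Q: {ex['question']}" + target_delimiter + f"A: {ex['answer']}"
    for ex in examples
]
prompt = fewshot_delimiter.join(rendered) + fewshot_delimiter + f"Q: {test_doc['question']}"
print(prompt)
```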

## Custom Python Functions

For complex logic, use Python functions in `utils.py`.

### Create `my_tasks/utils.py`

```python
def process_docs(dataset):
    """Preprocess documents."""
    def _process(doc):
        # Custom preprocessing
        doc["question"] = doc["question"].strip().lower()
        return doc

    return dataset.map(_process)

def doc_to_text(doc):
    """Custom prompt formatting."""
    context = doc.get("context", "")
    question = doc["question"]

    if context:
        return f"Context: {context}\nQuestion: {question}\nAnswer:"
    return f"Question: {question}\nAnswer:"

def doc_to_target(doc):
    """Custom target extraction."""
    return doc["answer"].strip().lower()

def aggregate_scores(items):
    """Custom metric aggregation."""
    correct = sum(1 for item in items if item == 1.0)
    total = len(items)
    return correct / total if total > 0 else 0.0
```

### Use in Task Config

```yaml
task: my_custom_task
dataset_path: data/my_data.jsonl

# Use Python functions
process_docs: !function utils.process_docs
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target

metric_list:
  - metric: exact_match
    aggregation: !function utils.aggregate_scores
    higher_is_better: true
```
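
Because these are ordinary Python functions, they can be exercised directly before being wired into YAML. A quick sketch, assuming the Hugging Face `datasets` library is installed and that it is run from inside `my_tasks/` so `import utils` resolves:

```python
# Sketch: call the utils.py functions on a tiny in-memory dataset
# before running the full harness.
from datasets import Dataset

import utils  # the my_tasks/utils.py module shown above

docs = Dataset.from_list([
    {"question": "  What is 2+2?  ", "context": "", "answer": "4"},
])

processed = utils.process_docs(docs)
doc = processed[0]
assert doc["question"] == "what is 2+2?"
print(utils.doc_to_text(doc))    # "Question: what is 2+2?\nAnswer:"
print(utils.doc_to_target(doc))  # "4"
```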

## Real-World Examples

### Example 1: Domain QA Task

**Goal**: Evaluate medical question answering.

`medical_qa/medical_qa.yaml`:
```yaml
task: medical_qa
dataset_path: data/medical_qa.jsonl
output_type: generate_until
num_fewshot: 3

doc_to_text: |
  Medical Question: {{question}}
  Context: {{context}}
  Answer (be concise):

doc_to_target: "{{answer}}"

generation_kwargs:
  max_gen_toks: 100
  until:
    - "\n\n"
  temperature: 0.0

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.medical_f1
    aggregation: mean
    higher_is_better: true

filter_list:
  - name: lowercase
    filter:
      - function: lowercase
      - function: remove_whitespace

metadata:
  version: 1.0
  domain: medical
```

`medical_qa/utils.py`:
```python
from sklearn.metrics import f1_score
import re

def medical_f1(predictions, references):
    """Custom F1 for medical terms."""
    pred_terms = set(extract_medical_terms(predictions[0]))
    ref_terms = set(extract_medical_terms(references[0]))

    if not pred_terms and not ref_terms:
        return 1.0
    if not pred_terms or not ref_terms:
        return 0.0

    tp = len(pred_terms & ref_terms)
    fp = len(pred_terms - ref_terms)
    fn = len(ref_terms - pred_terms)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

def extract_medical_terms(text):
    """Extract medical terminology."""
    # Custom logic
    return re.findall(r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)*\b', text)
```
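
A quick sanity check of the metric on hand-written strings helps catch regressions in the term extractor. A sketch that exercises the functions above (run from inside `medical_qa/` so the import resolves):

```python
# Sketch: spot-check medical_f1 on known inputs before trusting its scores.
from utils import medical_f1

# Identical term sets -> perfect F1.
assert medical_f1(["Aspirin reduces Fever"], ["Aspirin treats Fever"]) == 1.0

# No capitalized terms on either side is treated as a perfect match.
assert medical_f1(["no capitalized terms here"], ["none here either"]) == 1.0

# Partial overlap -> score strictly between 0 and 1.
score = medical_f1(["Aspirin and Ibuprofen"], ["Aspirin only"])
assert 0.0 < score < 1.0
print(score)
```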

### Example 2: Code Evaluation

`code_eval/python_challenges.yaml`:
```yaml
task: python_challenges
dataset_path: data/python_problems.jsonl
output_type: generate_until
num_fewshot: 0

doc_to_text: |
  Write a Python function to solve:
  {{problem_statement}}

  Function signature:
  {{function_signature}}

doc_to_target: "{{canonical_solution}}"

generation_kwargs:
  max_gen_toks: 512
  until:
    - "\n\nclass"
    - "\n\ndef"
  temperature: 0.2

metric_list:
  - metric: !function utils.execute_code
    aggregation: mean
    higher_is_better: true

process_results: !function utils.process_code_results

metadata:
  version: 1.0
```

`code_eval/utils.py`:
```python
import subprocess
import json

def execute_code(predictions, references):
    """Execute generated code against test cases."""
    generated_code = predictions[0]
    test_cases = json.loads(references[0])

    try:
        # Execute code with test cases
        for test_input, expected_output in test_cases:
            result = execute_with_timeout(generated_code, test_input, timeout=5)
            if result != expected_output:
                return 0.0
        return 1.0
    except Exception:
        return 0.0

def execute_with_timeout(code, input_data, timeout=5):
    """Safely execute code with timeout."""
    # Implementation with subprocess and timeout
    pass

def process_code_results(doc, results):
    """Process code execution results."""
    return {
        "passed": results[0] == 1.0,
        "generated_code": results[1]
    }
```
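
`execute_with_timeout` is left as a stub above. One way to fill it in, sketched here under the assumption that each generated solution reads its test input from stdin and prints its result; a real evaluation pipeline would want stronger sandboxing than a bare subprocess:

```python
# Sketch: run untrusted generated code in a subprocess with a wall-clock
# timeout. Assumes the generated program reads input from stdin and prints
# its answer; this is NOT a security sandbox.
import subprocess
import sys

def execute_with_timeout(code, input_data, timeout=5):
    """Run `code` in a fresh Python process, feeding `input_data` on stdin."""
    try:
        completed = subprocess.run(
            [sys.executable, "-c", code],
            input=str(input_data),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return None
    if completed.returncode != 0:
        return None
    return completed.stdout.strip()
```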

### Example 3: Instruction Following

`instruction_eval/instruction_eval.yaml`:
```yaml
task: instruction_following
dataset_path: data/instructions.jsonl
output_type: generate_until
num_fewshot: 0

doc_to_text: |
  Instruction: {{instruction}}
  {% if constraints %}
  Constraints: {{constraints}}
  {% endif %}
  Response:

doc_to_target: "{{expected_response}}"

generation_kwargs:
  max_gen_toks: 256
  temperature: 0.7

metric_list:
  - metric: !function utils.check_constraints
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.semantic_similarity
    aggregation: mean
    higher_is_better: true

process_docs: !function utils.add_constraint_checkers
```

`instruction_eval/utils.py`:
```python
import json

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

def check_constraints(predictions, references):
    """Check if response satisfies constraints."""
    response = predictions[0]
    constraints = json.loads(references[0])

    satisfied = 0
    total = len(constraints)

    for constraint in constraints:
        if verify_constraint(response, constraint):
            satisfied += 1

    return satisfied / total if total > 0 else 1.0

def verify_constraint(response, constraint):
    """Verify single constraint."""
    if constraint["type"] == "length":
        return len(response.split()) >= constraint["min_words"]
    elif constraint["type"] == "contains":
        return constraint["keyword"] in response.lower()
    # Add more constraint types
    return True

def semantic_similarity(predictions, references):
    """Compute semantic similarity."""
    pred_embedding = model.encode(predictions[0])
    ref_embedding = model.encode(references[0])
    return float(util.cos_sim(pred_embedding, ref_embedding))

def add_constraint_checkers(dataset):
    """Parse constraints into verifiable format."""
    def _parse(doc):
        # Parse constraint string into structured format
        doc["parsed_constraints"] = parse_constraints(doc.get("constraints", ""))
        return doc
    return dataset.map(_parse)
```
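
`parse_constraints` is referenced by `add_constraint_checkers` but not defined above. A minimal sketch, assuming constraints arrive as a semicolon-separated string such as `min_words=50; contains=citation` (adapt the parsing to whatever format your data actually uses):

```python
# Sketch: turn a constraint string like "min_words=50; contains=citation"
# into the structured records that verify_constraint expects.
def parse_constraints(raw):
    """Parse 'key=value; key=value' into a list of constraint dicts."""
    constraints = []
    for part in raw.split(";"):
        part = part.strip()
        if not part or "=" not in part:
            continue
        key, value = (s.strip() for s in part.split("=", 1))
        if key == "min_words":
            constraints.append({"type": "length", "min_words": int(value)})
        elif key == "contains":
            constraints.append({"type": "contains", "keyword": value.lower()})
        # Unknown keys are ignored; add more constraint types as needed.
    return constraints
```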

## Advanced Features

### Output Filtering

```yaml
filter_list:
  - name: extract_answer
    filter:
      - function: regex
        regex_pattern: "Answer: (.*)"
        group: 1
      - function: lowercase
      - function: strip_whitespace
```
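
The equivalent post-processing in plain Python, to make the filter semantics concrete. This is a sketch of what the chain above does to one raw generation before scoring, not the harness's internal implementation:

```python
# Sketch: what the extract_answer filter chain does to one raw output.
import re

raw_output = "Let me think... Answer: Paris\nExplanation: it is the capital."

match = re.search(r"Answer: (.*)", raw_output)     # function: regex, group 1
extracted = match.group(1) if match else raw_output
extracted = extracted.lower()                      # function: lowercase
extracted = extracted.strip()                      # function: strip_whitespace
print(extracted)  # "paris"
```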

### Multiple Metrics

```yaml
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: mean
    higher_is_better: true
  - metric: bleu
    aggregation: mean
    higher_is_better: true
```

### Task Groups

Create `my_tasks/_default.yaml`:
```yaml
group: my_eval_suite
task:
  - simple_qa
  - medical_qa
  - python_challenges
```

**Run entire suite**:
```bash
lm_eval --model hf \
    --model_args pretrained=meta-llama/Llama-2-7b-hf \
    --tasks my_eval_suite \
    --include_path my_tasks/
```

## Testing Your Task

### Validate Configuration

```bash
# Test task loading
lm_eval --tasks my_custom_task --include_path my_tasks/ --limit 0

# Run on 5 samples
lm_eval --model hf \
    --model_args pretrained=gpt2 \
    --tasks my_custom_task \
    --include_path my_tasks/ \
    --limit 5
```

### Debug Mode

```bash
lm_eval --model hf \
    --model_args pretrained=gpt2 \
    --tasks my_custom_task \
    --include_path my_tasks/ \
    --limit 1 \
    --log_samples   # Save input/output samples
```

## Best Practices

1. **Start simple**: Test with a minimal config first
2. **Version your tasks**: Use `metadata.version`
3. **Document your metrics**: Explain custom metrics in comments
4. **Test with multiple models**: Ensure robustness
5. **Validate on known examples**: Include sanity checks (see the test sketch after this list)
6. **Use filters carefully**: They can hide errors
7. **Handle edge cases**: Empty strings, missing fields

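For points 5 and 7, a small test file kept next to the task config makes the checks repeatable. A pytest-style sketch against the `my_tasks/utils.py` shown earlier (the function names follow that example; adapt them to your own module):

```python
# Sketch: my_tasks/test_utils.py -- sanity checks for the prompt functions.
# Run with `pytest my_tasks/`.
import utils

def test_doc_to_text_without_context():
    doc = {"question": "Capital of France?", "context": ""}
    assert utils.doc_to_text(doc) == "Question: Capital of France?\nAnswer:"

def test_doc_to_target_is_normalized():
    assert utils.doc_to_target({"answer": "  Paris "}) == "paris"

def test_handles_missing_context_field():
    # Edge case: documents without a "context" key at all.
    doc = {"question": "What is 2+2?"}
    assert utils.doc_to_text(doc).startswith("Question:")
```
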
## Common Patterns

### Classification Task

```yaml
output_type: loglikelihood
doc_to_text: "Text: {{text}}\nLabel:"
doc_to_target: " {{label}}"   # Leading space matters: the target is scored as a continuation of the prompt
metric_list:
  - metric: acc
    aggregation: mean
```

### Perplexity Evaluation

```yaml
output_type: loglikelihood_rolling
doc_to_text: "{{text}}"
metric_list:
  - metric: perplexity
    aggregation: perplexity
```

### Ranking Task

```yaml
output_type: loglikelihood
doc_to_text: "Query: {{query}}\nPassage: {{passage}}\nRelevant:"
doc_to_target: [" Yes", " No"]
metric_list:
  - metric: acc
    aggregation: mean
```

## Troubleshooting

**"Task not found"**: Check `--include_path` and the task name

**Empty results**: Verify the `doc_to_text` and `doc_to_target` templates

**Metric errors**: Ensure metric names are spelled exactly (`exact_match`, not `exact-match`)

**Filter issues**: Test filters with `--log_samples`

**Python function not found**: Check the `!function module.function_name` syntax

## References

- Task system: EleutherAI/lm-evaluation-harness docs
- Example tasks: `lm_eval/tasks/` directory
- TaskConfig: `lm_eval/api/task.py`