eval-framework 0.2.1.tar.gz → 0.2.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. {eval_framework-0.2.1 → eval_framework-0.2.3}/PKG-INFO +54 -35
  2. {eval_framework-0.2.1 → eval_framework-0.2.3}/README.md +51 -34
  3. {eval_framework-0.2.1 → eval_framework-0.2.3}/pyproject.toml +11 -2
  4. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/determined.py +15 -5
  5. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/eval.py +4 -0
  6. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/local.py +4 -2
  7. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/aleph_alpha.py +13 -1
  8. eval_framework-0.2.3/src/eval_framework/llm/base.py +180 -0
  9. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/huggingface.py +99 -53
  10. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/mistral.py +25 -10
  11. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/openai.py +24 -3
  12. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/vllm.py +94 -43
  13. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/main.py +31 -62
  14. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/json_format.py +9 -1
  15. eval_framework-0.2.3/src/eval_framework/metrics/llm/base.py +33 -0
  16. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +20 -21
  17. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +20 -21
  18. eval_framework-0.2.3/src/eval_framework/metrics/loglikelihood/base.py +50 -0
  19. eval_framework-0.2.3/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
  20. eval_framework-0.2.3/src/eval_framework/metrics/loglikelihood/dcs.py +43 -0
  21. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/loglikelihood/probability_mass.py +9 -12
  22. eval_framework-0.2.3/src/eval_framework/metrics/loglikelihood/ternary.py +42 -0
  23. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/response_generator.py +6 -2
  24. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/result_processors/base.py +14 -0
  25. eval_framework-0.2.3/src/eval_framework/result_processors/hf_uploader.py +75 -0
  26. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/result_processors/result_processor.py +7 -7
  27. eval_framework-0.2.3/src/eval_framework/result_processors/wandb_uploader.py +141 -0
  28. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/run.py +26 -0
  29. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/gsm8k.py +7 -5
  30. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/openbookqa.py +25 -3
  31. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +3 -3
  32. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/sciq.py +22 -1
  33. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/sphyr.py +6 -2
  34. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/truthfulqa.py +5 -5
  35. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/eval_config.py +27 -4
  36. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/task_names.py +3 -1
  37. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/file_ops.py +61 -40
  38. eval_framework-0.2.1/src/eval_framework/llm/base.py +0 -97
  39. eval_framework-0.2.1/src/eval_framework/metrics/llm/base.py +0 -8
  40. eval_framework-0.2.1/src/eval_framework/result_processors/hf_processor.py +0 -87
  41. eval_framework-0.2.1/src/template_formatting/tests/test_formatter_eval.py +0 -408
  42. eval_framework-0.2.1/src/template_formatting/tests/test_formatter_scaling.py +0 -253
  43. eval_framework-0.2.1/src/template_formatting/tests/test_mistral_formatter.py +0 -136
  44. {eval_framework-0.2.1 → eval_framework-0.2.3}/LICENSE +0 -0
  45. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/__init__.py +0 -0
  46. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/base_config.py +0 -0
  47. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/__init__.py +0 -0
  48. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/evaluation_generator.py +0 -0
  49. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/exceptions.py +0 -0
  50. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  51. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  52. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  53. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  54. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  55. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/__init__.py +0 -0
  56. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/models.py +0 -0
  57. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/logger.py +0 -0
  58. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/__init__.py +0 -0
  59. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/base.py +0 -0
  60. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/__init__.py +0 -0
  61. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  62. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/bleu.py +0 -0
  63. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/chrf.py +0 -0
  64. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  65. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  66. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/comet.py +0 -0
  67. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  68. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  69. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  70. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  71. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/f1.py +0 -0
  72. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  73. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  74. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  75. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  76. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/length_control.py +0 -0
  77. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  78. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  79. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  80. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/repetition.py +0 -0
  81. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  82. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  83. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  84. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  85. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  86. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/ter.py +0 -0
  87. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  88. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  89. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  90. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/__init__.py +0 -0
  91. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  92. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  93. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  94. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  95. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  96. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  97. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  98. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  99. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  100. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  101. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  102. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  103. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  104. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  105. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  106. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  107. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  108. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  109. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  110. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  111. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  112. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  113. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  114. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/py.typed +0 -0
  115. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/result_processors/__init__.py +0 -0
  116. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/run_direct.py +0 -0
  117. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/shared/types.py +0 -0
  118. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/__init__.py +0 -0
  119. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/base.py +0 -0
  120. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  121. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  122. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  123. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  124. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  125. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  126. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  127. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  128. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  129. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  130. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  131. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  132. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  133. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  134. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  135. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  136. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  137. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  138. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  139. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
  140. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  141. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  142. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  143. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  144. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  145. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  146. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  147. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  148. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  149. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  150. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  151. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  152. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  153. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  154. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  155. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  156. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  157. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/perturbation.py +0 -0
  158. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/registry.py +0 -0
  159. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/task_loader.py +0 -0
  160. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/utils.py +0 -0
  161. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/constants.py +0 -0
  162. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/generate_task_docs.py +0 -0
  163. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/helpers.py +0 -0
  164. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/logging.py +0 -0
  165. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/packaging.py +0 -0
  166. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/README.md +0 -0
  167. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/__init__.py +0 -0
  168. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/formatter.py +0 -0
  169. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/mistral_formatter.py +0 -0
  170. {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/py.typed +0 -0

{eval_framework-0.2.1 → eval_framework-0.2.3}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: eval-framework
- Version: 0.2.1
+ Version: 0.2.3
  Summary: Evalulation Framework
  Author: Aleph Alpha Research
  License: Apache License
@@ -233,6 +233,7 @@ Requires-Dist: jsonlines>=4,<5
  Requires-Dist: lxml>=6,<7
  Requires-Dist: python-iso639>=2025.2.18
  Requires-Dist: wandb>=0.21.1,<1
+ Requires-Dist: boto3>=1.40.54,<2
  Requires-Dist: accelerate ; extra == 'accelerate'
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
  Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
@@ -248,6 +249,7 @@ Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
  Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
  Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
+ Requires-Dist: accelerate>=0.30.0,<1 ; extra == 'transformers'
  Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
  Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
  Requires-Python: >=3.12, <3.13
@@ -305,13 +307,25 @@ There are optional extras available to unlock specific features of the library:
 
  As a short hand, the `all` extra installs all of the above.
 
- For development, you can instead install it directly from the repository. Please first install
- [uv](https://docs.astral.sh/uv/getting-started/installation/)
+ We use `uv` to better resolve dependencies when downloading the extras. You can install uv with:
+ ```bash
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ ```
+ or by follwing the `uv` [installation docs.](https://docs.astral.sh/uv/getting-started/installation/)
 
- To install the project with all optional extras use
+ Now, you can safely install the project with all optional extras:
  ```bash
  uv sync --all-extras
  ```
+ or with pip
+ ```bash
+ uv pip install eval_framework[all]
+ ```
+
+ Tip: ensure python is properly installed with uv:
+ ```
+ uv python install 3.12 --reinstall
+ ```
 
  We provide custom groups to control optional extras.
  - `flash_attn`: Install `flash_attn` with correct handling of build isolation
@@ -327,8 +341,9 @@ To evaluate a single benchmark locally, you can use the following command:
  eval_framework \
      --models src/eval_framework/llm/models.py \
      --llm-name Smollm135MInstruct \
-     --task-name "GSM8K" \
-     --output-dir ./eval \
+     --task-name "MMLU" \
+     --task-subjects "abstract_algebra" \
+     --output-dir ./eval_results \
      --num-fewshot 5 \
      --num-samples 10
  ```
@@ -414,35 +429,37 @@ pip install eval_framework[transformers]
 
  2. **Create and run your first evaluation using HuggingFace model**:
 
- ```python
- from pathlib import Path
-
- from eval_framework.llm.huggingface import HFLLM
- from eval_framework.main import main
- from eval_framework.tasks.eval_config import EvalConfig
- from template_formatting.formatter import HFFormatter
-
- # Define your model
- class MyHuggingFaceModel(HFLLM):
-     LLM_NAME = "microsoft/DialoGPT-medium"
-     DEFAULT_FORMATTER = partial(HFFormatter, "microsoft/DialoGPT-medium")
-
- if __name__ == "__main__":
-     # Initialize your model
-     llm = MyHuggingFaceModel()
-
-     # Running evaluation on GSM8K task using 5 few-shot examples and 10 samples
-     config = EvalConfig(
-         output_dir=Path("./eval_results"),
-         num_fewshot=5,
-         num_samples=10,
-         task_name="GSM8K",
-         llm_class=MyHuggingFaceModel,
-     )
-
-     # Run evaluation and get results
-     results = main(llm=llm, config=config)
- ```
+ ```python
+ from functools import partial
+ from pathlib import Path
+
+ from eval_framework.llm.huggingface import HFLLM
+ from eval_framework.main import main
+ from eval_framework.tasks.eval_config import EvalConfig
+ from template_formatting.formatter import HFFormatter
+
+ # Define your model
+ class MyHuggingFaceModel(HFLLM):
+     LLM_NAME = "microsoft/DialoGPT-medium"
+     DEFAULT_FORMATTER = partial(HFFormatter, "microsoft/DialoGPT-medium")
+
+ if __name__ == "__main__":
+     # Initialize your model
+     llm = MyHuggingFaceModel()
+
+     # Running evaluation on MMLU abstract algebra task using 5 few-shot examples and 10 samples
+     config = EvalConfig(
+         output_dir=Path("./eval_results"),
+         num_fewshot=5,
+         num_samples=10,
+         task_name="MMLU",
+         task_subjects=["abstract_algebra", "astronomy"],
+         llm_class=MyHuggingFaceModel,
+     )
+
+     # Run evaluation and get results
+     results = main(llm=llm, config=config)
+ ```
 
  3. **Review results** - Check `./eval_results/` for detailed outputs and use our [results guide](docs/understanding_results_guide.md) to interpret them
 
@@ -450,6 +467,7 @@ pip install eval_framework[transformers]
 
  - **Use CLI interface**: See [CLI usage guide](docs/cli_usage.md) for command-line evaluation options
  - **Evaluate HuggingFace models**: Follow our [HuggingFace evaluation guide](docs/evaluate_huggingface_model.md)
+ - **Understand model arguments**: Read out [Model Arguments guide](docs/model_arguments.md)
  - **Create custom benchmarks**: Follow our [benchmark creation guide](docs/add_new_benchmark_guide.md)
  - **Scale your evaluations**: Use [Determined AI integration](docs/using_determined.md) for distributed evaluation
  - **Understand your results**: Read our [results interpretation guide](docs/understanding_results_guide.md)
@@ -465,6 +483,7 @@ pip install eval_framework[transformers]
 
  ### Advanced Usage
 
+ - **[Understanding Model Arguments](docs/model_arguments.md)** - Thorough guide on each constructor argument for salient model classes
  - **[Adding New Benchmarks](docs/add_new_benchmark_guide.md)** - Complete guide with practical examples for adding new benchmarks
  - **[Benchmarks and Metrics](docs/benchmarks_and_metrics.md)** - Comprehensive overview of all available benchmarks and evaluation metrics
  - **[Overview of Dataloading](docs/overview_dataloading.md)** - Explanation of dataloading and task/sample/message structure

{eval_framework-0.2.1 → eval_framework-0.2.3}/README.md

@@ -39,13 +39,25 @@ There are optional extras available to unlock specific features of the library:
 
  As a short hand, the `all` extra installs all of the above.
 
- For development, you can instead install it directly from the repository. Please first install
- [uv](https://docs.astral.sh/uv/getting-started/installation/)
+ We use `uv` to better resolve dependencies when downloading the extras. You can install uv with:
+ ```bash
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ ```
+ or by follwing the `uv` [installation docs.](https://docs.astral.sh/uv/getting-started/installation/)
 
- To install the project with all optional extras use
+ Now, you can safely install the project with all optional extras:
  ```bash
  uv sync --all-extras
  ```
+ or with pip
+ ```bash
+ uv pip install eval_framework[all]
+ ```
+
+ Tip: ensure python is properly installed with uv:
+ ```
+ uv python install 3.12 --reinstall
+ ```
 
  We provide custom groups to control optional extras.
  - `flash_attn`: Install `flash_attn` with correct handling of build isolation
@@ -61,8 +73,9 @@ To evaluate a single benchmark locally, you can use the following command:
  eval_framework \
      --models src/eval_framework/llm/models.py \
      --llm-name Smollm135MInstruct \
-     --task-name "GSM8K" \
-     --output-dir ./eval \
+     --task-name "MMLU" \
+     --task-subjects "abstract_algebra" \
+     --output-dir ./eval_results \
      --num-fewshot 5 \
      --num-samples 10
  ```
@@ -148,35 +161,37 @@ pip install eval_framework[transformers]
 
  2. **Create and run your first evaluation using HuggingFace model**:
 
- ```python
- from pathlib import Path
-
- from eval_framework.llm.huggingface import HFLLM
- from eval_framework.main import main
- from eval_framework.tasks.eval_config import EvalConfig
- from template_formatting.formatter import HFFormatter
-
- # Define your model
- class MyHuggingFaceModel(HFLLM):
-     LLM_NAME = "microsoft/DialoGPT-medium"
-     DEFAULT_FORMATTER = partial(HFFormatter, "microsoft/DialoGPT-medium")
-
- if __name__ == "__main__":
-     # Initialize your model
-     llm = MyHuggingFaceModel()
-
-     # Running evaluation on GSM8K task using 5 few-shot examples and 10 samples
-     config = EvalConfig(
-         output_dir=Path("./eval_results"),
-         num_fewshot=5,
-         num_samples=10,
-         task_name="GSM8K",
-         llm_class=MyHuggingFaceModel,
-     )
-
-     # Run evaluation and get results
-     results = main(llm=llm, config=config)
- ```
+ ```python
+ from functools import partial
+ from pathlib import Path
+
+ from eval_framework.llm.huggingface import HFLLM
+ from eval_framework.main import main
+ from eval_framework.tasks.eval_config import EvalConfig
+ from template_formatting.formatter import HFFormatter
+
+ # Define your model
+ class MyHuggingFaceModel(HFLLM):
+     LLM_NAME = "microsoft/DialoGPT-medium"
+     DEFAULT_FORMATTER = partial(HFFormatter, "microsoft/DialoGPT-medium")
+
+ if __name__ == "__main__":
+     # Initialize your model
+     llm = MyHuggingFaceModel()
+
+     # Running evaluation on MMLU abstract algebra task using 5 few-shot examples and 10 samples
+     config = EvalConfig(
+         output_dir=Path("./eval_results"),
+         num_fewshot=5,
+         num_samples=10,
+         task_name="MMLU",
+         task_subjects=["abstract_algebra", "astronomy"],
+         llm_class=MyHuggingFaceModel,
+     )
+
+     # Run evaluation and get results
+     results = main(llm=llm, config=config)
+ ```
 
  3. **Review results** - Check `./eval_results/` for detailed outputs and use our [results guide](docs/understanding_results_guide.md) to interpret them
 
@@ -184,6 +199,7 @@ pip install eval_framework[transformers]
 
  - **Use CLI interface**: See [CLI usage guide](docs/cli_usage.md) for command-line evaluation options
  - **Evaluate HuggingFace models**: Follow our [HuggingFace evaluation guide](docs/evaluate_huggingface_model.md)
+ - **Understand model arguments**: Read out [Model Arguments guide](docs/model_arguments.md)
  - **Create custom benchmarks**: Follow our [benchmark creation guide](docs/add_new_benchmark_guide.md)
  - **Scale your evaluations**: Use [Determined AI integration](docs/using_determined.md) for distributed evaluation
  - **Understand your results**: Read our [results interpretation guide](docs/understanding_results_guide.md)
@@ -199,6 +215,7 @@ pip install eval_framework[transformers]
 
  ### Advanced Usage
 
+ - **[Understanding Model Arguments](docs/model_arguments.md)** - Thorough guide on each constructor argument for salient model classes
  - **[Adding New Benchmarks](docs/add_new_benchmark_guide.md)** - Complete guide with practical examples for adding new benchmarks
  - **[Benchmarks and Metrics](docs/benchmarks_and_metrics.md)** - Comprehensive overview of all available benchmarks and evaluation metrics
  - **[Overview of Dataloading](docs/overview_dataloading.md)** - Explanation of dataloading and task/sample/message structure
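
Note: the Python quickstart above passes several MMLU subjects while the CLI example passes one. A sketch of the equivalent multi-subject CLI call, assuming `--task-subjects` accepts multiple values (the flag name comes from the diff; the multi-value syntax is an assumption):

```bash
# Hypothetical multi-subject invocation; flag syntax is assumed, not confirmed by the diff
eval_framework \
  --models src/eval_framework/llm/models.py \
  --llm-name Smollm135MInstruct \
  --task-name "MMLU" \
  --task-subjects "abstract_algebra" "astronomy" \
  --output-dir ./eval_results \
  --num-fewshot 5 \
  --num-samples 10
```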

{eval_framework-0.2.1 → eval_framework-0.2.3}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "eval-framework"
- version = "0.2.1"
+ version = "0.2.3"
  description = "Evalulation Framework"
  readme = "README.md"
  license = { file = "LICENSE" }
@@ -40,6 +40,7 @@ dependencies = [
      "lxml>=6,<7",
      "python-iso639>=2025.2.18",
      "wandb>=0.21.1,<1",
+     "boto3>=1.40.54,<2",
  ]
 
  [project.optional-dependencies]
@@ -53,7 +54,11 @@ openai = [
      "openai>=1.62,<2",
      "tiktoken>=0.9,<0.10"
  ]
- transformers = ["transformers>=4.45.2,<5", "torch>=2.5,<3"]
+ transformers = [
+     "transformers>=4.45.2,<5",
+     "torch>=2.5,<3",
+     "accelerate>=0.30.0,<1",
+ ]
  accelerate = ["accelerate"]
  vllm = [
      "vllm>=0.8.5,<0.9",
@@ -87,6 +92,7 @@ eval_framework = "eval_framework.run:run"
  dev = [
      "mypy>=1.10,<2",
      "pytest>=8.3.3,<9",
+     "pytest-mock>=3.14.1",
      "pytest-xdist>=3.6.1,<4",
      "pytest-sugar>1.1,<2",
      "types-pyyaml>=6.0.12.20240917,<7",
@@ -172,3 +178,6 @@ markers = [
  filterwarnings = [
      "ignore::DeprecationWarning:datasets.utils._dill:",
  ]
+ env = [
+     "WANDB_MODE = disabled",
+ ]
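
The `transformers` extra now pulls in `accelerate` alongside `transformers` and `torch`. A minimal install of the released 0.2.3 sdist with that extra (one option; `uv pip` works equally, as the README notes):

```bash
# Installs eval-framework 0.2.3 plus transformers, torch, and the newly added accelerate
pip install "eval-framework[transformers]==0.2.3"
```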

{eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/determined.py

@@ -42,10 +42,12 @@ class Hyperparameters(BaseModel):
      wandb_project: str | None = None
      wandb_entity: str | None = None
      wandb_run_id: str | None = None
+     wandb_upload_results: bool | None = None
      description: str | None = None
      task_args: TaskArgs
      llm_args: dict[str, Any] | None = {}
      extra_task_modules: list[str] | None = None
+     delete_output_dir_after_upload: bool | None = None
 
 
  class DeterminedContext(EvalContext):
@@ -88,7 +90,9 @@ class DeterminedContext(EvalContext):
              "wandb_project",
              "wandb_entity",
              "wandb_run_id",
+             "wandb_upload_results",
              "description",
+             "delete_output_dir_after_upload",
          ]:
              val_cli = getattr(self, name, None)
              val_hparams = getattr(self.hparams, name, None)
@@ -112,13 +116,16 @@ class DeterminedContext(EvalContext):
              if val_cli and val_hparams and val_cli != val_hparams:
                  logger.info(f"CLI argument {name} ({val_cli}) is being overridden by hyperparameters: ({val_hparams}).")
 
-         llm_name = getattr(self.hparams, "llm_name", self.llm_name)
-         judge_model_name = getattr(self.hparams.task_args, "judge_model_name", self.judge_model_name)
+         # Hyperparameters take precedence over core context
+         llm_name = self.hparams.llm_name or self.llm_name
+         judge_model_name = self.hparams.task_args.judge_model_name or self.judge_model_name
 
          llm_class = _load_model(llm_name, models_path=self.models_path)
-         llm_judge_class: type[BaseLLM] | None = None
-         if judge_model_name is not None:
-             llm_judge_class = _load_model(judge_model_name, models_path=self.judge_models_path, info="judge")
+         llm_judge_class: type[BaseLLM] | None = (
+             _load_model(judge_model_name, models_path=self.judge_models_path, info="judge")
+             if judge_model_name
+             else None
+         )
 
          # for all optional hyperparameters, resort to the respective CLI argument if the hyperparameter is not set
          self.config = EvalConfig(
@@ -139,8 +146,11 @@ class DeterminedContext(EvalContext):
              wandb_project=self.hparams.wandb_project or self.wandb_project,
              wandb_entity=self.hparams.wandb_entity or self.wandb_entity,
              wandb_run_id=self.hparams.wandb_run_id or self.wandb_run_id,
+             wandb_upload_results=self.hparams.wandb_upload_results or self.wandb_upload_results,
              batch_size=self.hparams.task_args.batch_size or self.batch_size,
              description=self.hparams.description or self.description,
+             delete_output_dir_after_upload=self.hparams.delete_output_dir_after_upload
+             or self.delete_output_dir_after_upload,
          )
 
          return self
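
The switch from `getattr(...)` to `x or y` changes the merge semantics: a hyperparameter now wins whenever it is truthy and falls back to the CLI/context value otherwise. A small sketch of the rule and its one caveat:

```python
# Sketch of the precedence rule used in DeterminedContext above.
# `or` treats falsy values (False, 0, "") as "unset", so an explicit
# False in the hyperparameters still falls back to the CLI value.
def resolve(hparam_value, cli_value):
    return hparam_value or cli_value

assert resolve(None, "cli-run") == "cli-run"     # unset hyperparameter -> CLI value
assert resolve("hp-run", "cli-run") == "hp-run"  # hyperparameter wins
assert resolve(False, True) is True              # caveat: explicit False is overridden
```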

{eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/eval.py

@@ -61,6 +61,7 @@ class EvalContext(AbstractContextManager):
          wandb_project: str | None = None,
          wandb_entity: str | None = None,
          wandb_run_id: str | None = None,
+         wandb_upload_results: bool | None = None,
          hf_upload_dir: str | None = None,
          hf_upload_repo: str | None = None,
          llm_args: dict[str, Any] | None = None,
@@ -72,6 +73,7 @@ class EvalContext(AbstractContextManager):
          perturbation_type: str | None = None,
          perturbation_probability: float | None = None,
          perturbation_seed: int | None = None,
+         delete_output_dir_after_upload: bool | None = None,
      ) -> None:
          self.llm_name = llm_name
          self.models_path = models_path
@@ -85,6 +87,7 @@ class EvalContext(AbstractContextManager):
          self.wandb_project = wandb_project
          self.wandb_entity = wandb_entity
          self.wandb_run_id = wandb_run_id
+         self.wandb_upload_results = wandb_upload_results
          self.hf_upload_dir = hf_upload_dir
          self.hf_upload_repo = hf_upload_repo
          self.llm_args = llm_args if llm_args is not None else {}
@@ -93,6 +96,7 @@ class EvalContext(AbstractContextManager):
          self.judge_model_args = judge_model_args if judge_model_args is not None else {}
          self.batch_size = batch_size
          self.description = description
+         self.delete_output_dir_after_upload = delete_output_dir_after_upload
 
          if perturbation_type or perturbation_probability is not None:
              perturbation = {
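
Both new flags are plain pass-throughs on `EvalContext`. A hypothetical construction showing them threaded through (illustrative only; other constructor arguments are omitted and keep their defaults, keyword names match the signature above):

```python
from eval_framework.context.eval import EvalContext

# Illustrative only: the remaining EvalContext arguments keep their defaults
ctx = EvalContext(
    llm_name="Smollm135MInstruct",
    wandb_upload_results=True,            # new in 0.2.3: push results to W&B
    delete_output_dir_after_upload=True,  # new in 0.2.3: clean up after upload
)
```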

{eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/local.py

@@ -20,7 +20,7 @@ def _load_model(llm_name: str, models_path: str | PathLike | None, *, info: str
      if models_path is None or "." in llm_name:
          # The llm_name must a a fully qualified module path
          if "." not in llm_name:
-             raise ValueError(f"LLM {info}'{llm_name}' is not a fully qualified module path.")
+             raise ValueError(f"LLM {info} '{llm_name}' is not a fully qualified module path.")
          module_path, llm_class_name = llm_name.rsplit(".", 1)
          module = importlib.import_module(module_path)
          if not hasattr(module, llm_class_name):
@@ -31,7 +31,7 @@ def _load_model(llm_name: str, models_path: str | PathLike | None, *, info: str
      if llm_name not in models_dict:
          if info:
              info = f"{info.strip()} "
-         raise ValueError(f"LLM {info}'{llm_name}' not found in {models_path}.")
+         raise ValueError(f"LLM {info} '{llm_name}' not found in {models_path}.")
      return models_dict[llm_name]
 
 
@@ -58,10 +58,12 @@ class LocalContext(EvalContext):
              wandb_entity=self.wandb_entity,
              wandb_project=self.wandb_project,
              wandb_run_id=self.wandb_run_id,
+             wandb_upload_results=self.wandb_upload_results,
              llm_judge_class=self.llm_judge_class,
              judge_model_args=self.judge_model_args,
              batch_size=self.batch_size,
              description=self.description,
+             delete_output_dir_after_upload=self.delete_output_dir_after_upload,
          )
 
          return self
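
For context, `_load_model` resolves a model in one of two ways: a dotted name is treated as a fully qualified module path and imported directly, while a bare name is looked up in the `--models` file. A sketch using names from the CLI example earlier (whether that exact class lives at that path is an assumption):

```python
from eval_framework.context.local import _load_model

# Bare class name, resolved via the models file
llm_cls = _load_model("Smollm135MInstruct", models_path="src/eval_framework/llm/models.py")

# Fully qualified module path, imported directly (models_path may be None)
llm_cls = _load_model("eval_framework.llm.models.Smollm135MInstruct", models_path=None)
```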

{eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/aleph_alpha.py

@@ -1,6 +1,7 @@
  import asyncio
  import json
  import logging
+ import math
  import os
  import random
  import re
@@ -43,6 +44,7 @@ def safe_json_loads(s: str) -> dict:
  class AlephAlphaAPIModel(BaseLLM):
      LLM_NAME: str
      DEFAULT_FORMATTER: Callable[[], BaseFormatter] | None = None
+     BYTES_PER_TOKEN: float = 4.0  # rule of thumb according to https://platform.openai.com/tokenizer
 
      def __init__(
          self,
@@ -53,6 +55,7 @@ class AlephAlphaAPIModel(BaseLLM):
          max_async_concurrent_requests: int = 32,
          request_timeout_seconds: int = 30 * 60 + 5,
          queue_full_timeout_seconds: int = 30 * 60 + 5,
+         bytes_per_token: float | None = None,
      ) -> None:
          self._formatter: BaseFormatter
          if formatter is None:
@@ -67,6 +70,12 @@ class AlephAlphaAPIModel(BaseLLM):
          self.request_timeout_seconds = request_timeout_seconds
          self.queue_full_timeout_seconds = queue_full_timeout_seconds
          self._validate_model_availability()
+         # set bytes_per_token_scalar for non-standard models
+         if bytes_per_token is not None and bytes_per_token <= 0:
+             raise ValueError("bytes_per_token must be positive")
+         self.bytes_per_token_scalar = (
+             4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
+         )
 
      def _validate_model_availability(self) -> None:
          """
@@ -250,11 +259,14 @@ class AlephAlphaAPIModel(BaseLLM):
 
          requests = []
 
+         # Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
+         scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
+
          for single_messages in messages:
              requests.append(
                  CompletionRequest(
                      prompt=Prompt.from_text(self._formatter.format(single_messages, output_mode="string")),
-                     maximum_tokens=max_tokens,
+                     maximum_tokens=scaled_max_tokens,
                      stop_sequences=stop_sequences,
                      temperature=effective_temperature,
                  )
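
The arithmetic here: `bytes_per_token_scalar = 4.0 / bytes_per_token`, so models whose tokens encode fewer bytes than the 4-byte rule of thumb get a proportionally larger token budget. A worked example:

```python
import math

# A model averaging 2 bytes per token gets a 2x budget, so a byte-oriented
# max_tokens limit yields roughly the same amount of generated text.
BYTES_PER_TOKEN = 4.0  # class default (rule of thumb)
bytes_per_token = 2.0  # constructor override for a non-standard model
scalar = BYTES_PER_TOKEN / bytes_per_token  # 2.0
max_tokens = 100
scaled_max_tokens = math.ceil(max_tokens * scalar)
assert scaled_max_tokens == 200
```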

eval_framework-0.2.3/src/eval_framework/llm/base.py

@@ -0,0 +1,180 @@
+ from abc import ABC, abstractmethod
+ from collections.abc import Sequence
+ from pathlib import Path
+ from typing import Any
+
+ from eval_framework.shared.types import RawCompletion, RawLoglikelihood
+ from eval_framework.tasks.base import Sample
+ from template_formatting.formatter import BaseFormatter, Message
+
+
+ class BaseLLM(ABC):
+     @property
+     def name(self) -> str:
+         """
+         This property is used to name the results folder and identify the eval results.
+         Overwrite this property in the subclass with e.g. the checkpoint name/huggingface model name."""
+         return self.__class__.__name__
+
+     @abstractmethod
+     def generate_from_messages(
+         self,
+         messages: list[Sequence[Message]],
+         stop_sequences: list[str] | None = None,
+         max_tokens: int | None = None,
+         temperature: float | None = None,
+     ) -> list[RawCompletion]:
+         """
+         stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
+         extended with the properties of the model. This includes but is not limited to the stop tokens
+         by the evaluated checkpoint (e.g. <|eot_id|> for an instruction finetuned Llama3.1, <|endoftext|>
+         for a pretrained Llama3.1).
+
+         This function is expected to raise errors which are caught and reported when running the eval.
+         Please also make sure to raise an error in case of sequence length issues. We expect to always
+         raise an error if something impedes the expected completion of a task.
+
+         Important! The completion is expected to be detokenized and to NOT contain special tokens.
+
+         Returns: List[RawCompletion]
+         """
+         raise NotImplementedError
+
+     def generate_from_samples(
+         self,
+         samples: list[Sample],
+         stop_sequences: list[str] | None = None,
+         max_tokens: int | None = None,
+         temperature: float | None = None,
+     ) -> list[RawCompletion]:
+         """
+         stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
+         extended with the properties of the model. This includes but is not limited to the stop tokens
+         by the evaluated checkpoint (e.g. <|eot_id|> for an instruction finetuned Llama3.1, <|endoftext|>
+         for a pretrained Llama3.1).
+
+         This function is expected to raise errors which are caught and reported when running the eval.
+         Please also make sure to raise an error in case of sequence length issues. We expect to always
+         raise an error if something impedes the expected completion of a task.
+
+         Important! The completion is expected to be detokenized and to NOT contain special tokens.
+
+         Returns: List[RawCompletion]
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
+         """
+         This function is expected to raise errors which are caught and reported when running the eval.
+         Please also make sure to raise an error in case of sequence length issues. We expect to always
+         raise an error if something prevents the expected completion of a task.
+         """
+         raise NotImplementedError
+
+     def generate(
+         self,
+         samples: list[Sample],
+         stop_sequences: list[str] | None = None,
+         max_tokens: int | None = None,
+         temperature: float | None = None,
+     ) -> list[RawCompletion]:
+         """Generates a model response for each sample.
+
+         Uses 'generate_from_samples' to generate responses if implemented,
+         otherwise falls back to 'generate_from_messages'.
+         """
+         try:
+             return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature)
+         except NotImplementedError:
+             messages: list[Sequence[Message]] = [sample.messages for sample in samples]
+             return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature)
+
+     def post_process_completion(self, completion: str, sample: Sample) -> str:
+         """
+         Model-specific post-processing of generated completions.
+
+         Override this method to apply model-specific cleanup or transformations
+         (e.g., removing specific artifacts such as reasoning traces, handling special tokens).
+
+         Args:
+             completion: The raw completion string from the model
+             sample: The sample that was used to generate the completion
+
+         Returns:
+             The post-processed completion string
+         """
+         return completion
+
+     def __del__(self) -> None:
+         """
+         Method for custom resource cleanup (particularly GPUs)
+         """
+         pass
+
+     def _get_final_checkpoint(
+         self, checkpoint_path: str | Path | None = None, model_name: str | None = None, artifact_name: str | None = None
+     ) -> tuple[str | Path | None, str | None]:
+         if (num_provided := sum(x is not None for x in [checkpoint_path, model_name, artifact_name])) == 0:
+             if not getattr(self, "LLM_NAME", ""):
+                 raise ValueError("Either LLM_NAME, checkpoint_path, model_name, or artifact_name must be provided.")
+             return None, None  # no argument given, so will use the LLM_NAME of the class
+         elif num_provided > 1:
+             raise ValueError("At most one of `checkpoint_path`, `model_name`, or `artifact_name` must be provided.")
+
+         elif checkpoint_path is not None:
+             return checkpoint_path, str(checkpoint_path)
+
+         elif model_name is not None:
+             return model_name, model_name
+
+         else:
+             from eval_framework.utils.file_ops import WandbFs
+
+             assert artifact_name is not None
+             artifact_base, version = artifact_name.split(":", 1) if ":" in artifact_name else (artifact_name, "latest")
+             with WandbFs() as wandb_fs:
+                 self.artifact = wandb_fs.get_artifact(artifact_base, version)  # self.artifact being read in main()
+                 wandb_fs.download_artifact(self.artifact)
+                 file_root = wandb_fs.find_hf_checkpoint_root_from_path_list()
+             if file_root is None:
+                 raise ValueError(f"Could not find HuggingFace checkpoint in artifact {artifact_base}:{version}")
+             return file_root, artifact_name
+
+     def _get_final_formatter(
+         self,
+         formatter: BaseFormatter | None = None,
+         formatter_name: str | None = None,
+         formatter_kwargs: dict[str, Any] | None = None,
+     ) -> BaseFormatter | None:
+         if (num_provided := sum(x is not None for x in [formatter, formatter_name])) == 0:
+             return None  # none given, so will use the default of the class
+         elif num_provided > 1:
+             raise ValueError("At most one of `formatter` or `formatter_name` must be provided.")
+
+         if formatter:
+             if formatter_kwargs:
+                 raise ValueError("Cannot provide `formatter_kwargs` when `formatter` is provided.")
+             return formatter
+         elif formatter_name:
+             kwargs = formatter_kwargs or {}
+             match formatter_name:
+                 case "Llama3Formatter":
+                     from template_formatting.formatter import Llama3Formatter
+
+                     return Llama3Formatter()
+                 case "MistralFormatter" | "MagistralFormatter":
+                     from eval_framework.llm.mistral import MagistralFormatter
+
+                     return MagistralFormatter(**kwargs)
+                 case "ConcatFormatter":
+                     from template_formatting.formatter import ConcatFormatter
+
+                     return ConcatFormatter()
+                 case "HFFormatter":
+                     from template_formatting.formatter import HFFormatter
+
+                     return HFFormatter(**kwargs)
+                 case _:
+                     raise ValueError(f"Unsupported formatter: {formatter_name}.")
+         return None
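
A minimal sketch of a concrete subclass against the interface above; the backend calls are left unimplemented because `RawCompletion`/`RawLoglikelihood` construction is backend-specific, and the `</think>` delimiter in the override is purely illustrative:

```python
from eval_framework.llm.base import BaseLLM


class MyLLM(BaseLLM):
    def generate_from_messages(self, messages, stop_sequences=None, max_tokens=None, temperature=None):
        raise NotImplementedError  # call your inference backend and wrap results in RawCompletion

    def logprobs(self, samples):
        raise NotImplementedError  # score each choice and wrap results in RawLoglikelihood

    def post_process_completion(self, completion: str, sample) -> str:
        # model-specific cleanup, e.g. dropping a reasoning trace before grading
        return completion.split("</think>")[-1].strip()
```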