eval-framework 0.2.7__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. {eval_framework-0.2.7 → eval_framework-0.2.9}/PKG-INFO +5 -4
  2. {eval_framework-0.2.7 → eval_framework-0.2.9}/README.md +3 -3
  3. {eval_framework-0.2.7 → eval_framework-0.2.9}/pyproject.toml +4 -1
  4. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/context/determined.py +3 -0
  5. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/context/eval.py +2 -1
  6. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/context/local.py +1 -0
  7. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/math_reasoning_completion.py +10 -9
  8. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/response_generator.py +25 -6
  9. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/run.py +8 -0
  10. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/math_reasoning.py +19 -0
  11. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/eval_config.py +3 -1
  12. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/task_names.py +1 -0
  13. {eval_framework-0.2.7 → eval_framework-0.2.9}/LICENSE +0 -0
  14. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/__init__.py +0 -0
  15. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/base_config.py +0 -0
  16. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/context/__init__.py +0 -0
  17. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/evaluation_generator.py +0 -0
  18. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/exceptions.py +0 -0
  19. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  20. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  21. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  22. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  23. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  24. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/__init__.py +0 -0
  25. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/aleph_alpha.py +0 -0
  26. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/base.py +0 -0
  27. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/huggingface.py +0 -0
  28. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/mistral.py +0 -0
  29. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/models.py +0 -0
  30. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/openai.py +0 -0
  31. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/vllm.py +0 -0
  32. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/logger.py +0 -0
  33. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/main.py +0 -0
  34. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/__init__.py +0 -0
  35. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/base.py +0 -0
  36. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/__init__.py +0 -0
  37. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  38. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  39. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/bleu.py +0 -0
  40. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/chrf.py +0 -0
  41. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
  42. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  43. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/comet.py +0 -0
  44. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  45. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  46. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  47. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  48. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/f1.py +0 -0
  49. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  50. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  51. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  52. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/json_format.py +0 -0
  53. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  54. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/length_control.py +0 -0
  55. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  56. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  57. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/repetition.py +0 -0
  58. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  59. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  60. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  61. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  62. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  63. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/ter.py +0 -0
  64. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  65. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  66. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  67. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/__init__.py +0 -0
  68. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/base.py +0 -0
  69. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  70. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  71. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  72. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  73. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  74. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  75. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  76. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  77. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  78. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  79. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  80. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  81. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  82. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  83. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  84. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  85. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  86. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  87. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  88. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  89. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  90. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  91. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  92. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  93. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  94. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/utils.py +0 -0
  95. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  96. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  97. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  98. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  99. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  100. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  101. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  102. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/py.typed +0 -0
  103. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/__init__.py +0 -0
  104. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/base.py +0 -0
  105. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  106. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/result_processor.py +0 -0
  107. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  108. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/run_direct.py +0 -0
  109. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/shared/types.py +0 -0
  110. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/__init__.py +0 -0
  111. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/base.py +0 -0
  112. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  113. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  114. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  115. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  116. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  117. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  118. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  119. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  120. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  121. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  122. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  123. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  124. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  125. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  126. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
  127. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  128. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  129. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  130. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  131. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  132. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  133. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  134. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  135. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  136. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  137. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  138. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  139. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  140. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  141. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  142. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  143. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  144. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  145. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
  146. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  147. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  148. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  149. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  150. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  151. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  152. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  153. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  154. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  155. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/perturbation.py +0 -0
  156. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/registry.py +0 -0
  157. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/task_loader.py +0 -0
  158. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/utils.py +0 -0
  159. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/constants.py +0 -0
  160. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/file_ops.py +0 -0
  161. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/generate_task_docs.py +0 -0
  162. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/helpers.py +0 -0
  163. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/logging.py +0 -0
  164. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/packaging.py +0 -0
  165. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/tqdm_handler.py +0 -0
  166. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/README.md +0 -0
  167. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/__init__.py +0 -0
  168. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/formatter.py +0 -0
  169. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/mistral_formatter.py +0 -0
  170. {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.2.7
+Version: 0.2.9
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License: Apache License
@@ -235,6 +235,7 @@ Requires-Dist: python-iso639>=2025.2.18
 Requires-Dist: wandb>=0.23.0,<1
 Requires-Dist: boto3>=1.40.54,<2
 Requires-Dist: numpy>=1.26.4
+Requires-Dist: antlr4-python3-runtime==4.11.0
 Requires-Dist: accelerate ; extra == 'accelerate'
 Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
@@ -283,7 +284,7 @@ Description-Content-Type: text/markdown
 [![Docs](https://img.shields.io/badge/docs-online-blue)](https://aleph-alpha-research.github.io/eval-framework/)
 [![Stars](https://img.shields.io/github/stars/Aleph-Alpha-Research/eval-framework)](https://github.com/Aleph-Alpha-Research/eval-framework/stargazers)

-![eval-framework](docs/eval-framework.png "https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/refs/heads/main/docs/eval-framework.png")
+![eval-framework](https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/refs/heads/main/docs/eval-framework.png)

 </div>

@@ -543,6 +544,6 @@ This project has received funding from the European Union’s Digital Europe Pro
 The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.

 <p align="center">
-<img src="docs/OELLM_1.png" alt="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
-<img src="docs/OELLM_2.png" alt="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
+<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
+<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
 </p>
@@ -13,7 +13,7 @@
 [![Docs](https://img.shields.io/badge/docs-online-blue)](https://aleph-alpha-research.github.io/eval-framework/)
 [![Stars](https://img.shields.io/github/stars/Aleph-Alpha-Research/eval-framework)](https://github.com/Aleph-Alpha-Research/eval-framework/stargazers)

-![eval-framework](docs/eval-framework.png "https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/refs/heads/main/docs/eval-framework.png")
+![eval-framework](https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/refs/heads/main/docs/eval-framework.png)

 </div>

@@ -273,6 +273,6 @@ This project has received funding from the European Union’s Digital Europe Pro
 The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.

 <p align="center">
-<img src="docs/OELLM_1.png" alt="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
-<img src="docs/OELLM_2.png" alt="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
+<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
+<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
 </p>
@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.2.7"
+version = "0.2.9"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -42,6 +42,9 @@ dependencies = [
     "wandb>=0.23.0,<1",
     "boto3>=1.40.54,<2",
     "numpy>=1.26.4",
+    # is a dependency of sympy, but not explicitly listed in the requirements.txt
+    # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
+    "antlr4-python3-runtime==4.11.0",
 ]

 [project.optional-dependencies]
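Context for the pin: sympy's LaTeX parser is backed by a generated ANTLR parser and imports the antlr4 runtime lazily, so a missing or version-mismatched runtime only surfaces at call time. A minimal sketch of the call that depends on it (illustrative, not from the package):

```python
# Minimal sketch: sympy.parsing.latex.parse_latex needs antlr4-python3-runtime
# at call time, even though sympy itself does not declare it as a dependency.
from sympy.parsing.latex import parse_latex

expr = parse_latex(r"\frac{x}{2} + 1")  # ImportError here if the runtime is absent
print(expr)  # x/2 + 1
```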
@@ -31,6 +31,7 @@ class TaskArgs(BaseModel):
     task_subjects: list[str] | None = None
     hf_revision: str | None = None
     perturbation_config: PerturbationConfig | None = None
+    repeats: int | None = None


 class Hyperparameters(BaseModel):
@@ -110,6 +111,7 @@ class DeterminedContext(EvalContext):
             "judge_model_name",
             "judge_model_args",
             "perturbation_config",
+            "repeats",
         ]:
             val_cli = getattr(self, name, None)
             val_hparams = getattr(self.hparams.task_args, name, None)
@@ -152,6 +154,7 @@ class DeterminedContext(EvalContext):
             randomize_judge_order=self.randomize_judge_order,
             delete_output_dir_after_upload=self.hparams.delete_output_dir_after_upload
             or self.delete_output_dir_after_upload,
+            repeats=self.hparams.task_args.repeats or self.repeats,
         )

         return self
@@ -75,6 +75,7 @@ class EvalContext(AbstractContextManager):
         perturbation_seed: int | None = None,
         randomize_judge_order: bool = False,
         delete_output_dir_after_upload: bool | None = None,
+        repeats: int | None = None,
     ) -> None:
         self.llm_name = llm_name
         self.models_path = models_path
@@ -99,7 +100,7 @@ class EvalContext(AbstractContextManager):
         self.description = description
         self.randomize_judge_order = randomize_judge_order
         self.delete_output_dir_after_upload = delete_output_dir_after_upload
-
+        self.repeats = repeats
         if perturbation_type or perturbation_probability is not None:
             perturbation = {
                 "type": perturbation_type,
@@ -65,6 +65,7 @@ class LocalContext(EvalContext):
             description=self.description,
             randomize_judge_order=self.randomize_judge_order,
             delete_output_dir_after_upload=self.delete_output_dir_after_upload,
+            repeats=self.repeats,
         )

         return self
@@ -204,10 +204,15 @@ class MathReasoningCompletion(BaseMetric[Completion]):
         timeout = 10
         # latex parse all ingested ground truth values for math reasoning
         for gt in response.ground_truth_list:
+            if gt is None:
+                continue
             signal.signal(signal.SIGALRM, timeout_handler)  # Set timeout signal
             signal.alarm(timeout)  # Set timeout duration
             try:
-                gt_parsed = parse_latex(gt)  # NOTE: parses f(x)=0,\quadf(x)=x-1,\quadf(x)=-x+1 to Eq(f(x), 0) ONLY
+                gt_normalized = self.normalize_expression(gt)
+                gt_parsed = parse_latex(
+                    gt_normalized
+                )  # NOTE: parses f(x)=0,\quadf(x)=x-1,\quadf(x)=-x+1 to Eq(f(x), 0) ONLY
                 ground_truths.append(gt_parsed)
             except Exception:
                 ground_truths.append(gt)
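The `signal` calls in this hunk implement a Unix-only watchdog around `parse_latex`, which can hang on pathological LaTeX. A self-contained sketch of that pattern (`parse_with_timeout` is a hypothetical helper, not part of the package):

```python
import signal

from sympy.parsing.latex import parse_latex


def _timeout_handler(signum, frame):  # fires when the alarm expires
    raise TimeoutError("parse_latex took too long")


def parse_with_timeout(latex_str: str, timeout: int = 10):
    """Parse LaTeX, falling back to the raw string on timeout or parse error."""
    signal.signal(signal.SIGALRM, _timeout_handler)  # Unix-only; no SIGALRM on Windows
    signal.alarm(timeout)  # arm the watchdog
    try:
        return parse_latex(latex_str)
    except Exception:
        return latex_str  # keep the raw string for a later string comparison
    finally:
        signal.alarm(0)  # always disarm
```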
@@ -229,15 +234,11 @@ class MathReasoningCompletion(BaseMetric[Completion]):
                 )
             ]
         else:
-            # fall back to string comparison
-            # ground truth can be list or str, we have str comparisons
-            assert isinstance(response.ground_truth, str)
-            str_is_correct = self._is_str_correct(normalized_response, response.ground_truth)
-            return [
-                MetricResult(
-                    metric_name=self.NAME, value=float(str_is_correct), higher_is_better=True, error=response.error
-                )
+            normalized_ground_truths = [
+                self.normalize_expression(gt) for gt in response.ground_truth_list if gt is not None
             ]
+            res = self._any_str_correct([normalized_response], normalized_ground_truths)
+            return [MetricResult(metric_name=self.NAME, value=float(res), higher_is_better=True, error=response.error)]

     def _any_str_correct(self, response_list: list, ground_truths: list) -> bool:
         """
@@ -1,10 +1,9 @@
 import logging
 import time
 import traceback
-from collections.abc import Callable
+from collections.abc import Callable, Iterable
 from datetime import UTC, datetime
 from functools import partial
-from typing import Any

 from eval_framework.tasks.registry import get_task

@@ -14,6 +13,8 @@ except ImportError:
     get_cluster_info = None  # type: ignore[assignment]


+from typing import Any
+
 from tqdm import tqdm

 from eval_framework import __version__ as eval_framework_version
@@ -234,18 +235,21 @@ class ResponseGenerator:
         # If samples_batch_size = 1, samples are run sequentially; in any case, we return here after finishing each
         # individual batch to honor preemption requests and save cached results.
         samples_batch_size = self.config.batch_size
+        repeats = self.config.repeats

         # Calculate total samples for progress bar - use num_samples or iterate to count
-        total_num_samples = self.num_samples
-        if total_num_samples is None:
+        if self.num_samples is None:
             # Count samples by iterating (this might be expensive for large datasets)
-            total_num_samples = sum(1 for _ in self.task.iterate_samples(None))
+            total_num_samples = sum(1 for _ in self.task.iterate_samples(None)) * repeats
+        else:
+            total_num_samples = self.num_samples * repeats

         samples_batch: list[Sample] = []
         with tqdm(
             total=total_num_samples, desc=f"Processing {self.response_type.value}", disable=get_disable_bar_flag()
         ) as pbar:
-            for i, sample in enumerate(self.task.iterate_samples(self.num_samples)):
+            samples = self.task.iterate_samples(self.num_samples)
+            for i, sample in enumerate(repeat_samples(samples, repeats)):
                 subject = f" - Subject: {sample.subject}"
                 sample_index = i + 1

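The progress-bar total is simple scaling: the expanded stream is `repeats` times longer in both branches. A worked check:

```python
# 30 samples repeated 4 times yields 120 expanded samples, whichever branch
# computes the total.
num_samples, repeats = 30, 4
total_num_samples = num_samples * repeats
assert total_num_samples == 120
```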
@@ -330,6 +334,7 @@ class ResponseGenerator:
             "llm_name",
             "llm_args",
             "perturbation_config",
+            "repeats",
         ]
         for key in keys:
             if loaded_metadata[key] != current_metadata[key]:
@@ -349,3 +354,17 @@ class ResponseGenerator:
         logger.info("Completions generated and saved.")

         return responses, preempted
+
+
+def repeat_samples(samples: Iterable[Sample], repeats: int) -> Iterable[Sample]:
+    """Flatten repeats into a single stream of samples.
+
+    After expansion original sample indices do not point to the same sample anymore. The
+    original sample can be recovered by `original_index = expanded_index // repeats`.
+    """
+    for sample in samples:
+        base_id = sample.id * repeats
+        for repeat_idx in range(repeats):
+            repeated_sample = sample.model_copy()
+            repeated_sample.id = base_id + repeat_idx
+            yield repeated_sample
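The id scheme is unique and reversible; a quick illustration with plain integers (no framework types assumed):

```python
# With repeats=3, original id 7 expands to ids 21, 22, 23; divmod recovers
# both the original id and the repeat index.
repeats = 3
expanded_ids = [7 * repeats + r for r in range(repeats)]  # [21, 22, 23]
for expanded_id in expanded_ids:
    original_id, repeat_idx = divmod(expanded_id, repeats)
    assert original_id == 7
    print(expanded_id, "->", original_id, "repeat", repeat_idx)
```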
@@ -77,6 +77,13 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--num-fewshot", type=int, required=False, default=0, help="The number of fewshot examples to use."
     )
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        required=False,
+        default=1,
+        help="The number of times to repeat each sample in the evaluation.",
+    )
     parser.add_argument("--task-name", type=str, required=False, help="The name of the task to evaluate.")
     parser.add_argument(
         "--randomize-judge-order",
@@ -319,6 +326,7 @@ def run_with_kwargs(kwargs: dict) -> None:
         num_samples=kwargs["num_samples"],
         max_tokens=kwargs["max_tokens"],
         num_fewshot=kwargs["num_fewshot"],
+        repeats=kwargs["repeats"],
         task_name=kwargs["task_name"],
         task_subjects=kwargs["task_subjects"],
         hf_revision=kwargs["hf_revision"],
@@ -355,6 +355,25 @@ class AIME2024(MATHReasoning):
         return item["answer"].lstrip("0")  # valid answers in this dataset range from 0-999 and have leading zeros


+class AIME2025(AIME2024):
+    """AIME 2025 dataset: https://huggingface.co/datasets/math-ai/aime25
+
+    This dataset contains a single test split of 30 questions.
+    Data contains
+    problem | answer | id
+
+    pass@1 evaluation
+    """
+
+    NAME = "AIME2025"
+    DATASET_PATH = "math-ai/aime25"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "test"
+
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
+        return item["answer"]
+
+
 class MATH500(MATHReasoning):
     """MATH500 dataset: https://huggingface.co/datasets/HuggingFaceH4/MATH-500

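A hypothetical sanity check of the dataset shape the docstring claims (assumes the Hugging Face `datasets` library; the expected size and columns come from the docstring, not independent verification):

```python
from datasets import load_dataset

ds = load_dataset("math-ai/aime25", split="test")
print(len(ds), ds.column_names)  # per the docstring: 30 rows with problem/answer/id
```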
@@ -54,7 +54,9 @@ class EvalConfig(BaseConfig):
     save_intermediate_results: Annotated[bool, BeforeValidator(lambda v: True if v is None else v)] = True
     save_logs: Annotated[bool, BeforeValidator(lambda v: True if v is None else v)] = True
     delete_output_dir_after_upload: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False
-
+    # how many times to repeat a single sample
+    # can be used to reduce variance of tasks with low number of samples, e.g. AIME24
+    repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1
     # Adding a new member? Remember to update KEYS_UNRELATED_TO_RESULTS if it doesn't impact eval results.

     @property
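The validator ordering matters here: `BeforeValidator` rewrites `None` to 1 before the `ge=1` bound is checked, so an unset value never fails validation. A standalone sketch (assuming pydantic v2, which this `Annotated` style implies):

```python
from typing import Annotated

from pydantic import BaseModel, BeforeValidator, Field


class RepeatsDemo(BaseModel):
    # None is coerced to 1 before int parsing; values below 1 fail the ge=1 bound
    repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1


print(RepeatsDemo(repeats=None).repeats)  # 1
print(RepeatsDemo(repeats=4).repeats)  # 4
# RepeatsDemo(repeats=0) raises a ValidationError (ge=1)
```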
@@ -18,6 +18,7 @@ class TaskNameEnum(Enum):
 def register_all_tasks() -> None:
     """Register all the benchmark tasks with the eval framework."""
     register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2024")
+    register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2025")
     register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC")
     register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_IDK")
     register_lazy_task("eval_framework.tasks.benchmarks.arc_de.ARC_DE")