eval-framework 0.3.8__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. {eval_framework-0.3.8 → eval_framework-0.5.1}/PKG-INFO +11 -15
  2. {eval_framework-0.3.8 → eval_framework-0.5.1}/README.md +0 -1
  3. {eval_framework-0.3.8 → eval_framework-0.5.1}/pyproject.toml +17 -38
  4. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/evaluation_generator.py +5 -9
  5. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/openai.py +2 -2
  6. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/base.py +1 -1
  7. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/code_assertion.py +4 -14
  8. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/response_generator.py +8 -14
  9. eval_framework-0.5.1/src/eval_framework/tasks/__init__.py +12 -0
  10. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/base.py +3 -3
  11. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/aidanbench.py +2 -2
  12. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores200.py +3 -3
  13. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores_plus.py +1 -2
  14. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gsm8k.py +30 -1
  15. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py +34 -40
  16. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/squad.py +26 -0
  17. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/triviaqa.py +32 -1
  18. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/wmt.py +2 -2
  19. {eval_framework-0.3.8/src/eval_framework/tasks/benchmarks → eval_framework-0.5.1/src/eval_framework/tasks}/dataset_revisions.py +30 -7
  20. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/eval_config.py +2 -3
  21. eval_framework-0.5.1/src/eval_framework/tasks/registry.py +301 -0
  22. eval_framework-0.5.1/src/eval_framework/tasks/task-dataset-revisions.json +62 -0
  23. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/task_names.py +4 -122
  24. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/task_style.py +64 -2
  25. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/generate_task_docs.py +6 -12
  26. eval_framework-0.3.8/src/eval_framework/metrics/completion/comet.py +0 -56
  27. eval_framework-0.3.8/src/eval_framework/tasks/__init__.py +0 -6
  28. eval_framework-0.3.8/src/eval_framework/tasks/benchmarks/task-dataset-revisions.json +0 -179
  29. eval_framework-0.3.8/src/eval_framework/tasks/registry.py +0 -186
  30. {eval_framework-0.3.8 → eval_framework-0.5.1}/LICENSE +0 -0
  31. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/__init__.py +0 -0
  32. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/base_config.py +0 -0
  33. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/context/__init__.py +0 -0
  34. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/context/determined.py +0 -0
  35. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/context/eval.py +0 -0
  36. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/context/local.py +0 -0
  37. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/exceptions.py +0 -0
  38. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/drop_process_results.py +0 -0
  39. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  40. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  41. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  42. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  43. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  44. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/__init__.py +0 -0
  45. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/aleph_alpha.py +0 -0
  46. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/base.py +0 -0
  47. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/huggingface.py +0 -0
  48. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/mistral.py +0 -0
  49. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/models.py +0 -0
  50. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/vllm.py +0 -0
  51. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/vllm_local_server.py +0 -0
  52. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/logger.py +0 -0
  53. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/main.py +0 -0
  54. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/__init__.py +0 -0
  55. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
  56. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
  57. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/__init__.py +0 -0
  58. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
  59. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  60. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/bleu.py +0 -0
  61. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/chrf.py +0 -0
  62. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
  63. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  64. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  65. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  66. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
  67. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  68. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/f1.py +0 -0
  69. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  70. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  71. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  72. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/json_format.py +0 -0
  73. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  74. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/length_control.py +0 -0
  75. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
  76. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  77. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  78. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
  79. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  80. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  81. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/repetition.py +0 -0
  82. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  83. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  84. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  85. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  86. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  87. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/ter.py +0 -0
  88. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  89. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
  90. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  91. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/__init__.py +0 -0
  92. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/base.py +0 -0
  93. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  94. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  95. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  96. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  97. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  98. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  99. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  100. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  101. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  102. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  103. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  104. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  105. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  106. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  107. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  108. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  109. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  110. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  111. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  112. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  113. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  114. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  115. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  116. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  117. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  118. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/utils.py +0 -0
  119. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
  120. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
  121. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  122. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
  123. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  124. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  125. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  126. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  127. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/py.typed +0 -0
  128. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/__init__.py +0 -0
  129. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/base.py +0 -0
  130. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  131. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/result_processor.py +0 -0
  132. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  133. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/run.py +0 -0
  134. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/run_direct.py +0 -0
  135. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/shared/types.py +0 -0
  136. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/suite.py +0 -0
  137. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
  138. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
  139. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  140. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
  141. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  142. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  143. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  144. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
  145. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  146. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  147. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  148. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
  149. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
  150. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  151. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
  152. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  153. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  154. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
  155. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
  156. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  157. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  158. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  159. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  160. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  161. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  162. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
  163. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
  164. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
  165. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  166. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
  167. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
  168. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
  169. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  170. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  171. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  172. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
  173. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  174. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
  175. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
  176. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  177. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  178. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
  179. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  180. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  181. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
  182. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  183. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  184. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/perturbation.py +0 -0
  185. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/task_loader.py +0 -0
  186. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/utils.py +0 -0
  187. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/constants.py +0 -0
  188. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/file_ops.py +0 -0
  189. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/helpers.py +0 -0
  190. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/logging.py +0 -0
  191. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/packaging.py +0 -0
  192. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/tqdm_handler.py +0 -0
  193. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/README.md +0 -0
  194. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/__init__.py +0 -0
  195. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/formatter.py +0 -0
  196. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/mistral_formatter.py +0 -0
  197. {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: eval-framework
3
- Version: 0.3.8
3
+ Version: 0.5.1
4
4
  Summary: Evaluation Framework
5
5
  Author: Aleph Alpha Research
6
6
  License: Apache License
@@ -212,16 +212,15 @@ Classifier: Programming Language :: Python :: 3 :: Only
212
212
  Classifier: Topic :: Software Development :: Libraries
213
213
  Classifier: Typing :: Typed
214
214
  Requires-Dist: pyyaml>=6.0.3,<7
215
- Requires-Dist: xmltodict>=0.15.1,<0.16
215
+ Requires-Dist: xmltodict>=1.0.4,<1.1
216
216
  Requires-Dist: pydantic>=2.13.4,<3
217
- Requires-Dist: datasets>=4.8.5,<5
217
+ Requires-Dist: datasets>=5.0.0,<6
218
218
  Requires-Dist: sacrebleu>=2.6.0,<3
219
- Requires-Dist: pycountry>=24.6.1,<25
219
+ Requires-Dist: pycountry>=26.2.16,<27
220
220
  Requires-Dist: nltk>=3.9.4,<4
221
221
  Requires-Dist: python-dotenv>=1.2.2,<2
222
222
  Requires-Dist: lingua-language-detector>=2.2.0,<3
223
223
  Requires-Dist: google-crc32c>=1.8.0,<2
224
- Requires-Dist: kubernetes>=31.0.0,<32
225
224
  Requires-Dist: langdetect>=1.0.9,<2
226
225
  Requires-Dist: spacy>=3.8.14,<4
227
226
  Requires-Dist: jsonschema>=4.26.0,<5
@@ -232,18 +231,17 @@ Requires-Dist: llm-sandbox[docker]==0.3.39
232
231
  Requires-Dist: jsonlines>=4,<5
233
232
  Requires-Dist: lxml>=6.1.1,<7
234
233
  Requires-Dist: python-iso639>=2026.4.20
235
- Requires-Dist: wandb>=0.27.0,<1
236
- Requires-Dist: boto3>=1.43.18,<2
237
- Requires-Dist: numpy>=1.26.4
234
+ Requires-Dist: wandb>=0.27.2,<1
235
+ Requires-Dist: boto3>=1.43.19,<2
236
+ Requires-Dist: numpy>=2.2.6
238
237
  Requires-Dist: antlr4-python3-runtime==4.11.0
239
- Requires-Dist: scipy>=1.17.1,<2
238
+ Requires-Dist: scipy>=1.18.0,<2
240
239
  Requires-Dist: accelerate ; extra == 'accelerate'
241
- Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
240
+ Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
242
241
  Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
243
- Requires-Dist: unbabel-comet>=2.2.7,<3 ; extra == 'comet'
244
242
  Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
245
243
  Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
246
- Requires-Dist: mistral-common>=1.11.2,<2 ; extra == 'mistral'
244
+ Requires-Dist: mistral-common>=1.11.3,<2 ; extra == 'mistral'
247
245
  Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
248
246
  Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
249
247
  Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
@@ -253,7 +251,7 @@ Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
253
251
  Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
254
252
  Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
255
253
  Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
256
- Requires-Dist: accelerate>=0.34.2,<1 ; extra == 'transformers'
254
+ Requires-Dist: accelerate>=1.14.0,<2 ; extra == 'transformers'
257
255
  Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
258
256
  Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
259
257
  Requires-Python: >=3.12, <3.13
@@ -261,7 +259,6 @@ Project-URL: repository, https://github.com/Aleph-Alpha-Research/eval-framework
261
259
  Provides-Extra: accelerate
262
260
  Provides-Extra: all
263
261
  Provides-Extra: api
264
- Provides-Extra: comet
265
262
  Provides-Extra: determined
266
263
  Provides-Extra: mistral
267
264
  Provides-Extra: openai
@@ -319,7 +316,6 @@ pip install eval_framework
319
316
 
320
317
  There are optional extras available to unlock specific features of the library:
321
318
  - `api` for inference using the aleph-alpha client.
322
- - `comet` for the COMET metric.
323
319
  - `determined` for running jobs via determined.
324
320
  - `mistral` for inference on Mistral models.
325
321
  - `transformers` for inference using the transformers library.
@@ -47,7 +47,6 @@ pip install eval_framework
47
47
 
48
48
  There are optional extras available to unlock specific features of the library:
49
49
  - `api` for inference using the aleph-alpha client.
50
- - `comet` for the COMET metric.
51
50
  - `determined` for running jobs via determined.
52
51
  - `mistral` for inference on Mistral models.
53
52
  - `transformers` for inference using the transformers library.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "eval-framework"
3
- version = "0.3.8"
3
+ version = "0.5.1"
4
4
  description = "Evaluation Framework"
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -19,16 +19,15 @@ classifiers = [
19
19
  ]
20
20
  dependencies = [
21
21
  "pyyaml>=6.0.3,<7",
22
- "xmltodict>=0.15.1,<0.16",
22
+ "xmltodict>=1.0.4,<1.1",
23
23
  "pydantic>=2.13.4,<3",
24
- "datasets>=4.8.5,<5",
24
+ "datasets>=5.0.0,<6",
25
25
  "sacrebleu>=2.6.0,<3",
26
- "pycountry>=24.6.1,<25",
26
+ "pycountry>=26.2.16,<27",
27
27
  "nltk>=3.9.4,<4",
28
28
  "python-dotenv>=1.2.2,<2",
29
29
  "lingua-language-detector>=2.2.0,<3",
30
30
  "google-crc32c>=1.8.0,<2",
31
- "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
32
31
  "langdetect>=1.0.9,<2", # required by the original ifeval implementation
33
32
  "spacy>=3.8.14,<4",
34
33
  "jsonschema>=4.26.0,<5",
@@ -39,14 +38,13 @@ dependencies = [
39
38
  "jsonlines>=4,<5",
40
39
  "lxml>=6.1.1,<7",
41
40
  "python-iso639>=2026.4.20",
42
- "wandb>=0.27.0,<1",
43
- "boto3>=1.43.18,<2",
44
- "numpy>=1.26.4",
41
+ "wandb>=0.27.2,<1",
42
+ "boto3>=1.43.19,<2",
43
+ "numpy>=2.2.6",
45
44
  # is a dependency of sympy, but not explicitly listed in the requirements.txt
46
45
  # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
47
46
  "antlr4-python3-runtime==4.11.0",
48
- "scipy>=1.17.1,<2", # required for the aggregation of pass@k metrics
49
-
47
+ "scipy>=1.18.0,<2", # required for the aggregation of pass@k metrics
50
48
  ]
51
49
 
52
50
  [project.optional-dependencies]
@@ -64,7 +62,7 @@ openai = [
64
62
  transformers = [
65
63
  "transformers>=4.45.2,<5",
66
64
  "torch>=2.5,<3",
67
- "accelerate>=0.34.2,<1",
65
+ "accelerate>=1.14.0,<2",
68
66
  ]
69
67
  accelerate = ["accelerate"]
70
68
  vllm = [
@@ -72,21 +70,17 @@ vllm = [
72
70
  "torch>=2.5,<3"
73
71
  ]
74
72
  mistral = [
75
- "mistral-common>=1.11.2,<2",
73
+ "mistral-common>=1.11.3,<2",
76
74
  "huggingface-hub>=0.36.2,<0.37",
77
75
  "eval_framework[vllm]",
78
76
  ]
79
- # Benchmark/metric specific extras
80
- comet = [
81
- "unbabel-comet>=2.2.7,<3",
82
- ]
83
77
  # from template-formatting
84
78
  optional = [
85
79
  "transformers>=4.45.2,<5",
86
80
  "jinja2>=3.1.6,<4"
87
81
  ]
88
82
  all = [
89
- "eval_framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral]"
83
+ "eval_framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral]"
90
84
  ]
91
85
 
92
86
  [project.urls]
@@ -98,24 +92,24 @@ eval_framework = "eval_framework.run:run"
98
92
  [dependency-groups]
99
93
  dev = [
100
94
  "mypy>=2.1.0,<3",
101
- "pytest>=9.0.3,<10",
95
+ "pytest>=9.1.0,<10",
102
96
  "pytest-mock>=3.15.1",
103
97
  "pytest-xdist>=3.8.0,<4",
104
98
  "pytest-sugar>1.1,<2",
105
99
  "types-pyyaml>=6.0.12.20260518,<7",
106
100
  "types-python-dateutil>=2.9.0.20260518,<3",
107
101
  "types-requests>=2.33.0.20260518,<3",
108
- "plotly>=5.24.1,<6",
109
- "ruff>=0.15.15",
102
+ "plotly>=6.8.0,<7",
103
+ "ruff>=0.15.18",
110
104
  "pip-licenses>=5.5.5",
111
105
  ]
112
106
  flash-attn = [
113
- "flash-attn>=2.8.3,<2.9",
107
+ "flash-attn>=2.8.3.post1,<2.9",
114
108
  "torch"
115
109
  ]
116
110
 
117
111
  [build-system]
118
- requires = ["uv_build>=0.11.17,<0.11.18"]
112
+ requires = ["uv_build>=0.11.23,<0.11.24"]
119
113
  build-backend = "uv_build"
120
114
 
121
115
  [tool.uv.build-backend]
@@ -126,22 +120,6 @@ override-dependencies = [
126
120
  "requests>=2.32,<3", # fix for determined
127
121
  ]
128
122
 
129
- [tool.uv.sources]
130
- torch = [
131
- { index = "pytorch-default", marker = "sys_platform != 'linux'" },
132
- { index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
133
- ]
134
-
135
- [[tool.uv.index]]
136
- name = "pytorch-cu124"
137
- url = "https://download.pytorch.org/whl/cu124"
138
- explicit = true
139
-
140
- [[tool.uv.index]]
141
- name = "pytorch-default"
142
- url = "https://pypi.org/simple"
143
- explicit = true
144
-
145
123
  [tool.uv.extra-build-dependencies]
146
124
  # Build flash-attn with the same torch version as in the container. Details at:
147
125
  # https://docs.astral.sh/uv/concepts/projects/config/#augmenting-build-dependencies
@@ -167,6 +145,7 @@ known-third-party = ["wandb"]
167
145
 
168
146
  [tool.ruff.lint.extend-per-file-ignores]
169
147
  "__init__.py" = ["F401"]
148
+ "tests/tests_eval_framework/tasks/benchmarks/test_mmlu_de.py" = ["E501"]
170
149
 
171
150
  [tool.mypy]
172
151
  plugins = "pydantic.mypy"
@@ -18,7 +18,7 @@ from eval_framework.result_processors.base import Result, ResultProcessor
18
18
  from eval_framework.shared.types import Completion, Loglikelihood
19
19
  from eval_framework.tasks.base import ResponseType
20
20
  from eval_framework.tasks.eval_config import EvalConfig
21
- from eval_framework.tasks.registry import get_task
21
+ from eval_framework.tasks.registry import registry
22
22
  from eval_framework.utils.constants import RED, RESET
23
23
  from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
24
24
 
@@ -36,13 +36,9 @@ class EvaluationGenerator:
36
36
  self.result_processor = result_processor
37
37
  self.save_intermediate_results = config.save_intermediate_results
38
38
 
39
- task_class = get_task(config.task_name)
40
- if hasattr(task_class, "TASK_STYLER"):
41
- response_type = task_class.TASK_STYLER.response_type
42
- task_metrics = list(task_class.TASK_STYLER.metrics)
43
- else:
44
- response_type = task_class.RESPONSE_TYPE
45
- task_metrics = task_class.METRICS
39
+ eval_ = registry()[config.task_name]
40
+ response_type = eval_.response_type()
41
+ task_metrics = eval_.metrics()
46
42
 
47
43
  if response_type == ResponseType.COMPLETION:
48
44
  self.metrics = task_metrics + [BytesCompletion, SequencePositionsCompletion]
@@ -51,7 +47,7 @@ class EvaluationGenerator:
51
47
  else:
52
48
  raise NotImplementedError
53
49
 
54
- self.task_name = task_class.NAME
50
+ self.task_name = eval_.task_class().NAME
55
51
 
56
52
  def _run_metric_calculators(self, responses: list[Completion | Loglikelihood]) -> list[Result]:
57
53
  results: list[Result] = self.result_processor.load_metrics_results()
@@ -55,7 +55,7 @@ class OpenAIModel(BaseLLM):
55
55
  formatter: BaseFormatter | None = None,
56
56
  temperature: float | None = None,
57
57
  top_p: float | None = None,
58
- api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
58
+ api_key: str | None = None,
59
59
  organization: str | None = None,
60
60
  base_url: str | None = None,
61
61
  bytes_per_token: float | None = None,
@@ -86,7 +86,7 @@ class OpenAIModel(BaseLLM):
86
86
  self._top_p = top_p
87
87
 
88
88
  self._client = OpenAI(
89
- api_key=api_key,
89
+ api_key=api_key if api_key is not None else os.getenv("OPENAI_API_KEY", ""),
90
90
  organization=organization,
91
91
  base_url=base_url,
92
92
  )
@@ -36,7 +36,7 @@ class BaseMetric[Response](ABC):
36
36
  # macro averaging the overall computation default.
37
37
  AGGREGATORS: list[Aggregator] = []
38
38
  # Set by the evaluation generator before calculate(); controls how infra failures are handled.
39
- fail_on_error: bool = False
39
+ fail_on_error: bool = True
40
40
 
41
41
  @classproperty
42
42
  def NAMES(cls) -> list[str]:
@@ -1,7 +1,7 @@
1
1
  from llm_sandbox.exceptions import SandboxTimeoutError
2
2
 
3
3
  from eval_framework.metrics.base import BaseMetric, MetricResult
4
- from eval_framework.shared.types import Completion, Error
4
+ from eval_framework.shared.types import Completion
5
5
  from eval_framework.tasks.utils import run_python_code
6
6
 
7
7
 
@@ -16,7 +16,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
16
16
  code = response.completion
17
17
  try:
18
18
  output = run_python_code(code, image="python:3.12-slim")
19
- except SandboxTimeoutError as e:
19
+ except SandboxTimeoutError:
20
20
  # The submitted code timed out (e.g. an infinite loop) -- a failing sample, not an infra
21
21
  # problem.
22
22
  import traceback
@@ -26,7 +26,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
26
26
  metric_name=self.NAME,
27
27
  value=0.0,
28
28
  higher_is_better=True,
29
- error=Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()),
29
+ code_execution_trace=traceback.format_exc(),
30
30
  )
31
31
  ]
32
32
  except Exception as e:
@@ -42,22 +42,12 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
42
42
  last_output = output_parts[-1]
43
43
 
44
44
  success = last_output == "True"
45
- error = (
46
- None
47
- if success
48
- else Error(
49
- error_class="CodeCompletionAssertionError",
50
- message=f"Expected 'True' but got '{last_output}'",
51
- traceback=output,
52
- )
53
- )
54
-
55
45
  return [
56
46
  MetricResult(
57
47
  metric_name=self.NAME,
58
48
  value=1.0 if success else 0.0,
59
49
  higher_is_better=True,
60
- error=error,
50
+ error=None,
61
51
  code_execution_trace=output,
62
52
  )
63
53
  ]
@@ -5,7 +5,7 @@ from collections.abc import Callable, Iterable
5
5
  from datetime import UTC, datetime
6
6
  from functools import partial
7
7
 
8
- from eval_framework.tasks.registry import get_task
8
+ from eval_framework.tasks.registry import registry
9
9
 
10
10
  try:
11
11
  from determined._info import get_cluster_info
@@ -28,7 +28,6 @@ from eval_framework.shared.types import (
28
28
  )
29
29
  from eval_framework.tasks.base import Language, ResponseType, Sample
30
30
  from eval_framework.tasks.eval_config import EvalConfig
31
- from eval_framework.tasks.perturbation import create_perturbation_class
32
31
  from eval_framework.tasks.utils import raise_errors
33
32
  from eval_framework.utils.constants import RED, RESET
34
33
  from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
@@ -54,7 +53,6 @@ def map_language_to_value(
54
53
 
55
54
  class ResponseGenerator:
56
55
  def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFileProcessor) -> None:
57
- self.few_shot = config.num_fewshot
58
56
  self.task_name = config.task_name
59
57
  self.llm = llm
60
58
  self.config = config
@@ -62,20 +60,16 @@ class ResponseGenerator:
62
60
  self.num_samples = config.num_samples
63
61
  self.save_intermediate_results = config.save_intermediate_results
64
62
 
65
- task_class = get_task(config.task_name)
66
-
67
63
  if config.perturbation_config is not None:
68
- perturbation_task_class = create_perturbation_class(task_class, config.perturbation_config)
69
- self.task = perturbation_task_class.with_overwrite(
70
- self.few_shot,
71
- custom_subjects=self.config.task_subjects,
72
- custom_hf_revision=self.config.hf_revision,
64
+ self.task = registry()[config.task_name].create_perturbation(
65
+ config.perturbation_config,
66
+ config.num_fewshot,
67
+ config.task_subjects,
68
+ config.hf_revision,
73
69
  )
74
70
  else:
75
- self.task = task_class.with_overwrite(
76
- self.few_shot,
77
- custom_subjects=self.config.task_subjects,
78
- custom_hf_revision=self.config.hf_revision,
71
+ self.task = registry()[config.task_name].create(
72
+ config.num_fewshot, config.task_subjects, config.hf_revision
79
73
  )
80
74
 
81
75
  self.response_type = self.task.get_response_type()
@@ -0,0 +1,12 @@
1
+ # Register all tasks on import
2
+ from pathlib import Path
3
+
4
+ from .dataset_revisions import DatasetRevision
5
+ from .task_names import register_all_tasks
6
+
7
+ DatasetRevision.add_revision_file(Path(__file__).parent / "task-dataset-revisions.json")
8
+
9
+ register_all_tasks()
10
+
11
+ del register_all_tasks
12
+ del DatasetRevision
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
15
15
  from pydantic import BaseModel, ConfigDict
16
16
 
17
17
  from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
18
- from eval_framework.tasks.benchmarks.dataset_revisions import get_pinned_dataset_revision
18
+ from eval_framework.tasks.dataset_revisions import DatasetRevision
19
19
  from eval_framework.tasks.utils import classproperty, raise_errors
20
20
  from template_formatting.formatter import Message, Role
21
21
 
@@ -118,7 +118,7 @@ class BaseTask[SubjectType](ABC):
118
118
  # Applied once at instance creation; not refreshed if the pin file changes mid-run.
119
119
  if custom_hf_revision:
120
120
  self.HF_REVISION = custom_hf_revision
121
- elif self.HF_REVISION is None and (pinned := get_pinned_dataset_revision(self.__class__.__name__)):
121
+ elif self.HF_REVISION is None and (pinned := DatasetRevision.pinned_revision(self.__class__.__name__)):
122
122
  self.HF_REVISION = pinned
123
123
 
124
124
  @classmethod
@@ -359,7 +359,7 @@ class BaseTask[SubjectType](ABC):
359
359
  samples: list[Sample],
360
360
  stop_sequences: list[str] | None = None,
361
361
  max_tokens: int | None = None,
362
- fail_on_error: bool = False,
362
+ fail_on_error: bool = True,
363
363
  ) -> list[Completion]:
364
364
  """
365
365
  Generates completions for the sample.
@@ -109,7 +109,7 @@ class AidanBenchOriginal(BaseTask[str]):
109
109
  stop_sequences: list[str] | None,
110
110
  max_tokens: int | None,
111
111
  initial_samples: list[Sample],
112
- fail_on_error: bool = False,
112
+ fail_on_error: bool = True,
113
113
  ) -> tuple[list[list[Message]], list[Union["Error", None]]]:
114
114
  initial_messages = [s.messages for s in initial_samples]
115
115
  samples = [(s, False) for s in initial_samples] # (sample, is_done)
@@ -170,7 +170,7 @@ class AidanBenchOriginal(BaseTask[str]):
170
170
  samples: list[Sample],
171
171
  stop_sequences: list[str] | None = None,
172
172
  max_tokens: int | None = None,
173
- fail_on_error: bool = False,
173
+ fail_on_error: bool = True,
174
174
  ) -> list[Completion]:
175
175
  assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
176
176
  "Each sample must have exactly one USER message."
@@ -1,7 +1,7 @@
1
1
  import os
2
2
  import random
3
3
  from pathlib import Path
4
- from typing import Any
4
+ from typing import Any, cast
5
5
 
6
6
  import pycountry
7
7
  from datasets import DatasetDict, DownloadConfig, load_dataset
@@ -100,11 +100,11 @@ class Flores200(BaseTask[str]):
100
100
 
101
101
  def _get_instruction_text(self, item: dict[str, Any]) -> str:
102
102
  source_key = item["subject"].split("-")[0]
103
- source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
103
+ source_language = cast(Any, pycountry.languages.get(alpha_3=source_key.split("_")[0])).name
104
104
  source = item[f"sentence_{source_key}"]
105
105
  instruction = f"{source_language} sentence: {source}\n"
106
106
  target_key = item["subject"].split("-")[1]
107
- target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name
107
+ target_language = cast(Any, pycountry.languages.get(alpha_3=target_key.split("_")[0])).name
108
108
 
109
109
  return f"{instruction}{target_language} sentence:"
110
110
 
@@ -4,7 +4,6 @@ from typing import Any
4
4
 
5
5
  from eval_framework.metrics.completion.bleu import BLEU
6
6
  from eval_framework.metrics.completion.chrf import CHRF
7
- from eval_framework.metrics.completion.comet import COMET
8
7
  from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
9
8
  from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
10
9
 
@@ -29,7 +28,7 @@ class FloresPlus(BaseTask[str]):
29
28
  SAMPLE_SPLIT = "dev"
30
29
  FEWSHOT_SPLIT = "devtest"
31
30
  RESPONSE_TYPE = ResponseType.COMPLETION
32
- METRICS = [BLEU, CHRF, COMET]
31
+ METRICS = [BLEU, CHRF]
33
32
  SUBJECTS = [f"{s}-{t}" for s, t in product(LANG_MAP, LANG_MAP) if s != t]
34
33
  PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
35
34
  LANGUAGE = {
@@ -4,6 +4,7 @@ from typing import Any
4
4
 
5
5
  from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion, AccuracyCompletionOLMES
6
6
  from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
7
+ from eval_framework.tasks.task_style import BPBStyle
7
8
 
8
9
  logger = logging.getLogger(__name__)
9
10
 
@@ -95,7 +96,6 @@ class GSM8KEvalHarness(BaseTask[str]):
95
96
 
96
97
  NAME = "GSM8KEvalHarness"
97
98
  DATASET_PATH = "openai/gsm8k"
98
- HF_REVISION = "main"
99
99
  SAMPLE_SPLIT = "test"
100
100
  FEWSHOT_SPLIT = "train"
101
101
  RESPONSE_TYPE = ResponseType.COMPLETION
@@ -216,3 +216,32 @@ class GSM8K_OLMES(GSM8K):
216
216
 
217
217
  def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
218
218
  return self._clean_short_answer(completion_text)
219
+
220
+
221
+ class GSM8KBPB(GSM8K_OLMES):
222
+ NAME = "GSM8KBPB"
223
+ TASK_STYLER = BPBStyle(cue_text="Answer:", leading_space_continuations=False)
224
+
225
+ # BPBStyle already adds "Answer:" as a separate assistant message. But the methods we inherit
226
+ # still put "Answer:" at the end of the question text and leave it out of the fewshot answer.
227
+ # So we override them here: remove "Answer:" from the question, and add it back in front of the
228
+ # fewshot answer. Without this, the question ends in "Answer:Answer:" and fewshot answers have
229
+ # no "Answer:" label at all.
230
+
231
+ def _get_instruction_text(self, item: dict[str, Any]) -> str:
232
+ return f"Question: {item['question']}\n"
233
+
234
+ def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
235
+ return f"Answer:{self.normalize_answer_str(item)}"
236
+
237
+ def _get_raw_question(self, item: dict[str, Any]) -> str:
238
+ return item["question"]
239
+
240
+ def _get_choices(self, item: dict[str, Any]) -> list[str]:
241
+ return [self.normalize_answer_str(item)]
242
+
243
+ def _get_correct_index(self, item: dict[str, Any]) -> int:
244
+ return 0
245
+
246
+ def _get_ground_truth(self, item: dict[str, Any]) -> str:
247
+ return self._get_choices(item)[0]
@@ -14,8 +14,8 @@ from eval_framework.metrics.completion.minerva_math_utils import (
14
14
  extract_answers,
15
15
  normalized_gold_from_solution,
16
16
  )
17
- from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
18
17
  from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
18
+ from eval_framework.tasks.task_style import BPBStyle
19
19
 
20
20
  # Hendrycks MATH subject splits (shared by MATH, MATHMinervaEvalHarness, MATHMinervaBPB)
21
21
  MATH_SUBJECTS = [
@@ -612,44 +612,6 @@ class MATH500Minerva(MATHMinerva):
612
612
  super().__init__(num_fewshot)
613
613
 
614
614
 
615
- class MATHMinervaBPB(MATHReasoning):
616
- """
617
- MATH (Hendrycks) with Minerva-style prompt, evaluated via loglikelihood of the
618
- gold answer string (bits-per-byte).
619
- Same prompt as MATHMinerva; scores P(normalized_gold_answer | prompt).
620
- """
621
-
622
- NAME = "MATHMinervaBPB"
623
- DATASET_PATH = "EleutherAI/hendrycks_math"
624
- SAMPLE_SPLIT = "test"
625
- FEWSHOT_SPLIT = "train"
626
- RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
627
- METRICS = [BitsPerByteLoglikelihood]
628
- SUBJECTS = MATH_SUBJECTS
629
- LANGUAGE = Language.ENG
630
-
631
- def _get_instruction_text(self, item: dict[str, Any]) -> str:
632
- return "Problem:\n" + item["problem"] + "\n\n" + "Solution:"
633
-
634
- def _get_cue_text(self, item: dict[str, Any]) -> str:
635
- return ""
636
-
637
- def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
638
- normalized = self._normalized_gold_from_solution(item["solution"])
639
- if normalized is None:
640
- return None
641
- return " " + normalized
642
-
643
- def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
644
- normalized = self._normalized_gold_from_solution(item["solution"])
645
- if normalized is None:
646
- return None
647
- return [" " + normalized]
648
-
649
- def _normalized_gold_from_solution(self, solution: str) -> str | None:
650
- return normalized_gold_from_solution(solution)
651
-
652
-
653
615
  class MATHLvl5(MATH):
654
616
  NAME = "Math Lvl 5"
655
617
 
@@ -742,7 +704,7 @@ Answer:"""
742
704
 
743
705
 
744
706
  _OLMES_FEWSHOTS = [
745
- ## https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
707
+ # https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
746
708
  {
747
709
  "problem": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}",
748
710
  "solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so "
@@ -790,3 +752,35 @@ class MATHMinerva_OLMES(MATHMinerva):
790
752
 
791
753
  def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
792
754
  return _OLMES_FEWSHOTS[: self.num_fewshot]
755
+
756
+
757
+ class MATHMinervaBPB(MATHMinerva_OLMES):
758
+ NAME = "MATHMinervaBPB"
759
+ TASK_STYLER = BPBStyle(cue_text="Solution:")
760
+
761
+ # BPBStyle already adds "Solution:" as a separate assistant message. But the methods we inherit
762
+ # still put "Solution:" at the end of the question text and leave it out of the fewshot answer.
763
+ # So we override them here: remove "Solution:" from the question, and add it back in front of the
764
+ # fewshot answer. Without this, the question ends in "Solution:Solution:" and fewshot answers have
765
+ # no "Solution:" label at all.
766
+
767
+ def _get_instruction_text(self, item: dict[str, Any]) -> str:
768
+ return "Problem:\n" + item["problem"] + "\n\n"
769
+
770
+ def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
771
+ return f"Solution: {item['solution']}"
772
+
773
+ def _get_choices(self, item: dict[str, Any]) -> list[str]:
774
+ answer = normalized_gold_from_solution(item["solution"])
775
+ template = f"\nFinal Answer: The final answer is {answer}. I hope it is correct."
776
+
777
+ return [item["solution"] + template]
778
+
779
+ def _get_correct_index(self, item: dict[str, Any]) -> int:
780
+ return 0
781
+
782
+ def _get_raw_question(self, item: dict[str, Any]) -> str:
783
+ return item["problem"]
784
+
785
+ def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
786
+ return self._get_choices(item)[0]