eval-framework 0.2.14__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. {eval_framework-0.2.14 → eval_framework-0.3.1}/PKG-INFO +3 -2
  2. {eval_framework-0.2.14 → eval_framework-0.3.1}/pyproject.toml +4 -3
  3. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/determined.py +1 -0
  4. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/eval.py +2 -0
  5. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/local.py +1 -0
  6. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/evaluation_generator.py +68 -4
  7. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/main.py +1 -1
  8. eval_framework-0.3.1/src/eval_framework/metrics/aggregators/aggregators.py +139 -0
  9. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/base.py +5 -0
  10. eval_framework-0.3.1/src/eval_framework/metrics/completion/accuracy_completion.py +116 -0
  11. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_assertion.py +13 -1
  12. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +60 -7
  13. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/drop_completion.py +18 -9
  14. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/f1.py +41 -2
  15. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/math_minerva_completion.py +2 -0
  16. eval_framework-0.3.1/src/eval_framework/metrics/completion/multipl_e_assertion.py +206 -0
  17. eval_framework-0.3.1/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +98 -0
  18. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +13 -3
  19. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/response_generator.py +24 -9
  20. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/run.py +33 -1
  21. eval_framework-0.3.1/src/eval_framework/suite.py +387 -0
  22. eval_framework-0.3.1/src/eval_framework/tasks/Dockerfile_codebench +9 -0
  23. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/base.py +39 -7
  24. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc_de.py +2 -1
  25. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/bigcodebench.py +77 -1
  26. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/csqa.py +8 -1
  27. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/drop.py +75 -13
  28. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/global_mmlu.py +9 -2
  29. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/gsm8k.py +68 -1
  30. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/hellaswag.py +5 -0
  31. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +2 -1
  32. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py +53 -0
  33. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/medqa.py +6 -0
  34. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu.py +1 -1
  35. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu_de.py +2 -1
  36. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmmlu.py +7 -1
  37. eval_framework-0.3.1/src/eval_framework/tasks/benchmarks/multipl_e.py +234 -0
  38. eval_framework-0.3.1/src/eval_framework/tasks/benchmarks/naturalqs_open.py +103 -0
  39. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/piqa.py +3 -1
  40. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/sciq.py +2 -0
  41. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/social_iqa.py +12 -1
  42. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/squad.py +39 -1
  43. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/tablebench.py +5 -5
  44. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winogrande.py +55 -1
  45. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/eval_config.py +4 -1
  46. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/task_names.py +22 -0
  47. eval_framework-0.3.1/src/eval_framework/tasks/task_style.py +387 -0
  48. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/utils.py +93 -12
  49. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/generate_task_docs.py +9 -4
  50. eval_framework-0.3.1/src/template_formatting/py.typed +0 -0
  51. eval_framework-0.2.14/src/eval_framework/metrics/completion/accuracy_completion.py +0 -16
  52. eval_framework-0.2.14/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -51
  53. eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -100
  54. {eval_framework-0.2.14 → eval_framework-0.3.1}/LICENSE +0 -0
  55. {eval_framework-0.2.14 → eval_framework-0.3.1}/README.md +0 -0
  56. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/__init__.py +0 -0
  57. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/base_config.py +0 -0
  58. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/__init__.py +0 -0
  59. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/exceptions.py +0 -0
  60. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/drop_process_results.py +0 -0
  61. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/README.md +0 -0
  62. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
  63. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
  64. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
  65. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
  66. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/__init__.py +0 -0
  67. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/aleph_alpha.py +0 -0
  68. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/base.py +0 -0
  69. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/huggingface.py +0 -0
  70. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/mistral.py +0 -0
  71. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/models.py +0 -0
  72. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/openai.py +0 -0
  73. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/vllm.py +0 -0
  74. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/logger.py +0 -0
  75. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/__init__.py +0 -0
  76. {eval_framework-0.2.14/src/eval_framework/metrics/efficiency → eval_framework-0.3.1/src/eval_framework/metrics/aggregators}/__init__.py +0 -0
  77. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/__init__.py +0 -0
  78. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
  79. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/bleu.py +0 -0
  80. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/chrf.py +0 -0
  81. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/comet.py +0 -0
  82. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
  83. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/csv_format.py +0 -0
  84. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
  85. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
  86. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/format_checker.py +0 -0
  87. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
  88. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/ifeval.py +0 -0
  89. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/json_format.py +0 -0
  90. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/language_checker.py +0 -0
  91. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/length_control.py +0 -0
  92. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
  93. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
  94. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
  95. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
  96. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/repetition.py +0 -0
  97. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
  98. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
  99. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
  100. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
  101. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
  102. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/ter.py +0 -0
  103. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/text_counter.py +0 -0
  104. {eval_framework-0.2.14/src/eval_framework/metrics/llm → eval_framework-0.3.1/src/eval_framework/metrics/efficiency}/__init__.py +0 -0
  105. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
  106. {eval_framework-0.2.14/src/eval_framework/metrics/loglikelihood → eval_framework-0.3.1/src/eval_framework/metrics/llm}/__init__.py +0 -0
  107. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/base.py +0 -0
  108. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
  109. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
  110. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
  111. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
  112. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
  113. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
  114. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
  115. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/language.py +0 -0
  116. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
  117. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/models.py +0 -0
  118. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
  119. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
  120. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
  121. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
  122. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
  123. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
  124. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
  125. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
  126. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
  127. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
  128. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
  129. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
  130. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
  131. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
  132. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
  133. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/utils.py +0 -0
  134. {eval_framework-0.2.14/src/eval_framework/result_processors → eval_framework-0.3.1/src/eval_framework/metrics/loglikelihood}/__init__.py +0 -0
  135. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
  136. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
  137. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
  138. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
  139. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
  140. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/py.typed +0 -0
  141. {eval_framework-0.2.14/src/eval_framework/tasks/benchmarks → eval_framework-0.3.1/src/eval_framework/result_processors}/__init__.py +0 -0
  142. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/base.py +0 -0
  143. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/hf_uploader.py +0 -0
  144. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/result_processor.py +0 -0
  145. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
  146. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/run_direct.py +0 -0
  147. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/shared/types.py +0 -0
  148. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/__init__.py +0 -0
  149. {eval_framework-0.2.14/src/template_formatting → eval_framework-0.3.1/src/eval_framework/tasks/benchmarks}/__init__.py +0 -0
  150. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
  151. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
  152. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
  153. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
  154. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
  155. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
  156. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
  157. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
  158. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
  159. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
  160. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
  161. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
  162. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
  163. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
  164. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
  165. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/include.py +0 -0
  166. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
  167. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
  168. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
  169. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
  170. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
  171. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
  172. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
  173. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
  174. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
  175. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
  176. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
  177. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
  178. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
  179. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
  180. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
  181. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
  182. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/perturbation.py +0 -0
  183. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/registry.py +0 -0
  184. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/task_loader.py +0 -0
  185. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/constants.py +0 -0
  186. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/file_ops.py +0 -0
  187. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/helpers.py +0 -0
  188. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/logging.py +0 -0
  189. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/packaging.py +0 -0
  190. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/tqdm_handler.py +0 -0
  191. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/template_formatting/README.md +0 -0
  192. /eval_framework-0.2.14/src/template_formatting/py.typed → /eval_framework-0.3.1/src/template_formatting/__init__.py +0 -0
  193. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/template_formatting/formatter.py +0 -0
  194. {eval_framework-0.2.14 → eval_framework-0.3.1}/src/template_formatting/mistral_formatter.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: eval-framework
- Version: 0.2.14
+ Version: 0.3.1
  Summary: Evalulation Framework
  Author: Aleph Alpha Research
  License: Apache License
@@ -228,7 +228,7 @@ Requires-Dist: jsonschema>=4.23.0,<5
  Requires-Dist: mysql-connector-python>=9.0.0,<10
  Requires-Dist: psycopg2-binary>=2.9.9,<3
  Requires-Dist: sympy>=1.13.1,<2
- Requires-Dist: llm-sandbox[docker]>=0.1.8,<0.2
+ Requires-Dist: llm-sandbox[docker]==0.3.37
  Requires-Dist: jsonlines>=4,<5
  Requires-Dist: lxml>=6,<7
  Requires-Dist: python-iso639>=2025.2.18
@@ -236,6 +236,7 @@ Requires-Dist: wandb>=0.23.0,<1
  Requires-Dist: boto3>=1.40.54,<2
  Requires-Dist: numpy>=1.26.4
  Requires-Dist: antlr4-python3-runtime==4.11.0
+ Requires-Dist: scipy>=1.14.0,<2
  Requires-Dist: accelerate ; extra == 'accelerate'
  Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
  Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
@@ -1,6 +1,6 @@
  [project]
  name = "eval-framework"
- version = "0.2.14"
+ version = "0.3.1"
  description = "Evalulation Framework"
  readme = "README.md"
  license = { file = "LICENSE" }
@@ -35,7 +35,7 @@ dependencies = [
      "mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
      "psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
      "sympy>=1.13.1,<2",
-     "llm-sandbox[docker]>=0.1.8,<0.2",
+     "llm-sandbox[docker]==0.3.37",
      "jsonlines>=4,<5",
      "lxml>=6,<7",
      "python-iso639>=2025.2.18",
@@ -45,6 +45,8 @@ dependencies = [
      # is a dependency of sympy, but not explicitly listed in the requirements.txt
      # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
      "antlr4-python3-runtime==4.11.0",
+     "scipy>=1.14.0,<2", # required for the aggregation of pass@k metrics
+
  ]
 
  [project.optional-dependencies]
@@ -105,7 +107,6 @@ dev = [
      "types-requests>=2.32.0.20250328,<3",
      "plotly>=5.24.1,<6",
      "ruff>=0.12.8",
-     "scipy>=1.14.0,<2", # for tests comparing our Hungarian implementation to scipy
  ]
  flash-attn = [
      "flash-attn>=2.7.2.post1,<2.8",
@@ -148,6 +148,7 @@ class DeterminedContext(EvalContext):
              wandb_project=self.hparams.wandb_project or self.wandb_project,
              wandb_entity=self.hparams.wandb_entity or self.wandb_entity,
              wandb_run_id=self.hparams.wandb_run_id or self.wandb_run_id,
+             wandb_group=self.wandb_group,
              wandb_upload_results=self.hparams.wandb_upload_results or self.wandb_upload_results,
              batch_size=self.hparams.task_args.batch_size or self.batch_size,
              description=self.hparams.description or self.description,
@@ -61,6 +61,7 @@ class EvalContext(AbstractContextManager):
          wandb_project: str | None = None,
          wandb_entity: str | None = None,
          wandb_run_id: str | None = None,
+         wandb_group: str | None = None,
          wandb_upload_results: bool | None = None,
          hf_upload_dir: str | None = None,
          hf_upload_repo: str | None = None,
@@ -89,6 +90,7 @@ class EvalContext(AbstractContextManager):
          self.wandb_project = wandb_project
          self.wandb_entity = wandb_entity
          self.wandb_run_id = wandb_run_id
+         self.wandb_group = wandb_group
          self.wandb_upload_results = wandb_upload_results
          self.hf_upload_dir = hf_upload_dir
          self.hf_upload_repo = hf_upload_repo
@@ -58,6 +58,7 @@ class LocalContext(EvalContext):
              wandb_entity=self.wandb_entity,
              wandb_project=self.wandb_project,
              wandb_run_id=self.wandb_run_id,
+             wandb_group=self.wandb_group,
              wandb_upload_results=self.wandb_upload_results,
              llm_judge_class=self.llm_judge_class,
              judge_model_args=self.judge_model_args,
@@ -37,10 +37,17 @@ class EvaluationGenerator:
          self.save_intermediate_results = config.save_intermediate_results
 
          task_class = get_task(config.task_name)
-         if task_class.RESPONSE_TYPE == ResponseType.COMPLETION:
-             self.metrics = task_class.METRICS + [BytesCompletion, SequencePositionsCompletion]
-         elif task_class.RESPONSE_TYPE == ResponseType.LOGLIKELIHOODS:
-             self.metrics = task_class.METRICS + [BytesLoglikelihood, SequencePositionsLoglikelihood]
+         if hasattr(task_class, "TASK_STYLER"):
+             response_type = task_class.TASK_STYLER.response_type
+             task_metrics = list(task_class.TASK_STYLER.metrics)
+         else:
+             response_type = task_class.RESPONSE_TYPE
+             task_metrics = task_class.METRICS
+
+         if response_type == ResponseType.COMPLETION:
+             self.metrics = task_metrics + [BytesCompletion, SequencePositionsCompletion]
+         elif response_type == ResponseType.LOGLIKELIHOODS:
+             self.metrics = task_metrics + [BytesLoglikelihood, SequencePositionsLoglikelihood]
          else:
              raise NotImplementedError
 
@@ -243,6 +250,61 @@ class EvaluationGenerator:
 
          return aggregated_results
 
+     def _aggregate_results_with_aggregators(self, results: list[Result]) -> dict[str, float | None]:
+         data = pd.DataFrame([r.model_dump() for r in results])
+         if len(data) == 0:
+             return {}
+         data = data.fillna({"key": ""})
+         aggregated_results: dict[str, float | None] = {}
+         data = data.loc[data.error.isnull()]
+
+         for (metric_name, current_metric_class), metric_group in data.groupby(["metric_name", "metric_class_name"]):
+             # We group by both metric_name and metric_class_name because we want to aggregate results for a
+             # single metric. Two metric classes can implement the same metric name, and we want to separate
+             # those cases. We cannot group by metric_class_name alone because each metric class can implement
+             # multiple metrics with different names.
+             current_metric = None
+             # Loop over the self.metrics list and find the metric class that matches current_metric_class.
+             for metric_class in self.metrics:
+                 if metric_class.__name__ == current_metric_class:
+                     current_metric = metric_class
+                     break
+             if current_metric is None:
+                 raise ValueError(f"Metric {metric_name} not found in metrics list")
+
+             for aggregator in current_metric.AGGREGATORS:
+                 aggregated_results[f"{aggregator.name} {current_metric_class}.{metric_name}"] = (
+                     aggregator(metric_group, ["prompt"])  # Compute the aggregator, grouped by the prompt...
+                     .groupby(["key", "subject"])  # ...then group by key, subject...
+                     .agg({"value": "mean"})["value"]  # ...and average scores over each key, subject group...
+                     .mean()  # ...and lastly average the scores across all groups, giving equal weight to every
+                     .item()  # key, subject group.
+                 )
+
+         # Loop to additionally compute per-subject/per-key breakdown metric scores, e.g. for only subject="algebra"
+         for (key, subject, metric_name, current_metric_class), ksm_group in data.groupby(
+             ["key", "subject", "metric_name", "metric_class_name"]
+         ):
+             current_metric = None
+             # Loop over the self.metrics list and find the metric class that matches current_metric_class.
+             for metric_class in self.metrics:
+                 if metric_class.__name__ == current_metric_class:
+                     current_metric = metric_class
+                     break
+
+             if current_metric is None:
+                 raise ValueError(f"Metric {metric_name} not found in metrics list. This should never happen.")
+
+             for aggregator in current_metric.AGGREGATORS:
+                 save_string = (
+                     f"{aggregator.name} {metric_name} - {subject}"
+                     if not key
+                     else f"{aggregator.name} {metric_name} - {key} - {subject}"
+                 )
+                 aggregated_results[save_string] = aggregator(ksm_group, ["prompt"])["value"].mean().mean().item()
+
+         return aggregated_results
+
      def run_eval(self) -> list[Result]:
          """Runs evaluation using saved completions."""
          logger.info("Running evaluation...")
@@ -252,6 +314,8 @@ class EvaluationGenerator:
 
          metrics_results = self._run_metric_calculators(responses)
          aggregated_results = self._aggregate_results(metrics_results)
+         results_with_aggregators = self._aggregate_results_with_aggregators(metrics_results)
+         aggregated_results.update(results_with_aggregators)
 
          wandb.log(aggregated_results)
          self.result_processor.save_aggregated_results(aggregated_results)
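
The `_aggregate_results_with_aggregators` method added above macro-averages in two stages: per-prompt aggregator scores are first averaged within each (key, subject) group, then the group means are averaged with equal weight per group. A minimal standalone pandas sketch of that ordering, using hypothetical per-prompt scores (not part of the diff):

import pandas as pd

# Hypothetical per-prompt aggregator output: two algebra prompts, one geometry prompt.
per_prompt = pd.DataFrame(
    {
        "key": ["", "", ""],
        "subject": ["algebra", "algebra", "geometry"],
        "value": [1.0, 0.0, 1.0],
    }
)

# Same chain as in the diff: mean per (key, subject) group, then mean across groups.
macro = per_prompt.groupby(["key", "subject"]).agg({"value": "mean"})["value"].mean().item()
print(macro)  # 0.75: algebra contributes 0.5 and geometry 1.0, each group weighted equally
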
@@ -66,7 +66,7 @@ def main(
      with wandb.init(
          entity=config.wandb_entity,
          project=config.wandb_project,
-         group=llm.name[:127],
+         group=(config.wandb_group or llm.name)[:127],
          job_type=config.task_name[:63],
          id=wandb_run_id,  # (potentially resuming run after preemption)
          config=response_generator._get_metadata(),
@@ -0,0 +1,139 @@
+ from typing import Any, Protocol
+
+ import numpy as np
+ import pandas as pd
+ from scipy.special import comb
+
+
+ class Aggregator(Protocol):
+     """Base class for metric aggregators.
+
+     An aggregator collapses multiple evaluation rows for the same problem (i.e. prompt) into a
+     single score per problem. The input DataFrame has one row per (problem, attempt)
+     pair; the output has one row per problem with a new ``value``.
+
+     Args:
+         response_df: DataFrame where each row is one evaluation attempt. Must contain
+             a ``value`` column (the per-attempt score) and all ``identifier_columns``.
+         identifier_columns: Columns that uniquely identify a problem (e.g. ``["prompt"]``).
+             Rows sharing the same identifier are different attempts at the same problem.
+
+     Returns:
+         DataFrame with one row per unique problem and a ``value`` column holding
+         the aggregated score. All non-identifier, non-value columns are preserved
+         (typically via ``"first"``).
+
+     Example input (``identifier_columns=["prompt"]``, 3 attempts per problem):
+
+         | prompt         | value | subject |
+         |----------------|-------|---------|
+         | "What is 2+2?" | 1.0   | algebra |
+         | "What is 2+2?" | 1.0   | algebra |
+         | "What is 2+2?" | 0.0   | algebra |
+         | "Solve x^2=4"  | 0.0   | algebra |
+         | "Solve x^2=4"  | 1.0   | algebra |
+         | "Solve x^2=4"  | 0.0   | algebra |
+     """
+
+     name: str
+
+     def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame: ...
+
+
+ def closed_form_passatk(n: int, c: int, k: int) -> float:
+     """Closed-form pass@k estimator (see the HumanEval paper).
+
+     pass@k = 1 - C(n-c, k) / C(n, k)
+
+     Given n total samples with c correct, this is the probability that at least one of k
+     randomly chosen samples is correct. The ratio C(n-c, k) / C(n, k) is the chance that all
+     k picks are wrong; subtracting it from 1 gives the success probability. When n - c < k
+     there aren't enough wrong samples to fill k slots, so the result is trivially 1.
+     """
+     if n < k:
+         return 1.0 if c > 0 else 0.0
+     if n - c < k:
+         return 1.0
+     return 1.0 - comb(n - c, k, exact=False) / comb(n, k, exact=False)
+
+
+ class PassAtK(Aggregator):
+     """Computes pass@k: the probability that at least one of k random attempts is correct.
+
+     Groups rows by ``identifier_columns``, counts correct (``c = sum(value)``) and
+     total (``n = count(value)``) attempts per problem, then applies the closed-form
+     estimator.
+
+     Expects ``value`` to be binary (0 or 1). For k=1 this is equivalent to the mean.
+
+     Example (k=2, continuing from the Aggregator docstring example):
+         "What is 2+2?": n=3, c=2, k=2 -> 1.0 (a correct pick is guaranteed)
+         "Solve x^2=4":  n=3, c=1, k=2 -> 0.667 (as computed by ``closed_form_passatk``)
+
+     Output:
+         | prompt         | value | subject |
+         |----------------|-------|---------|
+         | "What is 2+2?" | 1.000 | algebra |
+         | "Solve x^2=4"  | 0.667 | algebra |
+     """
+
+     def __init__(self, k: int = 1) -> None:
+         self.k = k
+         self.name = f"Pass@{k}"
+
+     def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
+         # agg_dict decides how each column (agg_dict key) gets aggregated (agg_dict value). For the `value`
+         # column we compute both the sum and the count; for all other columns we simply pick the first entry
+         # (they are identical within a group anyway).
+         other_cols = [c for c in response_df.columns if c not in identifier_columns and c != "value"]
+         agg_dict = {"value": ["sum", "count"], **{c: "first" for c in other_cols}}
+         agg = response_df.groupby(identifier_columns).agg(agg_dict)
+         # flatten multi-index columns from the value agg: ("value", "sum") / ("value", "count")
+         c = agg[("value", "sum")].values
+         n = agg[("value", "count")].values
+         scores = np.array([closed_form_passatk(n_i, c_i, self.k) for n_i, c_i in zip(n, c)])
+         out = agg.drop(columns=[("value", "sum"), ("value", "count")])
+         if isinstance(out.columns, pd.MultiIndex):
+             out.columns = out.columns.droplevel(1)
+         return out.assign(value=scores).reset_index()
+
+
+ class IdentifierMean(Aggregator):
+     """Computes the arithmetic mean of ``value`` across attempts per problem.
+
+     Example (continuing from the Aggregator docstring example):
+
+         "What is 2+2?": mean(1.0, 1.0, 0.0) = 0.667
+         "Solve x^2=4":  mean(0.0, 1.0, 0.0) = 0.333
+
+     Output:
+         | prompt         | value | subject |
+         |----------------|-------|---------|
+         | "What is 2+2?" | 0.667 | algebra |
+         | "Solve x^2=4"  | 0.333 | algebra |
+     """
+
+     def __init__(self) -> None:
+         self.name = "IdentifierMean"
+
+     def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
+         agg_dict = {
+             "value": "mean",
+         }
+         other_cols = [c for c in response_df.columns if c not in identifier_columns and c != "value"]
+         agg_dict.update({c: "first" for c in other_cols})
+         return response_df.groupby(identifier_columns).agg(agg_dict).reset_index()
+
+
+ class Identity:
+     """No-op aggregator: returns the input unchanged.
+
+     Use for metrics where each row is already a final score and no cross-attempt
+     aggregation is needed (e.g. when ``num_samples=1``).
+     """
+
+     def __init__(self) -> None:
+         self.name = "Identity"
+
+     def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
+         return response_df
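
A small usage sketch (assuming eval-framework 0.3.1 with scipy installed; the module path is the one introduced by this diff) reproducing the PassAtK docstring example:

import pandas as pd

from eval_framework.metrics.aggregators.aggregators import PassAtK, closed_form_passatk

# Three attempts per prompt, binary per-attempt scores, as in the docstring example.
df = pd.DataFrame(
    {
        "prompt": ["What is 2+2?"] * 3 + ["Solve x^2=4"] * 3,
        "value": [1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
        "subject": ["algebra"] * 6,
    }
)
out = PassAtK(k=2)(df, identifier_columns=["prompt"])
print(out[["prompt", "value"]])      # pass@2 of 1.0 and ~0.667, matching the docstring
print(closed_form_passatk(3, 1, 2))  # 0.666... = 1 - C(2,2) / C(3,2)
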
@@ -3,6 +3,7 @@ from typing import Any
 
  from pydantic import BaseModel, ConfigDict
 
+ from eval_framework.metrics.aggregators.aggregators import Aggregator
  from eval_framework.shared.types import Error
 
 
@@ -28,6 +29,10 @@ class classproperty:
  class BaseMetric[Response](ABC):
      NAME: str
      KEYS: list[str] | None = None
+     # The aggregator determines how to aggregate the results of a metric for a single
+     # sample over multiple runs (LLM calls). We default to averaging, which makes
+     # macro averaging the overall default.
+     AGGREGATORS: list[Aggregator] = []
 
      @classproperty
      def NAMES(cls) -> list[str]:
@@ -0,0 +1,116 @@
+ import re
+ import string
+ from typing import Any
+
+ import numpy as np
+
+ from eval_framework.metrics.base import BaseMetric, MetricResult
+ from eval_framework.shared.types import Completion
+
+
+ class AccuracyCompletion(BaseMetric[Completion]):
+     NAME = "Accuracy Completion"
+
+     def calculate(self, response: Completion) -> list[MetricResult]:
+         if response.error is not None:
+             return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+
+         ground_truths = response.ground_truth_list
+         is_correct = any(response.completion == gt for gt in ground_truths)
+         return [
+             MetricResult(metric_name=self.NAME, value=float(is_correct), higher_is_better=True, error=response.error)
+         ]
+
+
+ class AccuracyCompletionWithEvaluate(AccuracyCompletion):
+     def __init__(self, regexes_to_ignore: list[str], ignore_case: bool = False, ignore_punctuation: bool = False):
+         self.regexes_to_ignore = regexes_to_ignore
+         self.ignore_case = ignore_case
+         self.ignore_punctuation = ignore_punctuation
+
+     def calculate(self, response: Completion) -> list[MetricResult]:
+         if response.error is not None:
+             return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+         ground_truths = response.ground_truth_list
+         model_answer = response.completion
+
+         is_correct = exact_match_hf_evaluate(
+             predictions=[model_answer] * len(ground_truths),
+             references=ground_truths,  # type: ignore[arg-type]
+             regexes_to_ignore=self.regexes_to_ignore,
+             ignore_case=self.ignore_case,
+             ignore_punctuation=self.ignore_punctuation,
+         )["exact_match"]
+         return [
+             MetricResult(metric_name=self.NAME, value=float(is_correct), higher_is_better=True, error=response.error)
+         ]
+
+
+ class AccuracyCompletionOLMES(AccuracyCompletionWithEvaluate):
+     # If we used functools.partial here, the code would fail: there is an issubclass check
+     # that doesn't work with partial. These specific regexes are taken from
+     # https://github.com/allenai/olmes/blob/main/oe_eval/tasks/oe_eval_tasks/gsm8k.py#L70
+     def __init__(self) -> None:
+         super().__init__(regexes_to_ignore=[",", "\\$", "(?s).*#### ", "\\.$"])
+
+
+ # The following code is (largely) reproduced from https://github.com/allenai/olmes/blob/main/oe_eval/dependencies/hf_evaluate/exact_match.py#L25
+ # OLMES is released under the Apache 2.0 license, as is the HF evaluate library.
+ # Some cosmetic modifications have been made to fit our codebase and linting rules.
+ # -------------------------------------------------------------------------------------
+
+ ### Code ported from Huggingface's `evaluate` library at
+ ### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
+ ### which is under the Apache license.
+ ### Port taken from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/api/metrics.py, used
+ ### to fix the issue: https://github.com/EleutherAI/lm-evaluation-harness/pull/2045
+
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ #     http://www.apache.org/licenses/LICENSE-2.0
+
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ def exact_match_hf_evaluate(
+     predictions: list[str],
+     references: list[str],
+     regexes_to_ignore: list[str] | None = None,
+     ignore_case: bool = False,
+     ignore_punctuation: bool = False,
+     ignore_numbers: bool = False,
+ ) -> dict[str, Any]:  # type: ignore
+     if regexes_to_ignore is not None:
+         for s in regexes_to_ignore:
+             predictions = np.array([re.sub(s, "", x) for x in predictions])  # type: ignore
+             references = np.array([re.sub(s, "", x) for x in references])  # type: ignore
+     else:
+         predictions = np.asarray(predictions)  # type: ignore
+         references = np.asarray(references)  # type: ignore
+
+     if ignore_case:
+         predictions = np.char.lower(predictions)  # type: ignore
+         references = np.char.lower(references)  # type: ignore
+
+     if ignore_punctuation:
+         repl_table = string.punctuation.maketrans("", "", string.punctuation)
+         predictions = np.char.translate(predictions, table=repl_table)  # type: ignore
+         references = np.char.translate(references, table=repl_table)  # type: ignore
+
+     if ignore_numbers:
+         repl_table = string.digits.maketrans("", "", string.digits)
+         predictions = np.char.translate(predictions, table=repl_table)  # type: ignore
+         references = np.char.translate(references, table=repl_table)  # type: ignore
+
+     # NOTE: For multiple ground truths OLMES returns the mean over their scores. The max
+     # would be more meaningful, but we keep the mean here for parity.
+     score_list = predictions == references
+
+     return {"exact_match": np.mean(score_list)}
@@ -12,7 +12,19 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
 
          # this will always be a list; if the return value is "" it will be an empty list
          code = response.completion
-         output = run_python_code(code, image="python:3.12-slim")
+         try:
+             output = run_python_code(code, image="python:3.12-slim")
+         except Exception as e:
+             import traceback
+
+             return [
+                 MetricResult(
+                     metric_name=self.NAME,
+                     value=0.0,
+                     higher_is_better=True,
+                     error=Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()),
+                 )
+             ]
 
          # Split and filter out empty strings
          output_parts = [part for part in output.split() if part.strip()]
@@ -1,3 +1,4 @@
+ import importlib.resources
  import traceback
  from collections.abc import Callable
  from typing import Self
@@ -5,8 +6,17 @@ from typing import Self
  from pydantic import Field
 
  from eval_framework.metrics.base import BaseMetric, MetricResult
- from eval_framework.shared.types import BaseMetricContext, Completion, Error, extract_context_metric
- from eval_framework.tasks.utils import CallableSerializer, ExecutionResult, execute_python_code_with_tests
+ from eval_framework.shared.types import (
+     BaseMetricContext,
+     Completion,
+     Error,
+     extract_context_metric,
+ )
+ from eval_framework.tasks.utils import (
+     CallableSerializer,
+     ExecutionResult,
+     execute_python_code_with_tests,
+ )
 
 
  class CodeExecutionBaseContext(BaseMetricContext):
@@ -65,7 +75,14 @@ class CodeExecutionPassAtOne(BaseMetric[Completion]):
 
      def calculate(self, response: Completion) -> list[MetricResult]:
          if response.error is not None:
-             return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+             return [
+                 MetricResult(
+                     metric_name=self.NAME,
+                     value=None,
+                     higher_is_better=True,
+                     error=response.error,
+                 )
+             ]
          try:
              context = extract_context_metric(response, CodeExecutionPassAtOneContext)
              parsed_context = RealtimeCodeExectionContext.from_context(context)
@@ -76,8 +93,19 @@ class CodeExecutionPassAtOne(BaseMetric[Completion]):
          try:
              c, output = self._count_correct_samples(response.completion, parsed_context)
          except Exception as e:
-             error = Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc())
-             return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=error)]
+             error = Error(
+                 error_class=e.__class__.__name__,
+                 message=str(e),
+                 traceback=traceback.format_exc(),
+             )
+             return [
+                 MetricResult(
+                     metric_name=self.NAME,
+                     value=None,
+                     higher_is_better=True,
+                     error=error,
+                 )
+             ]
 
          pass_at_k_value = estimate_pass_at_k(n, c, self.k)
          return [
@@ -90,15 +118,40 @@ class CodeExecutionPassAtOne(BaseMetric[Completion]):
              )
          ]
 
+     def _count_correct_samples(self, completion: str, context: RealtimeCodeExectionContext) -> tuple[int, str]:
+         try:
+             result = execute_python_code_with_tests(
+                 code=completion,
+                 test_code=context.test_code,
+                 package_mapping=context.package_downloads,
+                 merge_code_fn=context.snippet_merge_fn,
+                 image=context.run_env,
+                 timeout=context.benchmark_timeout,
+                 parse_output_fn=context.output_parse_fn,
+                 dockerfile=None,
+             )
+         except Exception as e:
+             return (0, str(e))
+         return (1 if result.success else 0), result.output
+
+
+ class CodeExecutionPassAtOneWithCodebench(CodeExecutionPassAtOne):
+     NAME = "code-execution-pass@1-codebench"
+
+     def __init__(self) -> None:
+         super().__init__()
+         self.dockerfile = str(importlib.resources.files("eval_framework.tasks") / "Dockerfile_codebench")
+
      def _count_correct_samples(self, completion: str, context: RealtimeCodeExectionContext) -> tuple[int, str]:
          result = execute_python_code_with_tests(
              code=completion,
              test_code=context.test_code,
-             package_mapping=context.package_downloads,
+             package_mapping={},  # the docker image contains everything
              merge_code_fn=context.snippet_merge_fn,
-             image=context.run_env,
+             image=None,  # dockerfile provided
              timeout=context.benchmark_timeout,
              parse_output_fn=context.output_parse_fn,
+             dockerfile=self.dockerfile,
          )
          return (1 if result.success else 0), result.output
 
@@ -2,7 +2,11 @@
 
  from eval_framework.external.drop_process_results import process_results
  from eval_framework.metrics.base import BaseMetric, MetricResult
- from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
+ from eval_framework.shared.types import (
+     BaseMetricContext,
+     Completion,
+     extract_context_metric,
+ )
 
 
  class DropMetricContext(BaseMetricContext):
@@ -20,10 +24,13 @@ class DropF1ExactMatch(BaseMetric[Completion]):
      def calculate(self, response: Completion) -> list[MetricResult]:
          if response.error is not None:
              return [
-                 MetricResult(metric_name=f"{self.NAME}/f1", value=None, higher_is_better=True, error=response.error),
                  MetricResult(
-                     metric_name=f"{self.NAME}/exact_match", value=None, higher_is_better=True, error=response.error
-                 ),
+                     metric_name=name,
+                     value=None,
+                     higher_is_better=True,
+                     error=response.error,
+                 )
+                 for name in [n.strip() for n in self.NAME.split("/")]
              ]
 
          context = extract_context_metric(response, DropMetricContext)
@@ -36,12 +43,14 @@ class DropF1ExactMatch(BaseMetric[Completion]):
              pred_spans = [raw]
 
          doc = {"answers": answer_tuples}
-         results = [pred_spans]
-         out = process_results(doc, results)
+         out = process_results(doc, pred_spans)
 
          return [
-             MetricResult(metric_name="DROP F1", value=out["f1"], higher_is_better=True, error=response.error),
              MetricResult(
-                 metric_name="Exact Match", value=out["exact_match"], higher_is_better=True, error=response.error
-             ),
+                 metric_name=name,
+                 value=out[key],
+                 higher_is_better=True,
+                 error=response.error,
+             )
+             for name, key in zip([n.strip() for n in self.NAME.split("/")], self.KEYS)
          ]
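
The refactored DropF1ExactMatch derives its two sub-metric names from NAME and pairs them with KEYS instead of hard-coding the strings. A hypothetical illustration of the pattern; the actual NAME and KEYS values are not shown in this diff:

# Hypothetical values: NAME encodes both sub-metric names separated by "/",
# KEYS holds the matching keys into the process_results output dict.
NAME = "DROP F1 / Exact Match"
KEYS = ["f1", "exact_match"]

names = [n.strip() for n in NAME.split("/")]
print(list(zip(names, KEYS)))  # [('DROP F1', 'f1'), ('Exact Match', 'exact_match')]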